diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-10-11 21:19:31 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-10-11 21:19:31 +0200 |
commit | d09ba13110e303d7baa29d170da94cd24f7662b2 (patch) | |
tree | 570b4d5e11889a6f951dee963c6549b2c8ca0293 /drivers | |
parent | Merge branch 'for-linus-4.9' of git://git.kernel.org/pub/scm/linux/kernel/git... (diff) | |
parent | Merge branch 'for-4.9/dax' into libnvdimm-for-next (diff) | |
download | linux-d09ba13110e303d7baa29d170da94cd24f7662b2.tar.xz linux-d09ba13110e303d7baa29d170da94cd24f7662b2.zip |
Merge tag 'libnvdimm-for-4.9' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm
Pull libnvdimm updates from Dan Williams:
"Aside from the recently added pmem sub-division support these have
been in -next for several releases with no reported issues. The sub-
division support was included in next-20161010 with no reported
issues. It passes all unit tests including new tests for all the new
functionality below.
Summary:
- PMEM sub-division support: Allow a single PMEM region to be divided
into multiple namespaces. Originally, ~2 years ago, it was thought
that partitions of a /dev/pmemX block device could handle
sub-allocations of persistent memory for different use cases. With
the decision to not support DAX mappings of raw block-devices, and
the genesis of device-dax, the need for having multiple
pmem-namespace per region has grown.
- Device-DAX unified inode: In support of dynamic-resizing of a
device-dax instance the kernel arranges for all mappings of a
device-dax node to share the same inode. This allows unmap /
truncate / invalidation events to affect all instances of the
device similar to the behavior of mmap on block devices.
- Hardware error scrubbing reworks: The original address-range-scrub
and badblocks tracking solution allowed clearing entries at the
individual namespace level, but it failed to clear the internal
list of media errors maintained at the bus level. The result was
that the next scrub or namespace disable/re-enable event would
restore the cleared badblocks, but now that is fixed. The v4.8
kernel introduced an auto-scrub-on-machine-check behavior to
repopulate the badblocks list. Now, in v4.9, the auto-scrub
behavior can be disabled and simply arrange for the error reported
in the machine-check to be added to the list.
- DIMM health-event notification support: ACPI 6.1 defines a
notification event code that can be send to ACPI NVDIMM devices. A
poll(2) capable file descriptor for these events can be obtained
from the nmemX/nfit/flags sysfs-attribute of a libnvdimm memory
device.
- Miscellaneous fixes: NVDIMM-N probe error, device-dax build error,
and a change to dedup the flush hint list to not flush the memory
controller more than necessary"
* tag 'libnvdimm-for-4.9' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (39 commits)
/dev/dax: fix Kconfig dependency build breakage
dax: use correct dev_t value
dax: convert devm_create_dax_dev to PTR_ERR
libnvdimm, namespace: allow creation of multiple pmem-namespaces per region
libnvdimm, namespace: lift single pmem limit in scan_labels()
libnvdimm, namespace: filter out of range labels in scan_labels()
libnvdimm, namespace: enable allocation of multiple pmem namespaces
libnvdimm, namespace: update label implementation for multi-pmem
libnvdimm, namespace: expand pmem device naming scheme for multi-pmem
libnvdimm, region: update nd_region_available_dpa() for multi-pmem support
libnvdimm, namespace: sort namespaces by dpa at init
libnvdimm, namespace: allow multiple pmem-namespaces per region at scan time
tools/testing/nvdimm: support for sub-dividing a pmem region
libnvdimm, namespace: unify blk and pmem label scanning
libnvdimm, namespace: refactor uuid_show() into a namespace_to_uuid() helper
libnvdimm, label: convert label tracking to a linked list
libnvdimm, region: move region-mapping input-paramters to nd_mapping_desc
nvdimm: reduce duplicated wpq flushes
libnvdimm: clear the internal poison_list when clearing badblocks
pmem: reduce kmap_atomic sections to the memcpys only
...
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/acpi/nfit/core.c | 210 | ||||
-rw-r--r-- | drivers/acpi/nfit/mce.c | 24 | ||||
-rw-r--r-- | drivers/acpi/nfit/nfit.h | 17 | ||||
-rw-r--r-- | drivers/dax/Kconfig | 5 | ||||
-rw-r--r-- | drivers/dax/dax.c | 577 | ||||
-rw-r--r-- | drivers/dax/dax.h | 5 | ||||
-rw-r--r-- | drivers/dax/pmem.c | 7 | ||||
-rw-r--r-- | drivers/nvdimm/Kconfig | 2 | ||||
-rw-r--r-- | drivers/nvdimm/bus.c | 2 | ||||
-rw-r--r-- | drivers/nvdimm/core.c | 73 | ||||
-rw-r--r-- | drivers/nvdimm/dimm.c | 11 | ||||
-rw-r--r-- | drivers/nvdimm/dimm_devs.c | 226 | ||||
-rw-r--r-- | drivers/nvdimm/label.c | 192 | ||||
-rw-r--r-- | drivers/nvdimm/namespace_devs.c | 792 | ||||
-rw-r--r-- | drivers/nvdimm/nd-core.h | 24 | ||||
-rw-r--r-- | drivers/nvdimm/nd.h | 29 | ||||
-rw-r--r-- | drivers/nvdimm/pmem.c | 28 | ||||
-rw-r--r-- | drivers/nvdimm/region_devs.c | 75 |
18 files changed, 1630 insertions, 669 deletions
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index e1d5ea6d5e40..71a7d07c28c9 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -886,6 +886,58 @@ static ssize_t revision_show(struct device *dev, } static DEVICE_ATTR_RO(revision); +static ssize_t hw_error_scrub_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus); + struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); + + return sprintf(buf, "%d\n", acpi_desc->scrub_mode); +} + +/* + * The 'hw_error_scrub' attribute can have the following values written to it: + * '0': Switch to the default mode where an exception will only insert + * the address of the memory error into the poison and badblocks lists. + * '1': Enable a full scrub to happen if an exception for a memory error is + * received. + */ +static ssize_t hw_error_scrub_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t size) +{ + struct nvdimm_bus_descriptor *nd_desc; + ssize_t rc; + long val; + + rc = kstrtol(buf, 0, &val); + if (rc) + return rc; + + device_lock(dev); + nd_desc = dev_get_drvdata(dev); + if (nd_desc) { + struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); + + switch (val) { + case HW_ERROR_SCRUB_ON: + acpi_desc->scrub_mode = HW_ERROR_SCRUB_ON; + break; + case HW_ERROR_SCRUB_OFF: + acpi_desc->scrub_mode = HW_ERROR_SCRUB_OFF; + break; + default: + rc = -EINVAL; + break; + } + } + device_unlock(dev); + if (rc) + return rc; + return size; +} +static DEVICE_ATTR_RW(hw_error_scrub); + /* * This shows the number of full Address Range Scrubs that have been * completed since driver load time. Userspace can wait on this using @@ -958,6 +1010,7 @@ static umode_t nfit_visible(struct kobject *kobj, struct attribute *a, int n) static struct attribute *acpi_nfit_attributes[] = { &dev_attr_revision.attr, &dev_attr_scrub.attr, + &dev_attr_hw_error_scrub.attr, NULL, }; @@ -1256,6 +1309,44 @@ static struct nvdimm *acpi_nfit_dimm_by_handle(struct acpi_nfit_desc *acpi_desc, return NULL; } +void __acpi_nvdimm_notify(struct device *dev, u32 event) +{ + struct nfit_mem *nfit_mem; + struct acpi_nfit_desc *acpi_desc; + + dev_dbg(dev->parent, "%s: %s: event: %d\n", dev_name(dev), __func__, + event); + + if (event != NFIT_NOTIFY_DIMM_HEALTH) { + dev_dbg(dev->parent, "%s: unknown event: %d\n", dev_name(dev), + event); + return; + } + + acpi_desc = dev_get_drvdata(dev->parent); + if (!acpi_desc) + return; + + /* + * If we successfully retrieved acpi_desc, then we know nfit_mem data + * is still valid. + */ + nfit_mem = dev_get_drvdata(dev); + if (nfit_mem && nfit_mem->flags_attr) + sysfs_notify_dirent(nfit_mem->flags_attr); +} +EXPORT_SYMBOL_GPL(__acpi_nvdimm_notify); + +static void acpi_nvdimm_notify(acpi_handle handle, u32 event, void *data) +{ + struct acpi_device *adev = data; + struct device *dev = &adev->dev; + + device_lock(dev->parent); + __acpi_nvdimm_notify(dev, event); + device_unlock(dev->parent); +} + static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, struct nfit_mem *nfit_mem, u32 device_handle) { @@ -1280,6 +1371,13 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, return force_enable_dimms ? 0 : -ENODEV; } + if (ACPI_FAILURE(acpi_install_notify_handler(adev_dimm->handle, + ACPI_DEVICE_NOTIFY, acpi_nvdimm_notify, adev_dimm))) { + dev_err(dev, "%s: notification registration failed\n", + dev_name(&adev_dimm->dev)); + return -ENXIO; + } + /* * Until standardization materializes we need to consider 4 * different command sets. Note, that checking for function0 (bit0) @@ -1318,18 +1416,41 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, return 0; } +static void shutdown_dimm_notify(void *data) +{ + struct acpi_nfit_desc *acpi_desc = data; + struct nfit_mem *nfit_mem; + + mutex_lock(&acpi_desc->init_mutex); + /* + * Clear out the nfit_mem->flags_attr and shut down dimm event + * notifications. + */ + list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) { + struct acpi_device *adev_dimm = nfit_mem->adev; + + if (nfit_mem->flags_attr) { + sysfs_put(nfit_mem->flags_attr); + nfit_mem->flags_attr = NULL; + } + if (adev_dimm) + acpi_remove_notify_handler(adev_dimm->handle, + ACPI_DEVICE_NOTIFY, acpi_nvdimm_notify); + } + mutex_unlock(&acpi_desc->init_mutex); +} + static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) { struct nfit_mem *nfit_mem; - int dimm_count = 0; + int dimm_count = 0, rc; + struct nvdimm *nvdimm; list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) { struct acpi_nfit_flush_address *flush; unsigned long flags = 0, cmd_mask; - struct nvdimm *nvdimm; u32 device_handle; u16 mem_flags; - int rc; device_handle = __to_nfit_memdev(nfit_mem)->device_handle; nvdimm = acpi_nfit_dimm_by_handle(acpi_desc, device_handle); @@ -1382,7 +1503,30 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) } - return nvdimm_bus_check_dimm_count(acpi_desc->nvdimm_bus, dimm_count); + rc = nvdimm_bus_check_dimm_count(acpi_desc->nvdimm_bus, dimm_count); + if (rc) + return rc; + + /* + * Now that dimms are successfully registered, and async registration + * is flushed, attempt to enable event notification. + */ + list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) { + struct kernfs_node *nfit_kernfs; + + nvdimm = nfit_mem->nvdimm; + nfit_kernfs = sysfs_get_dirent(nvdimm_kobj(nvdimm)->sd, "nfit"); + if (nfit_kernfs) + nfit_mem->flags_attr = sysfs_get_dirent(nfit_kernfs, + "flags"); + sysfs_put(nfit_kernfs); + if (!nfit_mem->flags_attr) + dev_warn(acpi_desc->dev, "%s: notifications disabled\n", + nvdimm_name(nvdimm)); + } + + return devm_add_action_or_reset(acpi_desc->dev, shutdown_dimm_notify, + acpi_desc); } static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc) @@ -1491,9 +1635,9 @@ static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc, if (!info) return -ENOMEM; for (i = 0; i < nr; i++) { - struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i]; + struct nd_mapping_desc *mapping = &ndr_desc->mapping[i]; struct nfit_set_info_map *map = &info->mapping[i]; - struct nvdimm *nvdimm = nd_mapping->nvdimm; + struct nvdimm *nvdimm = mapping->nvdimm; struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); struct acpi_nfit_memory_map *memdev = memdev_from_spa(acpi_desc, spa->range_index, i); @@ -1917,7 +2061,7 @@ static int acpi_nfit_insert_resource(struct acpi_nfit_desc *acpi_desc, } static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, - struct nd_mapping *nd_mapping, struct nd_region_desc *ndr_desc, + struct nd_mapping_desc *mapping, struct nd_region_desc *ndr_desc, struct acpi_nfit_memory_map *memdev, struct nfit_spa *nfit_spa) { @@ -1934,12 +2078,12 @@ static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, return -ENODEV; } - nd_mapping->nvdimm = nvdimm; + mapping->nvdimm = nvdimm; switch (nfit_spa_type(spa)) { case NFIT_SPA_PM: case NFIT_SPA_VOLATILE: - nd_mapping->start = memdev->address; - nd_mapping->size = memdev->region_size; + mapping->start = memdev->address; + mapping->size = memdev->region_size; break; case NFIT_SPA_DCR: nfit_mem = nvdimm_provider_data(nvdimm); @@ -1947,13 +2091,13 @@ static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, dev_dbg(acpi_desc->dev, "spa%d %s missing bdw\n", spa->range_index, nvdimm_name(nvdimm)); } else { - nd_mapping->size = nfit_mem->bdw->capacity; - nd_mapping->start = nfit_mem->bdw->start_address; + mapping->size = nfit_mem->bdw->capacity; + mapping->start = nfit_mem->bdw->start_address; ndr_desc->num_lanes = nfit_mem->bdw->windows; blk_valid = 1; } - ndr_desc->nd_mapping = nd_mapping; + ndr_desc->mapping = mapping; ndr_desc->num_mappings = blk_valid; ndbr_desc = to_blk_region_desc(ndr_desc); ndbr_desc->enable = acpi_nfit_blk_region_enable; @@ -1979,7 +2123,7 @@ static bool nfit_spa_is_virtual(struct acpi_nfit_system_address *spa) static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, struct nfit_spa *nfit_spa) { - static struct nd_mapping nd_mappings[ND_MAX_MAPPINGS]; + static struct nd_mapping_desc mappings[ND_MAX_MAPPINGS]; struct acpi_nfit_system_address *spa = nfit_spa->spa; struct nd_blk_region_desc ndbr_desc; struct nd_region_desc *ndr_desc; @@ -1998,7 +2142,7 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, } memset(&res, 0, sizeof(res)); - memset(&nd_mappings, 0, sizeof(nd_mappings)); + memset(&mappings, 0, sizeof(mappings)); memset(&ndbr_desc, 0, sizeof(ndbr_desc)); res.start = spa->address; res.end = res.start + spa->length - 1; @@ -2014,7 +2158,7 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { struct acpi_nfit_memory_map *memdev = nfit_memdev->memdev; - struct nd_mapping *nd_mapping; + struct nd_mapping_desc *mapping; if (memdev->range_index != spa->range_index) continue; @@ -2023,14 +2167,14 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, spa->range_index, ND_MAX_MAPPINGS); return -ENXIO; } - nd_mapping = &nd_mappings[count++]; - rc = acpi_nfit_init_mapping(acpi_desc, nd_mapping, ndr_desc, + mapping = &mappings[count++]; + rc = acpi_nfit_init_mapping(acpi_desc, mapping, ndr_desc, memdev, nfit_spa); if (rc) goto out; } - ndr_desc->nd_mapping = nd_mappings; + ndr_desc->mapping = mappings; ndr_desc->num_mappings = count; rc = acpi_nfit_init_interleave_set(acpi_desc, ndr_desc, spa); if (rc) @@ -2678,29 +2822,30 @@ static int acpi_nfit_remove(struct acpi_device *adev) return 0; } -static void acpi_nfit_notify(struct acpi_device *adev, u32 event) +void __acpi_nfit_notify(struct device *dev, acpi_handle handle, u32 event) { - struct acpi_nfit_desc *acpi_desc = dev_get_drvdata(&adev->dev); + struct acpi_nfit_desc *acpi_desc = dev_get_drvdata(dev); struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; - struct device *dev = &adev->dev; union acpi_object *obj; acpi_status status; int ret; dev_dbg(dev, "%s: event: %d\n", __func__, event); - device_lock(dev); + if (event != NFIT_NOTIFY_UPDATE) + return; + if (!dev->driver) { /* dev->driver may be null if we're being removed */ dev_dbg(dev, "%s: no driver found for dev\n", __func__); - goto out_unlock; + return; } if (!acpi_desc) { acpi_desc = devm_kzalloc(dev, sizeof(*acpi_desc), GFP_KERNEL); if (!acpi_desc) - goto out_unlock; - acpi_nfit_desc_init(acpi_desc, &adev->dev); + return; + acpi_nfit_desc_init(acpi_desc, dev); } else { /* * Finish previous registration before considering new @@ -2710,10 +2855,10 @@ static void acpi_nfit_notify(struct acpi_device *adev, u32 event) } /* Evaluate _FIT */ - status = acpi_evaluate_object(adev->handle, "_FIT", NULL, &buf); + status = acpi_evaluate_object(handle, "_FIT", NULL, &buf); if (ACPI_FAILURE(status)) { dev_err(dev, "failed to evaluate _FIT\n"); - goto out_unlock; + return; } obj = buf.pointer; @@ -2725,9 +2870,14 @@ static void acpi_nfit_notify(struct acpi_device *adev, u32 event) } else dev_err(dev, "Invalid _FIT\n"); kfree(buf.pointer); +} +EXPORT_SYMBOL_GPL(__acpi_nfit_notify); - out_unlock: - device_unlock(dev); +static void acpi_nfit_notify(struct acpi_device *adev, u32 event) +{ + device_lock(&adev->dev); + __acpi_nfit_notify(&adev->dev, adev->handle, event); + device_unlock(&adev->dev); } static const struct acpi_device_id acpi_nfit_ids[] = { diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c index 161f91539ae6..e5ce81c38eed 100644 --- a/drivers/acpi/nfit/mce.c +++ b/drivers/acpi/nfit/mce.c @@ -14,6 +14,7 @@ */ #include <linux/notifier.h> #include <linux/acpi.h> +#include <linux/nd.h> #include <asm/mce.h> #include "nfit.h" @@ -62,12 +63,25 @@ static int nfit_handle_mce(struct notifier_block *nb, unsigned long val, } mutex_unlock(&acpi_desc->init_mutex); - /* - * We can ignore an -EBUSY here because if an ARS is already - * in progress, just let that be the last authoritative one - */ - if (found_match) + if (!found_match) + continue; + + /* If this fails due to an -ENOMEM, there is little we can do */ + nvdimm_bus_add_poison(acpi_desc->nvdimm_bus, + ALIGN(mce->addr, L1_CACHE_BYTES), + L1_CACHE_BYTES); + nvdimm_region_notify(nfit_spa->nd_region, + NVDIMM_REVALIDATE_POISON); + + if (acpi_desc->scrub_mode == HW_ERROR_SCRUB_ON) { + /* + * We can ignore an -EBUSY here because if an ARS is + * already in progress, just let that be the last + * authoritative one + */ acpi_nfit_ars_rescan(acpi_desc); + } + break; } mutex_unlock(&acpi_desc_lock); diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h index e894ded24d99..14296f5267c8 100644 --- a/drivers/acpi/nfit/nfit.h +++ b/drivers/acpi/nfit/nfit.h @@ -78,6 +78,14 @@ enum { NFIT_ARS_TIMEOUT = 90, }; +enum nfit_root_notifiers { + NFIT_NOTIFY_UPDATE = 0x80, +}; + +enum nfit_dimm_notifiers { + NFIT_NOTIFY_DIMM_HEALTH = 0x81, +}; + struct nfit_spa { struct list_head list; struct nd_region *nd_region; @@ -124,6 +132,7 @@ struct nfit_mem { struct acpi_nfit_system_address *spa_bdw; struct acpi_nfit_interleave *idt_dcr; struct acpi_nfit_interleave *idt_bdw; + struct kernfs_node *flags_attr; struct nfit_flush *nfit_flush; struct list_head list; struct acpi_device *adev; @@ -152,6 +161,7 @@ struct acpi_nfit_desc { struct list_head list; struct kernfs_node *scrub_count_state; unsigned int scrub_count; + unsigned int scrub_mode; unsigned int cancel:1; unsigned long dimm_cmd_force_en; unsigned long bus_cmd_force_en; @@ -159,6 +169,11 @@ struct acpi_nfit_desc { void *iobuf, u64 len, int rw); }; +enum scrub_mode { + HW_ERROR_SCRUB_OFF, + HW_ERROR_SCRUB_ON, +}; + enum nd_blk_mmio_selector { BDW, DCR, @@ -223,5 +238,7 @@ static inline struct acpi_nfit_desc *to_acpi_desc( const u8 *to_nfit_uuid(enum nfit_uuids id); int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *nfit, acpi_size sz); +void __acpi_nfit_notify(struct device *dev, acpi_handle handle, u32 event); +void __acpi_nvdimm_notify(struct device *dev, u32 event); void acpi_nfit_desc_init(struct acpi_nfit_desc *acpi_desc, struct device *dev); #endif /* __NFIT_H__ */ diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index cedab7572de3..daadd20aa936 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -23,4 +23,9 @@ config DEV_DAX_PMEM Say Y if unsure +config NR_DEV_DAX + int "Maximum number of Device-DAX instances" + default 32768 + range 256 2147483647 + endif diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 29f600f2c447..0e499bfca41c 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -13,15 +13,25 @@ #include <linux/pagemap.h> #include <linux/module.h> #include <linux/device.h> +#include <linux/mount.h> #include <linux/pfn_t.h> +#include <linux/hash.h> +#include <linux/cdev.h> #include <linux/slab.h> #include <linux/dax.h> #include <linux/fs.h> #include <linux/mm.h> +#include "dax.h" -static int dax_major; +static dev_t dax_devt; static struct class *dax_class; static DEFINE_IDA(dax_minor_ida); +static int nr_dax = CONFIG_NR_DEV_DAX; +module_param(nr_dax, int, S_IRUGO); +static struct vfsmount *dax_mnt; +static struct kmem_cache *dax_cache __read_mostly; +static struct super_block *dax_superblock __read_mostly; +MODULE_PARM_DESC(nr_dax, "max number of device-dax instances"); /** * struct dax_region - mapping infrastructure for dax devices @@ -48,7 +58,7 @@ struct dax_region { * struct dax_dev - subdivision of a dax region * @region - parent region * @dev - device backing the character device - * @kref - enable this data to be tracked in filp->private_data + * @cdev - core chardev data * @alive - !alive + rcu grace period == no new mappings can be established * @id - child id in the region * @num_resources - number of physical address extents in this device @@ -56,41 +66,139 @@ struct dax_region { */ struct dax_dev { struct dax_region *region; - struct device *dev; - struct kref kref; + struct inode *inode; + struct device dev; + struct cdev cdev; bool alive; int id; int num_resources; struct resource res[0]; }; -static void dax_region_free(struct kref *kref) +static struct inode *dax_alloc_inode(struct super_block *sb) { - struct dax_region *dax_region; + return kmem_cache_alloc(dax_cache, GFP_KERNEL); +} - dax_region = container_of(kref, struct dax_region, kref); - kfree(dax_region); +static void dax_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + + kmem_cache_free(dax_cache, inode); } -void dax_region_put(struct dax_region *dax_region) +static void dax_destroy_inode(struct inode *inode) { - kref_put(&dax_region->kref, dax_region_free); + call_rcu(&inode->i_rcu, dax_i_callback); } -EXPORT_SYMBOL_GPL(dax_region_put); -static void dax_dev_free(struct kref *kref) +static const struct super_operations dax_sops = { + .statfs = simple_statfs, + .alloc_inode = dax_alloc_inode, + .destroy_inode = dax_destroy_inode, + .drop_inode = generic_delete_inode, +}; + +static struct dentry *dax_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - struct dax_dev *dax_dev; + return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC); +} - dax_dev = container_of(kref, struct dax_dev, kref); - dax_region_put(dax_dev->region); - kfree(dax_dev); +static struct file_system_type dax_type = { + .name = "dax", + .mount = dax_mount, + .kill_sb = kill_anon_super, +}; + +static int dax_test(struct inode *inode, void *data) +{ + return inode->i_cdev == data; +} + +static int dax_set(struct inode *inode, void *data) +{ + inode->i_cdev = data; + return 0; +} + +static struct inode *dax_inode_get(struct cdev *cdev, dev_t devt) +{ + struct inode *inode; + + inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31), + dax_test, dax_set, cdev); + + if (!inode) + return NULL; + + if (inode->i_state & I_NEW) { + inode->i_mode = S_IFCHR; + inode->i_flags = S_DAX; + inode->i_rdev = devt; + mapping_set_gfp_mask(&inode->i_data, GFP_USER); + unlock_new_inode(inode); + } + return inode; +} + +static void init_once(void *inode) +{ + inode_init_once(inode); +} + +static int dax_inode_init(void) +{ + int rc; + + dax_cache = kmem_cache_create("dax_cache", sizeof(struct inode), 0, + (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD|SLAB_ACCOUNT), + init_once); + if (!dax_cache) + return -ENOMEM; + + rc = register_filesystem(&dax_type); + if (rc) + goto err_register_fs; + + dax_mnt = kern_mount(&dax_type); + if (IS_ERR(dax_mnt)) { + rc = PTR_ERR(dax_mnt); + goto err_mount; + } + dax_superblock = dax_mnt->mnt_sb; + + return 0; + + err_mount: + unregister_filesystem(&dax_type); + err_register_fs: + kmem_cache_destroy(dax_cache); + + return rc; } -static void dax_dev_put(struct dax_dev *dax_dev) +static void dax_inode_exit(void) +{ + kern_unmount(dax_mnt); + unregister_filesystem(&dax_type); + kmem_cache_destroy(dax_cache); +} + +static void dax_region_free(struct kref *kref) +{ + struct dax_region *dax_region; + + dax_region = container_of(kref, struct dax_region, kref); + kfree(dax_region); +} + +void dax_region_put(struct dax_region *dax_region) { - kref_put(&dax_dev->kref, dax_dev_free); + kref_put(&dax_region->kref, dax_region_free); } +EXPORT_SYMBOL_GPL(dax_region_put); struct dax_region *alloc_dax_region(struct device *parent, int region_id, struct resource *res, unsigned int align, void *addr, @@ -98,8 +206,11 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, { struct dax_region *dax_region; - dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL); + if (!IS_ALIGNED(res->start, align) + || !IS_ALIGNED(resource_size(res), align)) + return NULL; + dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL); if (!dax_region) return NULL; @@ -116,10 +227,15 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, } EXPORT_SYMBOL_GPL(alloc_dax_region); +static struct dax_dev *to_dax_dev(struct device *dev) +{ + return container_of(dev, struct dax_dev, dev); +} + static ssize_t size_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct dax_dev *dax_dev = dev_get_drvdata(dev); + struct dax_dev *dax_dev = to_dax_dev(dev); unsigned long long size = 0; int i; @@ -144,180 +260,11 @@ static const struct attribute_group *dax_attribute_groups[] = { NULL, }; -static void unregister_dax_dev(void *_dev) -{ - struct device *dev = _dev; - struct dax_dev *dax_dev = dev_get_drvdata(dev); - struct dax_region *dax_region = dax_dev->region; - - dev_dbg(dev, "%s\n", __func__); - - /* - * Note, rcu is not protecting the liveness of dax_dev, rcu is - * ensuring that any fault handlers that might have seen - * dax_dev->alive == true, have completed. Any fault handlers - * that start after synchronize_rcu() has started will abort - * upon seeing dax_dev->alive == false. - */ - dax_dev->alive = false; - synchronize_rcu(); - - get_device(dev); - device_unregister(dev); - ida_simple_remove(&dax_region->ida, dax_dev->id); - ida_simple_remove(&dax_minor_ida, MINOR(dev->devt)); - put_device(dev); - dax_dev_put(dax_dev); -} - -int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, - int count) -{ - struct device *parent = dax_region->dev; - struct dax_dev *dax_dev; - struct device *dev; - int rc, minor; - dev_t dev_t; - - dax_dev = kzalloc(sizeof(*dax_dev) + sizeof(*res) * count, GFP_KERNEL); - if (!dax_dev) - return -ENOMEM; - memcpy(dax_dev->res, res, sizeof(*res) * count); - dax_dev->num_resources = count; - kref_init(&dax_dev->kref); - dax_dev->alive = true; - dax_dev->region = dax_region; - kref_get(&dax_region->kref); - - dax_dev->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL); - if (dax_dev->id < 0) { - rc = dax_dev->id; - goto err_id; - } - - minor = ida_simple_get(&dax_minor_ida, 0, 0, GFP_KERNEL); - if (minor < 0) { - rc = minor; - goto err_minor; - } - - dev_t = MKDEV(dax_major, minor); - dev = device_create_with_groups(dax_class, parent, dev_t, dax_dev, - dax_attribute_groups, "dax%d.%d", dax_region->id, - dax_dev->id); - if (IS_ERR(dev)) { - rc = PTR_ERR(dev); - goto err_create; - } - dax_dev->dev = dev; - - rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_dev, dev); - if (rc) - return rc; - - return 0; - - err_create: - ida_simple_remove(&dax_minor_ida, minor); - err_minor: - ida_simple_remove(&dax_region->ida, dax_dev->id); - err_id: - dax_dev_put(dax_dev); - - return rc; -} -EXPORT_SYMBOL_GPL(devm_create_dax_dev); - -/* return an unmapped area aligned to the dax region specified alignment */ -static unsigned long dax_dev_get_unmapped_area(struct file *filp, - unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - unsigned long off, off_end, off_align, len_align, addr_align, align; - struct dax_dev *dax_dev = filp ? filp->private_data : NULL; - struct dax_region *dax_region; - - if (!dax_dev || addr) - goto out; - - dax_region = dax_dev->region; - align = dax_region->align; - off = pgoff << PAGE_SHIFT; - off_end = off + len; - off_align = round_up(off, align); - - if ((off_end <= off_align) || ((off_end - off_align) < align)) - goto out; - - len_align = len + align; - if ((off + len_align) < off) - goto out; - - addr_align = current->mm->get_unmapped_area(filp, addr, len_align, - pgoff, flags); - if (!IS_ERR_VALUE(addr_align)) { - addr_align += (off - addr_align) & (align - 1); - return addr_align; - } - out: - return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); -} - -static int __match_devt(struct device *dev, const void *data) -{ - const dev_t *devt = data; - - return dev->devt == *devt; -} - -static struct device *dax_dev_find(dev_t dev_t) -{ - return class_find_device(dax_class, NULL, &dev_t, __match_devt); -} - -static int dax_dev_open(struct inode *inode, struct file *filp) -{ - struct dax_dev *dax_dev = NULL; - struct device *dev; - - dev = dax_dev_find(inode->i_rdev); - if (!dev) - return -ENXIO; - - device_lock(dev); - dax_dev = dev_get_drvdata(dev); - if (dax_dev) { - dev_dbg(dev, "%s\n", __func__); - filp->private_data = dax_dev; - kref_get(&dax_dev->kref); - inode->i_flags = S_DAX; - } - device_unlock(dev); - - if (!dax_dev) { - put_device(dev); - return -ENXIO; - } - return 0; -} - -static int dax_dev_release(struct inode *inode, struct file *filp) -{ - struct dax_dev *dax_dev = filp->private_data; - struct device *dev = dax_dev->dev; - - dev_dbg(dax_dev->dev, "%s\n", __func__); - dax_dev_put(dax_dev); - put_device(dev); - - return 0; -} - static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma, const char *func) { struct dax_region *dax_region = dax_dev->region; - struct device *dev = dax_dev->dev; + struct device *dev = &dax_dev->dev; unsigned long mask; if (!dax_dev->alive) @@ -382,7 +329,7 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = (unsigned long) vmf->virtual_address; - struct device *dev = dax_dev->dev; + struct device *dev = &dax_dev->dev; struct dax_region *dax_region; int rc = VM_FAULT_SIGBUS; phys_addr_t phys; @@ -422,7 +369,7 @@ static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct file *filp = vma->vm_file; struct dax_dev *dax_dev = filp->private_data; - dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, + dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, current->comm, (vmf->flags & FAULT_FLAG_WRITE) ? "write" : "read", vma->vm_start, vma->vm_end); rcu_read_lock(); @@ -437,7 +384,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, unsigned int flags) { unsigned long pmd_addr = addr & PMD_MASK; - struct device *dev = dax_dev->dev; + struct device *dev = &dax_dev->dev; struct dax_region *dax_region; phys_addr_t phys; pgoff_t pgoff; @@ -479,7 +426,7 @@ static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr, struct file *filp = vma->vm_file; struct dax_dev *dax_dev = filp->private_data; - dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, + dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, current->comm, (flags & FAULT_FLAG_WRITE) ? "write" : "read", vma->vm_start, vma->vm_end); @@ -490,81 +437,257 @@ static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr, return rc; } -static void dax_dev_vm_open(struct vm_area_struct *vma) -{ - struct file *filp = vma->vm_file; - struct dax_dev *dax_dev = filp->private_data; - - dev_dbg(dax_dev->dev, "%s\n", __func__); - kref_get(&dax_dev->kref); -} - -static void dax_dev_vm_close(struct vm_area_struct *vma) -{ - struct file *filp = vma->vm_file; - struct dax_dev *dax_dev = filp->private_data; - - dev_dbg(dax_dev->dev, "%s\n", __func__); - dax_dev_put(dax_dev); -} - static const struct vm_operations_struct dax_dev_vm_ops = { .fault = dax_dev_fault, .pmd_fault = dax_dev_pmd_fault, - .open = dax_dev_vm_open, - .close = dax_dev_vm_close, }; -static int dax_dev_mmap(struct file *filp, struct vm_area_struct *vma) +static int dax_mmap(struct file *filp, struct vm_area_struct *vma) { struct dax_dev *dax_dev = filp->private_data; int rc; - dev_dbg(dax_dev->dev, "%s\n", __func__); + dev_dbg(&dax_dev->dev, "%s\n", __func__); rc = check_vma(dax_dev, vma, __func__); if (rc) return rc; - kref_get(&dax_dev->kref); vma->vm_ops = &dax_dev_vm_ops; vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; return 0; +} + +/* return an unmapped area aligned to the dax region specified alignment */ +static unsigned long dax_get_unmapped_area(struct file *filp, + unsigned long addr, unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + unsigned long off, off_end, off_align, len_align, addr_align, align; + struct dax_dev *dax_dev = filp ? filp->private_data : NULL; + struct dax_region *dax_region; + + if (!dax_dev || addr) + goto out; + + dax_region = dax_dev->region; + align = dax_region->align; + off = pgoff << PAGE_SHIFT; + off_end = off + len; + off_align = round_up(off, align); + + if ((off_end <= off_align) || ((off_end - off_align) < align)) + goto out; + + len_align = len + align; + if ((off + len_align) < off) + goto out; + addr_align = current->mm->get_unmapped_area(filp, addr, len_align, + pgoff, flags); + if (!IS_ERR_VALUE(addr_align)) { + addr_align += (off - addr_align) & (align - 1); + return addr_align; + } + out: + return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); +} + +static int dax_open(struct inode *inode, struct file *filp) +{ + struct dax_dev *dax_dev; + + dax_dev = container_of(inode->i_cdev, struct dax_dev, cdev); + dev_dbg(&dax_dev->dev, "%s\n", __func__); + inode->i_mapping = dax_dev->inode->i_mapping; + inode->i_mapping->host = dax_dev->inode; + filp->f_mapping = inode->i_mapping; + filp->private_data = dax_dev; + inode->i_flags = S_DAX; + + return 0; +} + +static int dax_release(struct inode *inode, struct file *filp) +{ + struct dax_dev *dax_dev = filp->private_data; + + dev_dbg(&dax_dev->dev, "%s\n", __func__); + return 0; } static const struct file_operations dax_fops = { .llseek = noop_llseek, .owner = THIS_MODULE, - .open = dax_dev_open, - .release = dax_dev_release, - .get_unmapped_area = dax_dev_get_unmapped_area, - .mmap = dax_dev_mmap, + .open = dax_open, + .release = dax_release, + .get_unmapped_area = dax_get_unmapped_area, + .mmap = dax_mmap, }; +static void dax_dev_release(struct device *dev) +{ + struct dax_dev *dax_dev = to_dax_dev(dev); + struct dax_region *dax_region = dax_dev->region; + + ida_simple_remove(&dax_region->ida, dax_dev->id); + ida_simple_remove(&dax_minor_ida, MINOR(dev->devt)); + dax_region_put(dax_region); + iput(dax_dev->inode); + kfree(dax_dev); +} + +static void unregister_dax_dev(void *dev) +{ + struct dax_dev *dax_dev = to_dax_dev(dev); + struct cdev *cdev = &dax_dev->cdev; + + dev_dbg(dev, "%s\n", __func__); + + /* + * Note, rcu is not protecting the liveness of dax_dev, rcu is + * ensuring that any fault handlers that might have seen + * dax_dev->alive == true, have completed. Any fault handlers + * that start after synchronize_rcu() has started will abort + * upon seeing dax_dev->alive == false. + */ + dax_dev->alive = false; + synchronize_rcu(); + unmap_mapping_range(dax_dev->inode->i_mapping, 0, 0, 1); + cdev_del(cdev); + device_unregister(dev); +} + +struct dax_dev *devm_create_dax_dev(struct dax_region *dax_region, + struct resource *res, int count) +{ + struct device *parent = dax_region->dev; + struct dax_dev *dax_dev; + int rc = 0, minor, i; + struct device *dev; + struct cdev *cdev; + dev_t dev_t; + + dax_dev = kzalloc(sizeof(*dax_dev) + sizeof(*res) * count, GFP_KERNEL); + if (!dax_dev) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < count; i++) { + if (!IS_ALIGNED(res[i].start, dax_region->align) + || !IS_ALIGNED(resource_size(&res[i]), + dax_region->align)) { + rc = -EINVAL; + break; + } + dax_dev->res[i].start = res[i].start; + dax_dev->res[i].end = res[i].end; + } + + if (i < count) + goto err_id; + + dax_dev->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL); + if (dax_dev->id < 0) { + rc = dax_dev->id; + goto err_id; + } + + minor = ida_simple_get(&dax_minor_ida, 0, 0, GFP_KERNEL); + if (minor < 0) { + rc = minor; + goto err_minor; + } + + dev_t = MKDEV(MAJOR(dax_devt), minor); + dev = &dax_dev->dev; + dax_dev->inode = dax_inode_get(&dax_dev->cdev, dev_t); + if (!dax_dev->inode) { + rc = -ENOMEM; + goto err_inode; + } + + /* device_initialize() so cdev can reference kobj parent */ + device_initialize(dev); + + cdev = &dax_dev->cdev; + cdev_init(cdev, &dax_fops); + cdev->owner = parent->driver->owner; + cdev->kobj.parent = &dev->kobj; + rc = cdev_add(&dax_dev->cdev, dev_t, 1); + if (rc) + goto err_cdev; + + /* from here on we're committed to teardown via dax_dev_release() */ + dax_dev->num_resources = count; + dax_dev->alive = true; + dax_dev->region = dax_region; + kref_get(&dax_region->kref); + + dev->devt = dev_t; + dev->class = dax_class; + dev->parent = parent; + dev->groups = dax_attribute_groups; + dev->release = dax_dev_release; + dev_set_name(dev, "dax%d.%d", dax_region->id, dax_dev->id); + rc = device_add(dev); + if (rc) { + put_device(dev); + return ERR_PTR(rc); + } + + rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_dev, dev); + if (rc) + return ERR_PTR(rc); + + return dax_dev; + + err_cdev: + iput(dax_dev->inode); + err_inode: + ida_simple_remove(&dax_minor_ida, minor); + err_minor: + ida_simple_remove(&dax_region->ida, dax_dev->id); + err_id: + kfree(dax_dev); + + return ERR_PTR(rc); +} +EXPORT_SYMBOL_GPL(devm_create_dax_dev); + static int __init dax_init(void) { int rc; - rc = register_chrdev(0, "dax", &dax_fops); - if (rc < 0) + rc = dax_inode_init(); + if (rc) return rc; - dax_major = rc; + + nr_dax = max(nr_dax, 256); + rc = alloc_chrdev_region(&dax_devt, 0, nr_dax, "dax"); + if (rc) + goto err_chrdev; dax_class = class_create(THIS_MODULE, "dax"); if (IS_ERR(dax_class)) { - unregister_chrdev(dax_major, "dax"); - return PTR_ERR(dax_class); + rc = PTR_ERR(dax_class); + goto err_class; } return 0; + + err_class: + unregister_chrdev_region(dax_devt, nr_dax); + err_chrdev: + dax_inode_exit(); + return rc; } static void __exit dax_exit(void) { class_destroy(dax_class); - unregister_chrdev(dax_major, "dax"); + unregister_chrdev_region(dax_devt, nr_dax); ida_destroy(&dax_minor_ida); + dax_inode_exit(); } MODULE_AUTHOR("Intel Corporation"); diff --git a/drivers/dax/dax.h b/drivers/dax/dax.h index d8b8f1f25054..ddd829ab58c0 100644 --- a/drivers/dax/dax.h +++ b/drivers/dax/dax.h @@ -13,12 +13,13 @@ #ifndef __DAX_H__ #define __DAX_H__ struct device; +struct dax_dev; struct resource; struct dax_region; void dax_region_put(struct dax_region *dax_region); struct dax_region *alloc_dax_region(struct device *parent, int region_id, struct resource *res, unsigned int align, void *addr, unsigned long flags); -int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, - int count); +struct dax_dev *devm_create_dax_dev(struct dax_region *dax_region, + struct resource *res, int count); #endif /* __DAX_H__ */ diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c index 1f01e98c83c7..9630d8837ba9 100644 --- a/drivers/dax/pmem.c +++ b/drivers/dax/pmem.c @@ -24,7 +24,7 @@ struct dax_pmem { struct completion cmp; }; -struct dax_pmem *to_dax_pmem(struct percpu_ref *ref) +static struct dax_pmem *to_dax_pmem(struct percpu_ref *ref) { return container_of(ref, struct dax_pmem, ref); } @@ -61,6 +61,7 @@ static int dax_pmem_probe(struct device *dev) int rc; void *addr; struct resource res; + struct dax_dev *dax_dev; struct nd_pfn_sb *pfn_sb; struct dax_pmem *dax_pmem; struct nd_region *nd_region; @@ -126,12 +127,12 @@ static int dax_pmem_probe(struct device *dev) return -ENOMEM; /* TODO: support for subdividing a dax region... */ - rc = devm_create_dax_dev(dax_region, &res, 1); + dax_dev = devm_create_dax_dev(dax_region, &res, 1); /* child dax_dev instances now own the lifetime of the dax_region */ dax_region_put(dax_region); - return rc; + return PTR_ERR_OR_ZERO(dax_dev); } static struct nd_device_driver dax_pmem_driver = { diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig index 124c2432ac9c..8b2b740d6679 100644 --- a/drivers/nvdimm/Kconfig +++ b/drivers/nvdimm/Kconfig @@ -89,7 +89,7 @@ config NVDIMM_PFN Select Y if unsure config NVDIMM_DAX - bool "NVDIMM DAX: Raw access to persistent memory" + tristate "NVDIMM DAX: Raw access to persistent memory" default LIBNVDIMM depends on NVDIMM_PFN help diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index 935866fe5ec2..a8b6949a8778 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -217,6 +217,8 @@ long nvdimm_clear_poison(struct device *dev, phys_addr_t phys, return rc; if (cmd_rc < 0) return cmd_rc; + + nvdimm_clear_from_poison_list(nvdimm_bus, phys, len); return clear_err.cleared; } EXPORT_SYMBOL_GPL(nvdimm_clear_poison); diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index 4d7bbd2df5c0..7ceba08774b6 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -547,11 +547,12 @@ void nvdimm_badblocks_populate(struct nd_region *nd_region, } EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate); -static int add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) +static int add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length, + gfp_t flags) { struct nd_poison *pl; - pl = kzalloc(sizeof(*pl), GFP_KERNEL); + pl = kzalloc(sizeof(*pl), flags); if (!pl) return -ENOMEM; @@ -567,7 +568,7 @@ static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) struct nd_poison *pl; if (list_empty(&nvdimm_bus->poison_list)) - return add_poison(nvdimm_bus, addr, length); + return add_poison(nvdimm_bus, addr, length, GFP_KERNEL); /* * There is a chance this is a duplicate, check for those first. @@ -587,7 +588,7 @@ static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) * as any overlapping ranges will get resolved when the list is consumed * and converted to badblocks */ - return add_poison(nvdimm_bus, addr, length); + return add_poison(nvdimm_bus, addr, length, GFP_KERNEL); } int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) @@ -602,6 +603,70 @@ int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) } EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison); +void nvdimm_clear_from_poison_list(struct nvdimm_bus *nvdimm_bus, + phys_addr_t start, unsigned int len) +{ + struct list_head *poison_list = &nvdimm_bus->poison_list; + u64 clr_end = start + len - 1; + struct nd_poison *pl, *next; + + nvdimm_bus_lock(&nvdimm_bus->dev); + WARN_ON_ONCE(list_empty(poison_list)); + + /* + * [start, clr_end] is the poison interval being cleared. + * [pl->start, pl_end] is the poison_list entry we're comparing + * the above interval against. The poison list entry may need + * to be modified (update either start or length), deleted, or + * split into two based on the overlap characteristics + */ + + list_for_each_entry_safe(pl, next, poison_list, list) { + u64 pl_end = pl->start + pl->length - 1; + + /* Skip intervals with no intersection */ + if (pl_end < start) + continue; + if (pl->start > clr_end) + continue; + /* Delete completely overlapped poison entries */ + if ((pl->start >= start) && (pl_end <= clr_end)) { + list_del(&pl->list); + kfree(pl); + continue; + } + /* Adjust start point of partially cleared entries */ + if ((start <= pl->start) && (clr_end > pl->start)) { + pl->length -= clr_end - pl->start + 1; + pl->start = clr_end + 1; + continue; + } + /* Adjust pl->length for partial clearing at the tail end */ + if ((pl->start < start) && (pl_end <= clr_end)) { + /* pl->start remains the same */ + pl->length = start - pl->start; + continue; + } + /* + * If clearing in the middle of an entry, we split it into + * two by modifying the current entry to represent one half of + * the split, and adding a new entry for the second half. + */ + if ((pl->start < start) && (pl_end > clr_end)) { + u64 new_start = clr_end + 1; + u64 new_len = pl_end - new_start + 1; + + /* Add new entry covering the right half */ + add_poison(nvdimm_bus, new_start, new_len, GFP_NOIO); + /* Adjust this entry to cover the left half */ + pl->length = start - pl->start; + continue; + } + } + nvdimm_bus_unlock(&nvdimm_bus->dev); +} +EXPORT_SYMBOL_GPL(nvdimm_clear_from_poison_list); + #ifdef CONFIG_BLK_DEV_INTEGRITY int nd_integrity_init(struct gendisk *disk, unsigned long meta_size) { diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c index 71d12bb67339..619834e144d1 100644 --- a/drivers/nvdimm/dimm.c +++ b/drivers/nvdimm/dimm.c @@ -26,6 +26,14 @@ static int nvdimm_probe(struct device *dev) struct nvdimm_drvdata *ndd; int rc; + rc = nvdimm_check_config_data(dev); + if (rc) { + /* not required for non-aliased nvdimm, ex. NVDIMM-N */ + if (rc == -ENOTTY) + rc = 0; + return rc; + } + ndd = kzalloc(sizeof(*ndd), GFP_KERNEL); if (!ndd) return -ENOMEM; @@ -72,6 +80,9 @@ static int nvdimm_remove(struct device *dev) { struct nvdimm_drvdata *ndd = dev_get_drvdata(dev); + if (!ndd) + return 0; + nvdimm_bus_lock(dev); dev_set_drvdata(dev, NULL); nvdimm_bus_unlock(dev); diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index d9bba5edd8dc..d614493ad5ac 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -28,28 +28,30 @@ static DEFINE_IDA(dimm_ida); * Retrieve bus and dimm handle and return if this bus supports * get_config_data commands */ -static int __validate_dimm(struct nvdimm_drvdata *ndd) +int nvdimm_check_config_data(struct device *dev) { - struct nvdimm *nvdimm; - - if (!ndd) - return -EINVAL; - - nvdimm = to_nvdimm(ndd->dev); + struct nvdimm *nvdimm = to_nvdimm(dev); - if (!nvdimm->cmd_mask) - return -ENXIO; - if (!test_bit(ND_CMD_GET_CONFIG_DATA, &nvdimm->cmd_mask)) - return -ENXIO; + if (!nvdimm->cmd_mask || + !test_bit(ND_CMD_GET_CONFIG_DATA, &nvdimm->cmd_mask)) { + if (nvdimm->flags & NDD_ALIASING) + return -ENXIO; + else + return -ENOTTY; + } return 0; } static int validate_dimm(struct nvdimm_drvdata *ndd) { - int rc = __validate_dimm(ndd); + int rc; - if (rc && ndd) + if (!ndd) + return -EINVAL; + + rc = nvdimm_check_config_data(ndd->dev); + if (rc) dev_dbg(ndd->dev, "%pf: %s error: %d\n", __builtin_return_address(0), __func__, rc); return rc; @@ -263,6 +265,12 @@ const char *nvdimm_name(struct nvdimm *nvdimm) } EXPORT_SYMBOL_GPL(nvdimm_name); +struct kobject *nvdimm_kobj(struct nvdimm *nvdimm) +{ + return &nvdimm->dev.kobj; +} +EXPORT_SYMBOL_GPL(nvdimm_kobj); + unsigned long nvdimm_cmd_mask(struct nvdimm *nvdimm) { return nvdimm->cmd_mask; @@ -378,40 +386,166 @@ struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, } EXPORT_SYMBOL_GPL(nvdimm_create); +int alias_dpa_busy(struct device *dev, void *data) +{ + resource_size_t map_end, blk_start, new, busy; + struct blk_alloc_info *info = data; + struct nd_mapping *nd_mapping; + struct nd_region *nd_region; + struct nvdimm_drvdata *ndd; + struct resource *res; + int i; + + if (!is_nd_pmem(dev)) + return 0; + + nd_region = to_nd_region(dev); + for (i = 0; i < nd_region->ndr_mappings; i++) { + nd_mapping = &nd_region->mapping[i]; + if (nd_mapping->nvdimm == info->nd_mapping->nvdimm) + break; + } + + if (i >= nd_region->ndr_mappings) + return 0; + + ndd = to_ndd(nd_mapping); + map_end = nd_mapping->start + nd_mapping->size - 1; + blk_start = nd_mapping->start; + + /* + * In the allocation case ->res is set to free space that we are + * looking to validate against PMEM aliasing collision rules + * (i.e. BLK is allocated after all aliased PMEM). + */ + if (info->res) { + if (info->res->start >= nd_mapping->start + && info->res->start < map_end) + /* pass */; + else + return 0; + } + + retry: + /* + * Find the free dpa from the end of the last pmem allocation to + * the end of the interleave-set mapping that is not already + * covered by a blk allocation. + */ + busy = 0; + for_each_dpa_resource(ndd, res) { + if ((res->start >= blk_start && res->start < map_end) + || (res->end >= blk_start + && res->end <= map_end)) { + if (strncmp(res->name, "pmem", 4) == 0) { + new = max(blk_start, min(map_end + 1, + res->end + 1)); + if (new != blk_start) { + blk_start = new; + goto retry; + } + } else + busy += min(map_end, res->end) + - max(nd_mapping->start, res->start) + 1; + } else if (nd_mapping->start > res->start + && map_end < res->end) { + /* total eclipse of the PMEM region mapping */ + busy += nd_mapping->size; + break; + } + } + + /* update the free space range with the probed blk_start */ + if (info->res && blk_start > info->res->start) { + info->res->start = max(info->res->start, blk_start); + if (info->res->start > info->res->end) + info->res->end = info->res->start - 1; + return 1; + } + + info->available -= blk_start - nd_mapping->start + busy; + + return 0; +} + +static int blk_dpa_busy(struct device *dev, void *data) +{ + struct blk_alloc_info *info = data; + struct nd_mapping *nd_mapping; + struct nd_region *nd_region; + resource_size_t map_end; + int i; + + if (!is_nd_pmem(dev)) + return 0; + + nd_region = to_nd_region(dev); + for (i = 0; i < nd_region->ndr_mappings; i++) { + nd_mapping = &nd_region->mapping[i]; + if (nd_mapping->nvdimm == info->nd_mapping->nvdimm) + break; + } + + if (i >= nd_region->ndr_mappings) + return 0; + + map_end = nd_mapping->start + nd_mapping->size - 1; + if (info->res->start >= nd_mapping->start + && info->res->start < map_end) { + if (info->res->end <= map_end) { + info->busy = 0; + return 1; + } else { + info->busy -= info->res->end - map_end; + return 0; + } + } else if (info->res->end >= nd_mapping->start + && info->res->end <= map_end) { + info->busy -= nd_mapping->start - info->res->start; + return 0; + } else { + info->busy -= nd_mapping->size; + return 0; + } +} + /** * nd_blk_available_dpa - account the unused dpa of BLK region * @nd_mapping: container of dpa-resource-root + labels * - * Unlike PMEM, BLK namespaces can occupy discontiguous DPA ranges. + * Unlike PMEM, BLK namespaces can occupy discontiguous DPA ranges, but + * we arrange for them to never start at an lower dpa than the last + * PMEM allocation in an aliased region. */ -resource_size_t nd_blk_available_dpa(struct nd_mapping *nd_mapping) +resource_size_t nd_blk_available_dpa(struct nd_region *nd_region) { + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev); + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); - resource_size_t map_end, busy = 0, available; + struct blk_alloc_info info = { + .nd_mapping = nd_mapping, + .available = nd_mapping->size, + .res = NULL, + }; struct resource *res; if (!ndd) return 0; - map_end = nd_mapping->start + nd_mapping->size - 1; - for_each_dpa_resource(ndd, res) - if (res->start >= nd_mapping->start && res->start < map_end) { - resource_size_t end = min(map_end, res->end); + device_for_each_child(&nvdimm_bus->dev, &info, alias_dpa_busy); - busy += end - res->start + 1; - } else if (res->end >= nd_mapping->start - && res->end <= map_end) { - busy += res->end - nd_mapping->start; - } else if (nd_mapping->start > res->start - && nd_mapping->start < res->end) { - /* total eclipse of the BLK region mapping */ - busy += nd_mapping->size; - } + /* now account for busy blk allocations in unaliased dpa */ + for_each_dpa_resource(ndd, res) { + if (strncmp(res->name, "blk", 3) != 0) + continue; - available = map_end - nd_mapping->start + 1; - if (busy < available) - return available - busy; - return 0; + info.res = res; + info.busy = resource_size(res); + device_for_each_child(&nvdimm_bus->dev, &info, blk_dpa_busy); + info.available -= info.busy; + } + + return info.available; } /** @@ -443,21 +577,16 @@ resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, map_start = nd_mapping->start; map_end = map_start + nd_mapping->size - 1; blk_start = max(map_start, map_end + 1 - *overlap); - for_each_dpa_resource(ndd, res) + for_each_dpa_resource(ndd, res) { if (res->start >= map_start && res->start < map_end) { if (strncmp(res->name, "blk", 3) == 0) - blk_start = min(blk_start, res->start); - else if (res->start != map_start) { + blk_start = min(blk_start, + max(map_start, res->start)); + else if (res->end > map_end) { reason = "misaligned to iset"; goto err; - } else { - if (busy) { - reason = "duplicate overlapping PMEM reservations?"; - goto err; - } + } else busy += resource_size(res); - continue; - } } else if (res->end >= map_start && res->end <= map_end) { if (strncmp(res->name, "blk", 3) == 0) { /* @@ -466,15 +595,14 @@ resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, * be used for BLK. */ blk_start = map_start; - } else { - reason = "misaligned to iset"; - goto err; - } + } else + busy += resource_size(res); } else if (map_start > res->start && map_start < res->end) { /* total eclipse of the mapping */ busy += nd_mapping->size; blk_start = map_start; } + } *overlap = map_end + 1 - blk_start; available = blk_start - map_start; @@ -483,10 +611,6 @@ resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, return 0; err: - /* - * Something is wrong, PMEM must align with the start of the - * interleave set, and there can only be one allocation per set. - */ nd_dbg_dpa(nd_region, ndd, res, "%s\n", reason); return 0; } diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c index 96526dcfdd37..fac7cabe8f56 100644 --- a/drivers/nvdimm/label.c +++ b/drivers/nvdimm/label.c @@ -494,11 +494,13 @@ static int __pmem_label_update(struct nd_region *nd_region, struct nd_mapping *nd_mapping, struct nd_namespace_pmem *nspm, int pos) { - u64 cookie = nd_region_interleave_set_cookie(nd_region), rawsize; + u64 cookie = nd_region_interleave_set_cookie(nd_region); struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); - struct nd_namespace_label *victim_label; + struct nd_label_ent *label_ent, *victim = NULL; struct nd_namespace_label *nd_label; struct nd_namespace_index *nsindex; + struct nd_label_id label_id; + struct resource *res; unsigned long *free; u32 nslot, slot; size_t offset; @@ -507,6 +509,16 @@ static int __pmem_label_update(struct nd_region *nd_region, if (!preamble_next(ndd, &nsindex, &free, &nslot)) return -ENXIO; + nd_label_gen_id(&label_id, nspm->uuid, 0); + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id.id) == 0) + break; + + if (!res) { + WARN_ON_ONCE(1); + return -ENXIO; + } + /* allocate and write the label to the staging (next) index */ slot = nd_label_alloc_slot(ndd); if (slot == UINT_MAX) @@ -522,11 +534,10 @@ static int __pmem_label_update(struct nd_region *nd_region, nd_label->nlabel = __cpu_to_le16(nd_region->ndr_mappings); nd_label->position = __cpu_to_le16(pos); nd_label->isetcookie = __cpu_to_le64(cookie); - rawsize = div_u64(resource_size(&nspm->nsio.res), - nd_region->ndr_mappings); - nd_label->rawsize = __cpu_to_le64(rawsize); - nd_label->dpa = __cpu_to_le64(nd_mapping->start); + nd_label->rawsize = __cpu_to_le64(resource_size(res)); + nd_label->dpa = __cpu_to_le64(res->start); nd_label->slot = __cpu_to_le32(slot); + nd_dbg_dpa(nd_region, ndd, res, "%s\n", __func__); /* update label */ offset = nd_label_offset(ndd, nd_label); @@ -536,38 +547,43 @@ static int __pmem_label_update(struct nd_region *nd_region, return rc; /* Garbage collect the previous label */ - victim_label = nd_mapping->labels[0]; - if (victim_label) { - slot = to_slot(ndd, victim_label); - nd_label_free_slot(ndd, slot); + mutex_lock(&nd_mapping->lock); + list_for_each_entry(label_ent, &nd_mapping->labels, list) { + if (!label_ent->label) + continue; + if (memcmp(nspm->uuid, label_ent->label->uuid, + NSLABEL_UUID_LEN) != 0) + continue; + victim = label_ent; + list_move_tail(&victim->list, &nd_mapping->labels); + break; + } + if (victim) { dev_dbg(ndd->dev, "%s: free: %d\n", __func__, slot); + slot = to_slot(ndd, victim->label); + nd_label_free_slot(ndd, slot); + victim->label = NULL; } /* update index */ rc = nd_label_write_index(ndd, ndd->ns_next, nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0); - if (rc < 0) - return rc; - - nd_mapping->labels[0] = nd_label; - - return 0; -} - -static void del_label(struct nd_mapping *nd_mapping, int l) -{ - struct nd_namespace_label *next_label, *nd_label; - struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); - unsigned int slot; - int j; - - nd_label = nd_mapping->labels[l]; - slot = to_slot(ndd, nd_label); - dev_vdbg(ndd->dev, "%s: clear: %d\n", __func__, slot); + if (rc == 0) { + list_for_each_entry(label_ent, &nd_mapping->labels, list) + if (!label_ent->label) { + label_ent->label = nd_label; + nd_label = NULL; + break; + } + dev_WARN_ONCE(&nspm->nsio.common.dev, nd_label, + "failed to track label: %d\n", + to_slot(ndd, nd_label)); + if (nd_label) + rc = -ENXIO; + } + mutex_unlock(&nd_mapping->lock); - for (j = l; (next_label = nd_mapping->labels[j + 1]); j++) - nd_mapping->labels[j] = next_label; - nd_mapping->labels[j] = NULL; + return rc; } static bool is_old_resource(struct resource *res, struct resource **list, int n) @@ -607,14 +623,16 @@ static int __blk_label_update(struct nd_region *nd_region, struct nd_mapping *nd_mapping, struct nd_namespace_blk *nsblk, int num_labels) { - int i, l, alloc, victims, nfree, old_num_resources, nlabel, rc = -ENXIO; + int i, alloc, victims, nfree, old_num_resources, nlabel, rc = -ENXIO; struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); struct nd_namespace_label *nd_label; + struct nd_label_ent *label_ent, *e; struct nd_namespace_index *nsindex; unsigned long *free, *victim_map = NULL; struct resource *res, **old_res_list; struct nd_label_id label_id; u8 uuid[NSLABEL_UUID_LEN]; + LIST_HEAD(list); u32 nslot, slot; if (!preamble_next(ndd, &nsindex, &free, &nslot)) @@ -736,15 +754,22 @@ static int __blk_label_update(struct nd_region *nd_region, * entries in nd_mapping->labels */ nlabel = 0; - for_each_label(l, nd_label, nd_mapping->labels) { + mutex_lock(&nd_mapping->lock); + list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list) { + nd_label = label_ent->label; + if (!nd_label) + continue; nlabel++; memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN); if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) != 0) continue; nlabel--; - del_label(nd_mapping, l); - l--; /* retry with the new label at this index */ + list_move(&label_ent->list, &list); + label_ent->label = NULL; } + list_splice_tail_init(&list, &nd_mapping->labels); + mutex_unlock(&nd_mapping->lock); + if (nlabel + nsblk->num_resources > num_labels) { /* * Bug, we can't end up with more resources than @@ -755,6 +780,15 @@ static int __blk_label_update(struct nd_region *nd_region, goto out; } + mutex_lock(&nd_mapping->lock); + label_ent = list_first_entry_or_null(&nd_mapping->labels, + typeof(*label_ent), list); + if (!label_ent) { + WARN_ON(1); + mutex_unlock(&nd_mapping->lock); + rc = -ENXIO; + goto out; + } for_each_clear_bit_le(slot, free, nslot) { nd_label = nd_label_base(ndd) + slot; memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN); @@ -762,11 +796,19 @@ static int __blk_label_update(struct nd_region *nd_region, continue; res = to_resource(ndd, nd_label); res->flags &= ~DPA_RESOURCE_ADJUSTED; - dev_vdbg(&nsblk->common.dev, "assign label[%d] slot: %d\n", - l, slot); - nd_mapping->labels[l++] = nd_label; + dev_vdbg(&nsblk->common.dev, "assign label slot: %d\n", slot); + list_for_each_entry_from(label_ent, &nd_mapping->labels, list) { + if (label_ent->label) + continue; + label_ent->label = nd_label; + nd_label = NULL; + break; + } + if (nd_label) + dev_WARN(&nsblk->common.dev, + "failed to track label slot%d\n", slot); } - nd_mapping->labels[l] = NULL; + mutex_unlock(&nd_mapping->lock); out: kfree(old_res_list); @@ -788,32 +830,28 @@ static int __blk_label_update(struct nd_region *nd_region, static int init_labels(struct nd_mapping *nd_mapping, int num_labels) { - int i, l, old_num_labels = 0; + int i, old_num_labels = 0; + struct nd_label_ent *label_ent; struct nd_namespace_index *nsindex; - struct nd_namespace_label *nd_label; struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); - size_t size = (num_labels + 1) * sizeof(struct nd_namespace_label *); - for_each_label(l, nd_label, nd_mapping->labels) + mutex_lock(&nd_mapping->lock); + list_for_each_entry(label_ent, &nd_mapping->labels, list) old_num_labels++; + mutex_unlock(&nd_mapping->lock); /* * We need to preserve all the old labels for the mapping so * they can be garbage collected after writing the new labels. */ - if (num_labels > old_num_labels) { - struct nd_namespace_label **labels; - - labels = krealloc(nd_mapping->labels, size, GFP_KERNEL); - if (!labels) + for (i = old_num_labels; i < num_labels; i++) { + label_ent = kzalloc(sizeof(*label_ent), GFP_KERNEL); + if (!label_ent) return -ENOMEM; - nd_mapping->labels = labels; + mutex_lock(&nd_mapping->lock); + list_add_tail(&label_ent->list, &nd_mapping->labels); + mutex_unlock(&nd_mapping->lock); } - if (!nd_mapping->labels) - return -ENOMEM; - - for (i = old_num_labels; i <= num_labels; i++) - nd_mapping->labels[i] = NULL; if (ndd->ns_current == -1 || ndd->ns_next == -1) /* pass */; @@ -837,42 +875,45 @@ static int init_labels(struct nd_mapping *nd_mapping, int num_labels) static int del_labels(struct nd_mapping *nd_mapping, u8 *uuid) { struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); - struct nd_namespace_label *nd_label; + struct nd_label_ent *label_ent, *e; struct nd_namespace_index *nsindex; u8 label_uuid[NSLABEL_UUID_LEN]; - int l, num_freed = 0; unsigned long *free; + LIST_HEAD(list); u32 nslot, slot; + int active = 0; if (!uuid) return 0; /* no index || no labels == nothing to delete */ - if (!preamble_next(ndd, &nsindex, &free, &nslot) - || !nd_mapping->labels) + if (!preamble_next(ndd, &nsindex, &free, &nslot)) return 0; - for_each_label(l, nd_label, nd_mapping->labels) { + mutex_lock(&nd_mapping->lock); + list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list) { + struct nd_namespace_label *nd_label = label_ent->label; + + if (!nd_label) + continue; + active++; memcpy(label_uuid, nd_label->uuid, NSLABEL_UUID_LEN); if (memcmp(label_uuid, uuid, NSLABEL_UUID_LEN) != 0) continue; + active--; slot = to_slot(ndd, nd_label); nd_label_free_slot(ndd, slot); dev_dbg(ndd->dev, "%s: free: %d\n", __func__, slot); - del_label(nd_mapping, l); - num_freed++; - l--; /* retry with new label at this index */ + list_move_tail(&label_ent->list, &list); + label_ent->label = NULL; } + list_splice_tail_init(&list, &nd_mapping->labels); - if (num_freed > l) { - /* - * num_freed will only ever be > l when we delete the last - * label - */ - kfree(nd_mapping->labels); - nd_mapping->labels = NULL; - dev_dbg(ndd->dev, "%s: no more labels\n", __func__); + if (active == 0) { + nd_mapping_free_labels(nd_mapping); + dev_dbg(ndd->dev, "%s: no more active labels\n", __func__); } + mutex_unlock(&nd_mapping->lock); return nd_label_write_index(ndd, ndd->ns_next, nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0); @@ -885,7 +926,9 @@ int nd_pmem_namespace_label_update(struct nd_region *nd_region, for (i = 0; i < nd_region->ndr_mappings; i++) { struct nd_mapping *nd_mapping = &nd_region->mapping[i]; - int rc; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res; + int rc, count = 0; if (size == 0) { rc = del_labels(nd_mapping, nspm->uuid); @@ -894,7 +937,12 @@ int nd_pmem_namespace_label_update(struct nd_region *nd_region, continue; } - rc = init_labels(nd_mapping, 1); + for_each_dpa_resource(ndd, res) + if (strncmp(res->name, "pmem", 3) == 0) + count++; + WARN_ON_ONCE(!count); + + rc = init_labels(nd_mapping, count); if (rc < 0) return rc; diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index c5e3196c45b0..3509cff68ef9 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -12,8 +12,10 @@ */ #include <linux/module.h> #include <linux/device.h> +#include <linux/sort.h> #include <linux/slab.h> #include <linux/pmem.h> +#include <linux/list.h> #include <linux/nd.h> #include "nd-core.h" #include "nd.h" @@ -28,7 +30,10 @@ static void namespace_io_release(struct device *dev) static void namespace_pmem_release(struct device *dev) { struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + struct nd_region *nd_region = to_nd_region(dev->parent); + if (nspm->id >= 0) + ida_simple_remove(&nd_region->ns_ida, nspm->id); kfree(nspm->alt_name); kfree(nspm->uuid); kfree(nspm); @@ -62,17 +67,17 @@ static struct device_type namespace_blk_device_type = { .release = namespace_blk_release, }; -static bool is_namespace_pmem(struct device *dev) +static bool is_namespace_pmem(const struct device *dev) { return dev ? dev->type == &namespace_pmem_device_type : false; } -static bool is_namespace_blk(struct device *dev) +static bool is_namespace_blk(const struct device *dev) { return dev ? dev->type == &namespace_blk_device_type : false; } -static bool is_namespace_io(struct device *dev) +static bool is_namespace_io(const struct device *dev) { return dev ? dev->type == &namespace_io_device_type : false; } @@ -168,7 +173,21 @@ const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, suffix = "s"; if (is_namespace_pmem(&ndns->dev) || is_namespace_io(&ndns->dev)) { - sprintf(name, "pmem%d%s", nd_region->id, suffix ? suffix : ""); + int nsidx = 0; + + if (is_namespace_pmem(&ndns->dev)) { + struct nd_namespace_pmem *nspm; + + nspm = to_nd_namespace_pmem(&ndns->dev); + nsidx = nspm->id; + } + + if (nsidx) + sprintf(name, "pmem%d.%d%s", nd_region->id, nsidx, + suffix ? suffix : ""); + else + sprintf(name, "pmem%d%s", nd_region->id, + suffix ? suffix : ""); } else if (is_namespace_blk(&ndns->dev)) { struct nd_namespace_blk *nsblk; @@ -294,7 +313,7 @@ static bool __nd_namespace_blk_validate(struct nd_namespace_blk *nsblk) if (strcmp(res->name, label_id.id) != 0) continue; /* - * Resources with unacknoweldged adjustments indicate a + * Resources with unacknowledged adjustments indicate a * failure to update labels */ if (res->flags & DPA_RESOURCE_ADJUSTED) @@ -510,19 +529,68 @@ static resource_size_t init_dpa_allocation(struct nd_label_id *label_id, return rc ? n : 0; } -static bool space_valid(bool is_pmem, bool is_reserve, - struct nd_label_id *label_id, struct resource *res) + +/** + * space_valid() - validate free dpa space against constraints + * @nd_region: hosting region of the free space + * @ndd: dimm device data for debug + * @label_id: namespace id to allocate space + * @prev: potential allocation that precedes free space + * @next: allocation that follows the given free space range + * @exist: first allocation with same id in the mapping + * @n: range that must satisfied for pmem allocations + * @valid: free space range to validate + * + * BLK-space is valid as long as it does not precede a PMEM + * allocation in a given region. PMEM-space must be contiguous + * and adjacent to an existing existing allocation (if one + * exists). If reserving PMEM any space is valid. + */ +static void space_valid(struct nd_region *nd_region, struct nvdimm_drvdata *ndd, + struct nd_label_id *label_id, struct resource *prev, + struct resource *next, struct resource *exist, + resource_size_t n, struct resource *valid) { - /* - * For BLK-space any space is valid, for PMEM-space, it must be - * contiguous with an existing allocation unless we are - * reserving pmem. - */ - if (is_reserve || !is_pmem) - return true; - if (!res || strcmp(res->name, label_id->id) == 0) - return true; - return false; + bool is_reserve = strcmp(label_id->id, "pmem-reserve") == 0; + bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0; + + if (valid->start >= valid->end) + goto invalid; + + if (is_reserve) + return; + + if (!is_pmem) { + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct nvdimm_bus *nvdimm_bus; + struct blk_alloc_info info = { + .nd_mapping = nd_mapping, + .available = nd_mapping->size, + .res = valid, + }; + + WARN_ON(!is_nd_blk(&nd_region->dev)); + nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev); + device_for_each_child(&nvdimm_bus->dev, &info, alias_dpa_busy); + return; + } + + /* allocation needs to be contiguous, so this is all or nothing */ + if (resource_size(valid) < n) + goto invalid; + + /* we've got all the space we need and no existing allocation */ + if (!exist) + return; + + /* allocation needs to be contiguous with the existing namespace */ + if (valid->start == exist->end + 1 + || valid->end == exist->start - 1) + return; + + invalid: + /* truncate @valid size to 0 */ + valid->end = valid->start - 1; } enum alloc_loc { @@ -534,18 +602,24 @@ static resource_size_t scan_allocate(struct nd_region *nd_region, resource_size_t n) { resource_size_t mapping_end = nd_mapping->start + nd_mapping->size - 1; - bool is_reserve = strcmp(label_id->id, "pmem-reserve") == 0; bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0; struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res, *exist = NULL, valid; const resource_size_t to_allocate = n; - struct resource *res; int first; + for_each_dpa_resource(ndd, res) + if (strcmp(label_id->id, res->name) == 0) + exist = res; + + valid.start = nd_mapping->start; + valid.end = mapping_end; + valid.name = "free space"; retry: first = 0; for_each_dpa_resource(ndd, res) { - resource_size_t allocate, available = 0, free_start, free_end; struct resource *next = res->sibling, *new_res = NULL; + resource_size_t allocate, available = 0; enum alloc_loc loc = ALLOC_ERR; const char *action; int rc = 0; @@ -558,32 +632,35 @@ static resource_size_t scan_allocate(struct nd_region *nd_region, /* space at the beginning of the mapping */ if (!first++ && res->start > nd_mapping->start) { - free_start = nd_mapping->start; - available = res->start - free_start; - if (space_valid(is_pmem, is_reserve, label_id, NULL)) + valid.start = nd_mapping->start; + valid.end = res->start - 1; + space_valid(nd_region, ndd, label_id, NULL, next, exist, + to_allocate, &valid); + available = resource_size(&valid); + if (available) loc = ALLOC_BEFORE; } /* space between allocations */ if (!loc && next) { - free_start = res->start + resource_size(res); - free_end = min(mapping_end, next->start - 1); - if (space_valid(is_pmem, is_reserve, label_id, res) - && free_start < free_end) { - available = free_end + 1 - free_start; + valid.start = res->start + resource_size(res); + valid.end = min(mapping_end, next->start - 1); + space_valid(nd_region, ndd, label_id, res, next, exist, + to_allocate, &valid); + available = resource_size(&valid); + if (available) loc = ALLOC_MID; - } } /* space at the end of the mapping */ if (!loc && !next) { - free_start = res->start + resource_size(res); - free_end = mapping_end; - if (space_valid(is_pmem, is_reserve, label_id, res) - && free_start < free_end) { - available = free_end + 1 - free_start; + valid.start = res->start + resource_size(res); + valid.end = mapping_end; + space_valid(nd_region, ndd, label_id, res, next, exist, + to_allocate, &valid); + available = resource_size(&valid); + if (available) loc = ALLOC_AFTER; - } } if (!loc || !available) @@ -593,8 +670,6 @@ static resource_size_t scan_allocate(struct nd_region *nd_region, case ALLOC_BEFORE: if (strcmp(res->name, label_id->id) == 0) { /* adjust current resource up */ - if (is_pmem && !is_reserve) - return n; rc = adjust_resource(res, res->start - allocate, resource_size(res) + allocate); action = "cur grow up"; @@ -604,8 +679,6 @@ static resource_size_t scan_allocate(struct nd_region *nd_region, case ALLOC_MID: if (strcmp(next->name, label_id->id) == 0) { /* adjust next resource up */ - if (is_pmem && !is_reserve) - return n; rc = adjust_resource(next, next->start - allocate, resource_size(next) + allocate); @@ -629,12 +702,10 @@ static resource_size_t scan_allocate(struct nd_region *nd_region, if (strcmp(action, "allocate") == 0) { /* BLK allocate bottom up */ if (!is_pmem) - free_start += available - allocate; - else if (!is_reserve && free_start != nd_mapping->start) - return n; + valid.start += available - allocate; new_res = nvdimm_allocate_dpa(ndd, label_id, - free_start, allocate); + valid.start, allocate); if (!new_res) rc = -EBUSY; } else if (strcmp(action, "grow down") == 0) { @@ -832,13 +903,45 @@ static int grow_dpa_allocation(struct nd_region *nd_region, return 0; } -static void nd_namespace_pmem_set_size(struct nd_region *nd_region, +static void nd_namespace_pmem_set_resource(struct nd_region *nd_region, struct nd_namespace_pmem *nspm, resource_size_t size) { struct resource *res = &nspm->nsio.res; + resource_size_t offset = 0; - res->start = nd_region->ndr_start; - res->end = nd_region->ndr_start + size - 1; + if (size && !nspm->uuid) { + WARN_ON_ONCE(1); + size = 0; + } + + if (size && nspm->uuid) { + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_label_id label_id; + struct resource *res; + + if (!ndd) { + size = 0; + goto out; + } + + nd_label_gen_id(&label_id, nspm->uuid, 0); + + /* calculate a spa offset from the dpa allocation offset */ + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id.id) == 0) { + offset = (res->start - nd_mapping->start) + * nd_region->ndr_mappings; + goto out; + } + + WARN_ON_ONCE(1); + size = 0; + } + + out: + res->start = nd_region->ndr_start + offset; + res->end = res->start + size - 1; } static bool uuid_not_set(const u8 *uuid, struct device *dev, const char *where) @@ -929,7 +1032,7 @@ static ssize_t __size_store(struct device *dev, unsigned long long val) if (is_namespace_pmem(dev)) { struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); - nd_namespace_pmem_set_size(nd_region, nspm, + nd_namespace_pmem_set_resource(nd_region, nspm, val * nd_region->ndr_mappings); } else if (is_namespace_blk(dev)) { struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); @@ -1031,22 +1134,27 @@ static ssize_t size_show(struct device *dev, } static DEVICE_ATTR(size, S_IRUGO, size_show, size_store); -static ssize_t uuid_show(struct device *dev, - struct device_attribute *attr, char *buf) +static u8 *namespace_to_uuid(struct device *dev) { - u8 *uuid; - if (is_namespace_pmem(dev)) { struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); - uuid = nspm->uuid; + return nspm->uuid; } else if (is_namespace_blk(dev)) { struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); - uuid = nsblk->uuid; + return nsblk->uuid; } else - return -ENXIO; + return ERR_PTR(-ENXIO); +} +static ssize_t uuid_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u8 *uuid = namespace_to_uuid(dev); + + if (IS_ERR(uuid)) + return PTR_ERR(uuid); if (uuid) return sprintf(buf, "%pUb\n", uuid); return sprintf(buf, "\n"); @@ -1089,7 +1197,7 @@ static int namespace_update_uuid(struct nd_region *nd_region, * * FIXME: can we delete uuid with zero dpa allocated? */ - if (nd_mapping->labels) + if (list_empty(&nd_mapping->labels)) return -EBUSY; } @@ -1491,14 +1599,19 @@ static bool has_uuid_at_pos(struct nd_region *nd_region, u8 *uuid, for (i = 0; i < nd_region->ndr_mappings; i++) { struct nd_mapping *nd_mapping = &nd_region->mapping[i]; - struct nd_namespace_label *nd_label; + struct nd_label_ent *label_ent; bool found_uuid = false; - int l; - for_each_label(l, nd_label, nd_mapping->labels) { - u64 isetcookie = __le64_to_cpu(nd_label->isetcookie); - u16 position = __le16_to_cpu(nd_label->position); - u16 nlabel = __le16_to_cpu(nd_label->nlabel); + list_for_each_entry(label_ent, &nd_mapping->labels, list) { + struct nd_namespace_label *nd_label = label_ent->label; + u16 position, nlabel; + u64 isetcookie; + + if (!nd_label) + continue; + isetcookie = __le64_to_cpu(nd_label->isetcookie); + position = __le16_to_cpu(nd_label->position); + nlabel = __le16_to_cpu(nd_label->nlabel); if (isetcookie != cookie) continue; @@ -1528,7 +1641,6 @@ static bool has_uuid_at_pos(struct nd_region *nd_region, u8 *uuid, static int select_pmem_id(struct nd_region *nd_region, u8 *pmem_id) { - struct nd_namespace_label *select = NULL; int i; if (!pmem_id) @@ -1536,90 +1648,106 @@ static int select_pmem_id(struct nd_region *nd_region, u8 *pmem_id) for (i = 0; i < nd_region->ndr_mappings; i++) { struct nd_mapping *nd_mapping = &nd_region->mapping[i]; - struct nd_namespace_label *nd_label; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_namespace_label *nd_label = NULL; u64 hw_start, hw_end, pmem_start, pmem_end; - int l; + struct nd_label_ent *label_ent; - for_each_label(l, nd_label, nd_mapping->labels) + WARN_ON(!mutex_is_locked(&nd_mapping->lock)); + list_for_each_entry(label_ent, &nd_mapping->labels, list) { + nd_label = label_ent->label; + if (!nd_label) + continue; if (memcmp(nd_label->uuid, pmem_id, NSLABEL_UUID_LEN) == 0) break; + nd_label = NULL; + } if (!nd_label) { WARN_ON(1); return -EINVAL; } - select = nd_label; /* * Check that this label is compliant with the dpa * range published in NFIT */ hw_start = nd_mapping->start; hw_end = hw_start + nd_mapping->size; - pmem_start = __le64_to_cpu(select->dpa); - pmem_end = pmem_start + __le64_to_cpu(select->rawsize); - if (pmem_start == hw_start && pmem_end <= hw_end) + pmem_start = __le64_to_cpu(nd_label->dpa); + pmem_end = pmem_start + __le64_to_cpu(nd_label->rawsize); + if (pmem_start >= hw_start && pmem_start < hw_end + && pmem_end <= hw_end && pmem_end > hw_start) /* pass */; - else + else { + dev_dbg(&nd_region->dev, "%s invalid label for %pUb\n", + dev_name(ndd->dev), nd_label->uuid); return -EINVAL; + } - nd_mapping->labels[0] = select; - nd_mapping->labels[1] = NULL; + /* move recently validated label to the front of the list */ + list_move(&label_ent->list, &nd_mapping->labels); } return 0; } /** - * find_pmem_label_set - validate interleave set labelling, retrieve label0 + * create_namespace_pmem - validate interleave set labelling, retrieve label0 * @nd_region: region with mappings to validate + * @nspm: target namespace to create + * @nd_label: target pmem namespace label to evaluate */ -static int find_pmem_label_set(struct nd_region *nd_region, - struct nd_namespace_pmem *nspm) +struct device *create_namespace_pmem(struct nd_region *nd_region, + struct nd_namespace_label *nd_label) { u64 cookie = nd_region_interleave_set_cookie(nd_region); - struct nd_namespace_label *nd_label; - u8 select_id[NSLABEL_UUID_LEN]; + struct nd_label_ent *label_ent; + struct nd_namespace_pmem *nspm; + struct nd_mapping *nd_mapping; resource_size_t size = 0; - u8 *pmem_id = NULL; - int rc = -ENODEV, l; + struct resource *res; + struct device *dev; + int rc = 0; u16 i; - if (cookie == 0) - return -ENXIO; + if (cookie == 0) { + dev_dbg(&nd_region->dev, "invalid interleave-set-cookie\n"); + return ERR_PTR(-ENXIO); + } - /* - * Find a complete set of labels by uuid. By definition we can start - * with any mapping as the reference label - */ - for_each_label(l, nd_label, nd_region->mapping[0].labels) { - u64 isetcookie = __le64_to_cpu(nd_label->isetcookie); + if (__le64_to_cpu(nd_label->isetcookie) != cookie) { + dev_dbg(&nd_region->dev, "invalid cookie in label: %pUb\n", + nd_label->uuid); + return ERR_PTR(-EAGAIN); + } - if (isetcookie != cookie) - continue; + nspm = kzalloc(sizeof(*nspm), GFP_KERNEL); + if (!nspm) + return ERR_PTR(-ENOMEM); - for (i = 0; nd_region->ndr_mappings; i++) - if (!has_uuid_at_pos(nd_region, nd_label->uuid, - cookie, i)) - break; - if (i < nd_region->ndr_mappings) { - /* - * Give up if we don't find an instance of a - * uuid at each position (from 0 to - * nd_region->ndr_mappings - 1), or if we find a - * dimm with two instances of the same uuid. - */ - rc = -EINVAL; - goto err; - } else if (pmem_id) { - /* - * If there is more than one valid uuid set, we - * need userspace to clean this up. - */ - rc = -EBUSY; - goto err; - } - memcpy(select_id, nd_label->uuid, NSLABEL_UUID_LEN); - pmem_id = select_id; + nspm->id = -1; + dev = &nspm->nsio.common.dev; + dev->type = &namespace_pmem_device_type; + dev->parent = &nd_region->dev; + res = &nspm->nsio.res; + res->name = dev_name(&nd_region->dev); + res->flags = IORESOURCE_MEM; + + for (i = 0; i < nd_region->ndr_mappings; i++) + if (!has_uuid_at_pos(nd_region, nd_label->uuid, cookie, i)) + break; + if (i < nd_region->ndr_mappings) { + struct nvdimm_drvdata *ndd = to_ndd(&nd_region->mapping[i]); + + /* + * Give up if we don't find an instance of a uuid at each + * position (from 0 to nd_region->ndr_mappings - 1), or if we + * find a dimm with two instances of the same uuid. + */ + dev_err(&nd_region->dev, "%s missing label for %pUb\n", + dev_name(ndd->dev), nd_label->uuid); + rc = -EINVAL; + goto err; } /* @@ -1630,14 +1758,23 @@ static int find_pmem_label_set(struct nd_region *nd_region, * the dimm being enabled (i.e. nd_label_reserve_dpa() * succeeded). */ - rc = select_pmem_id(nd_region, pmem_id); + rc = select_pmem_id(nd_region, nd_label->uuid); if (rc) goto err; /* Calculate total size and populate namespace properties from label0 */ for (i = 0; i < nd_region->ndr_mappings; i++) { - struct nd_mapping *nd_mapping = &nd_region->mapping[i]; - struct nd_namespace_label *label0 = nd_mapping->labels[0]; + struct nd_namespace_label *label0; + + nd_mapping = &nd_region->mapping[i]; + label_ent = list_first_entry_or_null(&nd_mapping->labels, + typeof(*label_ent), list); + label0 = label_ent ? label_ent->label : 0; + + if (!label0) { + WARN_ON(1); + continue; + } size += __le64_to_cpu(label0->rawsize); if (__le16_to_cpu(label0->position) != 0) @@ -1654,10 +1791,11 @@ static int find_pmem_label_set(struct nd_region *nd_region, goto err; } - nd_namespace_pmem_set_size(nd_region, nspm, size); + nd_namespace_pmem_set_resource(nd_region, nspm, size); - return 0; + return dev; err: + namespace_pmem_release(dev); switch (rc) { case -EINVAL: dev_dbg(&nd_region->dev, "%s: invalid label(s)\n", __func__); @@ -1670,55 +1808,7 @@ static int find_pmem_label_set(struct nd_region *nd_region, __func__, rc); break; } - return rc; -} - -static struct device **create_namespace_pmem(struct nd_region *nd_region) -{ - struct nd_namespace_pmem *nspm; - struct device *dev, **devs; - struct resource *res; - int rc; - - nspm = kzalloc(sizeof(*nspm), GFP_KERNEL); - if (!nspm) - return NULL; - - dev = &nspm->nsio.common.dev; - dev->type = &namespace_pmem_device_type; - dev->parent = &nd_region->dev; - res = &nspm->nsio.res; - res->name = dev_name(&nd_region->dev); - res->flags = IORESOURCE_MEM; - rc = find_pmem_label_set(nd_region, nspm); - if (rc == -ENODEV) { - int i; - - /* Pass, try to permit namespace creation... */ - for (i = 0; i < nd_region->ndr_mappings; i++) { - struct nd_mapping *nd_mapping = &nd_region->mapping[i]; - - kfree(nd_mapping->labels); - nd_mapping->labels = NULL; - } - - /* Publish a zero-sized namespace for userspace to configure. */ - nd_namespace_pmem_set_size(nd_region, nspm, 0); - - rc = 0; - } else if (rc) - goto err; - - devs = kcalloc(2, sizeof(struct device *), GFP_KERNEL); - if (!devs) - goto err; - - devs[0] = dev; - return devs; - - err: - namespace_pmem_release(&nspm->nsio.common.dev); - return NULL; + return ERR_PTR(rc); } struct resource *nsblk_add_resource(struct nd_region *nd_region, @@ -1770,16 +1860,58 @@ static struct device *nd_namespace_blk_create(struct nd_region *nd_region) return &nsblk->common.dev; } -void nd_region_create_blk_seed(struct nd_region *nd_region) +static struct device *nd_namespace_pmem_create(struct nd_region *nd_region) +{ + struct nd_namespace_pmem *nspm; + struct resource *res; + struct device *dev; + + if (!is_nd_pmem(&nd_region->dev)) + return NULL; + + nspm = kzalloc(sizeof(*nspm), GFP_KERNEL); + if (!nspm) + return NULL; + + dev = &nspm->nsio.common.dev; + dev->type = &namespace_pmem_device_type; + dev->parent = &nd_region->dev; + res = &nspm->nsio.res; + res->name = dev_name(&nd_region->dev); + res->flags = IORESOURCE_MEM; + + nspm->id = ida_simple_get(&nd_region->ns_ida, 0, 0, GFP_KERNEL); + if (nspm->id < 0) { + kfree(nspm); + return NULL; + } + dev_set_name(dev, "namespace%d.%d", nd_region->id, nspm->id); + dev->parent = &nd_region->dev; + dev->groups = nd_namespace_attribute_groups; + nd_namespace_pmem_set_resource(nd_region, nspm, 0); + + return dev; +} + +void nd_region_create_ns_seed(struct nd_region *nd_region) { WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev)); - nd_region->ns_seed = nd_namespace_blk_create(nd_region); + + if (nd_region_to_nstype(nd_region) == ND_DEVICE_NAMESPACE_IO) + return; + + if (is_nd_blk(&nd_region->dev)) + nd_region->ns_seed = nd_namespace_blk_create(nd_region); + else + nd_region->ns_seed = nd_namespace_pmem_create(nd_region); + /* * Seed creation failures are not fatal, provisioning is simply * disabled until memory becomes available */ if (!nd_region->ns_seed) - dev_err(&nd_region->dev, "failed to create blk namespace\n"); + dev_err(&nd_region->dev, "failed to create %s namespace\n", + is_nd_blk(&nd_region->dev) ? "blk" : "pmem"); else nd_device_register(nd_region->ns_seed); } @@ -1820,43 +1952,137 @@ void nd_region_create_btt_seed(struct nd_region *nd_region) dev_err(&nd_region->dev, "failed to create btt namespace\n"); } -static struct device **create_namespace_blk(struct nd_region *nd_region) +static int add_namespace_resource(struct nd_region *nd_region, + struct nd_namespace_label *nd_label, struct device **devs, + int count) { struct nd_mapping *nd_mapping = &nd_region->mapping[0]; - struct nd_namespace_label *nd_label; - struct device *dev, **devs = NULL; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + int i; + + for (i = 0; i < count; i++) { + u8 *uuid = namespace_to_uuid(devs[i]); + struct resource *res; + + if (IS_ERR_OR_NULL(uuid)) { + WARN_ON(1); + continue; + } + + if (memcmp(uuid, nd_label->uuid, NSLABEL_UUID_LEN) != 0) + continue; + if (is_namespace_blk(devs[i])) { + res = nsblk_add_resource(nd_region, ndd, + to_nd_namespace_blk(devs[i]), + __le64_to_cpu(nd_label->dpa)); + if (!res) + return -ENXIO; + nd_dbg_dpa(nd_region, ndd, res, "%d assign\n", count); + } else { + dev_err(&nd_region->dev, + "error: conflicting extents for uuid: %pUb\n", + nd_label->uuid); + return -ENXIO; + } + break; + } + + return i; +} + +struct device *create_namespace_blk(struct nd_region *nd_region, + struct nd_namespace_label *nd_label, int count) +{ + + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); struct nd_namespace_blk *nsblk; - struct nvdimm_drvdata *ndd; - int i, l, count = 0; + char *name[NSLABEL_NAME_LEN]; + struct device *dev = NULL; struct resource *res; - if (nd_region->ndr_mappings == 0) - return NULL; + nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL); + if (!nsblk) + return ERR_PTR(-ENOMEM); + dev = &nsblk->common.dev; + dev->type = &namespace_blk_device_type; + dev->parent = &nd_region->dev; + nsblk->id = -1; + nsblk->lbasize = __le64_to_cpu(nd_label->lbasize); + nsblk->uuid = kmemdup(nd_label->uuid, NSLABEL_UUID_LEN, + GFP_KERNEL); + if (!nsblk->uuid) + goto blk_err; + memcpy(name, nd_label->name, NSLABEL_NAME_LEN); + if (name[0]) + nsblk->alt_name = kmemdup(name, NSLABEL_NAME_LEN, + GFP_KERNEL); + res = nsblk_add_resource(nd_region, ndd, nsblk, + __le64_to_cpu(nd_label->dpa)); + if (!res) + goto blk_err; + nd_dbg_dpa(nd_region, ndd, res, "%d: assign\n", count); + return dev; + blk_err: + namespace_blk_release(dev); + return ERR_PTR(-ENXIO); +} + +static int cmp_dpa(const void *a, const void *b) +{ + const struct device *dev_a = *(const struct device **) a; + const struct device *dev_b = *(const struct device **) b; + struct nd_namespace_blk *nsblk_a, *nsblk_b; + struct nd_namespace_pmem *nspm_a, *nspm_b; + + if (is_namespace_io(dev_a)) + return 0; + + if (is_namespace_blk(dev_a)) { + nsblk_a = to_nd_namespace_blk(dev_a); + nsblk_b = to_nd_namespace_blk(dev_b); + + return memcmp(&nsblk_a->res[0]->start, &nsblk_b->res[0]->start, + sizeof(resource_size_t)); + } + + nspm_a = to_nd_namespace_pmem(dev_a); + nspm_b = to_nd_namespace_pmem(dev_b); + + return memcmp(&nspm_a->nsio.res.start, &nspm_b->nsio.res.start, + sizeof(resource_size_t)); +} - ndd = to_ndd(nd_mapping); - for_each_label(l, nd_label, nd_mapping->labels) { - u32 flags = __le32_to_cpu(nd_label->flags); - char *name[NSLABEL_NAME_LEN]; +static struct device **scan_labels(struct nd_region *nd_region) +{ + int i, count = 0; + struct device *dev, **devs = NULL; + struct nd_label_ent *label_ent, *e; + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + resource_size_t map_end = nd_mapping->start + nd_mapping->size - 1; + + /* "safe" because create_namespace_pmem() might list_move() label_ent */ + list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list) { + struct nd_namespace_label *nd_label = label_ent->label; struct device **__devs; + u32 flags; - if (flags & NSLABEL_FLAG_LOCAL) - /* pass */; + if (!nd_label) + continue; + flags = __le32_to_cpu(nd_label->flags); + if (is_nd_blk(&nd_region->dev) + == !!(flags & NSLABEL_FLAG_LOCAL)) + /* pass, region matches label type */; else continue; - for (i = 0; i < count; i++) { - nsblk = to_nd_namespace_blk(devs[i]); - if (memcmp(nsblk->uuid, nd_label->uuid, - NSLABEL_UUID_LEN) == 0) { - res = nsblk_add_resource(nd_region, ndd, nsblk, - __le64_to_cpu(nd_label->dpa)); - if (!res) - goto err; - nd_dbg_dpa(nd_region, ndd, res, "%s assign\n", - dev_name(&nsblk->common.dev)); - break; - } - } + /* skip labels that describe extents outside of the region */ + if (nd_label->dpa < nd_mapping->start || nd_label->dpa > map_end) + continue; + + i = add_namespace_resource(nd_region, nd_label, devs, count); + if (i < 0) + goto err; if (i < count) continue; __devs = kcalloc(count + 2, sizeof(dev), GFP_KERNEL); @@ -1866,67 +2092,126 @@ static struct device **create_namespace_blk(struct nd_region *nd_region) kfree(devs); devs = __devs; - nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL); - if (!nsblk) - goto err; - dev = &nsblk->common.dev; - dev->type = &namespace_blk_device_type; - dev->parent = &nd_region->dev; - dev_set_name(dev, "namespace%d.%d", nd_region->id, count); - devs[count++] = dev; - nsblk->id = -1; - nsblk->lbasize = __le64_to_cpu(nd_label->lbasize); - nsblk->uuid = kmemdup(nd_label->uuid, NSLABEL_UUID_LEN, - GFP_KERNEL); - if (!nsblk->uuid) - goto err; - memcpy(name, nd_label->name, NSLABEL_NAME_LEN); - if (name[0]) - nsblk->alt_name = kmemdup(name, NSLABEL_NAME_LEN, - GFP_KERNEL); - res = nsblk_add_resource(nd_region, ndd, nsblk, - __le64_to_cpu(nd_label->dpa)); - if (!res) - goto err; - nd_dbg_dpa(nd_region, ndd, res, "%s assign\n", - dev_name(&nsblk->common.dev)); + if (is_nd_blk(&nd_region->dev)) { + dev = create_namespace_blk(nd_region, nd_label, count); + if (IS_ERR(dev)) + goto err; + devs[count++] = dev; + } else { + dev = create_namespace_pmem(nd_region, nd_label); + if (IS_ERR(dev)) { + switch (PTR_ERR(dev)) { + case -EAGAIN: + /* skip invalid labels */ + continue; + case -ENODEV: + /* fallthrough to seed creation */ + break; + default: + goto err; + } + } else + devs[count++] = dev; + } } - dev_dbg(&nd_region->dev, "%s: discovered %d blk namespace%s\n", - __func__, count, count == 1 ? "" : "s"); + dev_dbg(&nd_region->dev, "%s: discovered %d %s namespace%s\n", + __func__, count, is_nd_blk(&nd_region->dev) + ? "blk" : "pmem", count == 1 ? "" : "s"); if (count == 0) { /* Publish a zero-sized namespace for userspace to configure. */ - for (i = 0; i < nd_region->ndr_mappings; i++) { - struct nd_mapping *nd_mapping = &nd_region->mapping[i]; - - kfree(nd_mapping->labels); - nd_mapping->labels = NULL; - } + nd_mapping_free_labels(nd_mapping); devs = kcalloc(2, sizeof(dev), GFP_KERNEL); if (!devs) goto err; - nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL); - if (!nsblk) - goto err; - dev = &nsblk->common.dev; - dev->type = &namespace_blk_device_type; + if (is_nd_blk(&nd_region->dev)) { + struct nd_namespace_blk *nsblk; + + nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL); + if (!nsblk) + goto err; + dev = &nsblk->common.dev; + dev->type = &namespace_blk_device_type; + } else { + struct nd_namespace_pmem *nspm; + + nspm = kzalloc(sizeof(*nspm), GFP_KERNEL); + if (!nspm) + goto err; + dev = &nspm->nsio.common.dev; + dev->type = &namespace_pmem_device_type; + nd_namespace_pmem_set_resource(nd_region, nspm, 0); + } dev->parent = &nd_region->dev; devs[count++] = dev; + } else if (is_nd_pmem(&nd_region->dev)) { + /* clean unselected labels */ + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct list_head *l, *e; + LIST_HEAD(list); + int j; + + nd_mapping = &nd_region->mapping[i]; + if (list_empty(&nd_mapping->labels)) { + WARN_ON(1); + continue; + } + + j = count; + list_for_each_safe(l, e, &nd_mapping->labels) { + if (!j--) + break; + list_move_tail(l, &list); + } + nd_mapping_free_labels(nd_mapping); + list_splice_init(&list, &nd_mapping->labels); + } } + if (count > 1) + sort(devs, count, sizeof(struct device *), cmp_dpa, NULL); + return devs; -err: - for (i = 0; i < count; i++) { - nsblk = to_nd_namespace_blk(devs[i]); - namespace_blk_release(&nsblk->common.dev); - } + err: + for (i = 0; devs[i]; i++) + if (is_nd_blk(&nd_region->dev)) + namespace_blk_release(devs[i]); + else + namespace_pmem_release(devs[i]); kfree(devs); return NULL; } +static struct device **create_namespaces(struct nd_region *nd_region) +{ + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct device **devs; + int i; + + if (nd_region->ndr_mappings == 0) + return NULL; + + /* lock down all mappings while we scan labels */ + for (i = 0; i < nd_region->ndr_mappings; i++) { + nd_mapping = &nd_region->mapping[i]; + mutex_lock_nested(&nd_mapping->lock, i); + } + + devs = scan_labels(nd_region); + + for (i = 0; i < nd_region->ndr_mappings; i++) { + int reverse = nd_region->ndr_mappings - 1 - i; + + nd_mapping = &nd_region->mapping[reverse]; + mutex_unlock(&nd_mapping->lock); + } + + return devs; +} + static int init_active_labels(struct nd_region *nd_region) { int i; @@ -1935,6 +2220,7 @@ static int init_active_labels(struct nd_region *nd_region) struct nd_mapping *nd_mapping = &nd_region->mapping[i]; struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); struct nvdimm *nvdimm = nd_mapping->nvdimm; + struct nd_label_ent *label_ent; int count, j; /* @@ -1956,16 +2242,27 @@ static int init_active_labels(struct nd_region *nd_region) dev_dbg(ndd->dev, "%s: %d\n", __func__, count); if (!count) continue; - nd_mapping->labels = kcalloc(count + 1, sizeof(void *), - GFP_KERNEL); - if (!nd_mapping->labels) - return -ENOMEM; for (j = 0; j < count; j++) { struct nd_namespace_label *label; + label_ent = kzalloc(sizeof(*label_ent), GFP_KERNEL); + if (!label_ent) + break; label = nd_label_active(ndd, j); - nd_mapping->labels[j] = label; + label_ent->label = label; + + mutex_lock(&nd_mapping->lock); + list_add_tail(&label_ent->list, &nd_mapping->labels); + mutex_unlock(&nd_mapping->lock); } + + if (j >= count) + continue; + + mutex_lock(&nd_mapping->lock); + nd_mapping_free_labels(nd_mapping); + mutex_unlock(&nd_mapping->lock); + return -ENOMEM; } return 0; @@ -1990,10 +2287,8 @@ int nd_region_register_namespaces(struct nd_region *nd_region, int *err) devs = create_namespace_io(nd_region); break; case ND_DEVICE_NAMESPACE_PMEM: - devs = create_namespace_pmem(nd_region); - break; case ND_DEVICE_NAMESPACE_BLK: - devs = create_namespace_blk(nd_region); + devs = create_namespaces(nd_region); break; default: break; @@ -2014,6 +2309,13 @@ int nd_region_register_namespaces(struct nd_region *nd_region, int *err) id = ida_simple_get(&nd_region->ns_ida, 0, 0, GFP_KERNEL); nsblk->id = id; + } else if (type == ND_DEVICE_NAMESPACE_PMEM) { + struct nd_namespace_pmem *nspm; + + nspm = to_nd_namespace_pmem(dev); + id = ida_simple_get(&nd_region->ns_ida, 0, 0, + GFP_KERNEL); + nspm->id = id; } else id = i; diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 38ce6bbbc170..8623e57c2ce3 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -44,6 +44,23 @@ struct nvdimm { struct resource *flush_wpq; }; +/** + * struct blk_alloc_info - tracking info for BLK dpa scanning + * @nd_mapping: blk region mapping boundaries + * @available: decremented in alias_dpa_busy as aliased PMEM is scanned + * @busy: decremented in blk_dpa_busy to account for ranges already + * handled by alias_dpa_busy + * @res: alias_dpa_busy interprets this a free space range that needs to + * be truncated to the valid BLK allocation starting DPA, blk_dpa_busy + * treats it as a busy range that needs the aliased PMEM ranges + * truncated. + */ +struct blk_alloc_info { + struct nd_mapping *nd_mapping; + resource_size_t available, busy; + struct resource *res; +}; + bool is_nvdimm(struct device *dev); bool is_nd_pmem(struct device *dev); bool is_nd_blk(struct device *dev); @@ -54,7 +71,7 @@ void nvdimm_devs_exit(void); void nd_region_devs_exit(void); void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev); struct nd_region; -void nd_region_create_blk_seed(struct nd_region *nd_region); +void nd_region_create_ns_seed(struct nd_region *nd_region); void nd_region_create_btt_seed(struct nd_region *nd_region); void nd_region_create_pfn_seed(struct nd_region *nd_region); void nd_region_create_dax_seed(struct nd_region *nd_region); @@ -73,13 +90,14 @@ bool nd_is_uuid_unique(struct device *dev, u8 *uuid); struct nd_region; struct nvdimm_drvdata; struct nd_mapping; +void nd_mapping_free_labels(struct nd_mapping *nd_mapping); resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, struct nd_mapping *nd_mapping, resource_size_t *overlap); -resource_size_t nd_blk_available_dpa(struct nd_mapping *nd_mapping); +resource_size_t nd_blk_available_dpa(struct nd_region *nd_region); resource_size_t nd_region_available_dpa(struct nd_region *nd_region); resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd, struct nd_label_id *label_id); -struct nd_mapping; +int alias_dpa_busy(struct device *dev, void *data); struct resource *nsblk_add_resource(struct nd_region *nd_region, struct nvdimm_drvdata *ndd, struct nd_namespace_blk *nsblk, resource_size_t start); diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 0b78a8211f4a..d3b2fca8deec 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -101,9 +101,6 @@ static inline struct nd_namespace_index *to_next_namespace_index( (unsigned long long) (res ? resource_size(res) : 0), \ (unsigned long long) (res ? res->start : 0), ##arg) -#define for_each_label(l, label, labels) \ - for (l = 0; (label = labels ? labels[l] : NULL); l++) - #define for_each_dpa_resource(ndd, res) \ for (res = (ndd)->dpa.child; res; res = res->sibling) @@ -116,6 +113,31 @@ struct nd_percpu_lane { spinlock_t lock; }; +struct nd_label_ent { + struct list_head list; + struct nd_namespace_label *label; +}; + +enum nd_mapping_lock_class { + ND_MAPPING_CLASS0, + ND_MAPPING_UUID_SCAN, +}; + +struct nd_mapping { + struct nvdimm *nvdimm; + u64 start; + u64 size; + struct list_head labels; + struct mutex lock; + /* + * @ndd is for private use at region enable / disable time for + * get_ndd() + put_ndd(), all other nd_mapping to ndd + * conversions use to_ndd() which respects enabled state of the + * nvdimm. + */ + struct nvdimm_drvdata *ndd; +}; + struct nd_region { struct device dev; struct ida ns_ida; @@ -209,6 +231,7 @@ void nvdimm_exit(void); void nd_region_exit(void); struct nvdimm; struct nvdimm_drvdata *to_ndd(struct nd_mapping *nd_mapping); +int nvdimm_check_config_data(struct device *dev); int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd); int nvdimm_init_config_data(struct nvdimm_drvdata *ndd); int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset, diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 571a6c7ee2fc..42b3a8217073 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -66,13 +66,32 @@ static void pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset, invalidate_pmem(pmem->virt_addr + offset, len); } +static void write_pmem(void *pmem_addr, struct page *page, + unsigned int off, unsigned int len) +{ + void *mem = kmap_atomic(page); + + memcpy_to_pmem(pmem_addr, mem + off, len); + kunmap_atomic(mem); +} + +static int read_pmem(struct page *page, unsigned int off, + void *pmem_addr, unsigned int len) +{ + int rc; + void *mem = kmap_atomic(page); + + rc = memcpy_from_pmem(mem + off, pmem_addr, len); + kunmap_atomic(mem); + return rc; +} + static int pmem_do_bvec(struct pmem_device *pmem, struct page *page, unsigned int len, unsigned int off, bool is_write, sector_t sector) { int rc = 0; bool bad_pmem = false; - void *mem = kmap_atomic(page); phys_addr_t pmem_off = sector * 512 + pmem->data_offset; void *pmem_addr = pmem->virt_addr + pmem_off; @@ -83,7 +102,7 @@ static int pmem_do_bvec(struct pmem_device *pmem, struct page *page, if (unlikely(bad_pmem)) rc = -EIO; else { - rc = memcpy_from_pmem(mem + off, pmem_addr, len); + rc = read_pmem(page, off, pmem_addr, len); flush_dcache_page(page); } } else { @@ -102,14 +121,13 @@ static int pmem_do_bvec(struct pmem_device *pmem, struct page *page, * after clear poison. */ flush_dcache_page(page); - memcpy_to_pmem(pmem_addr, mem + off, len); + write_pmem(pmem_addr, page, off, len); if (unlikely(bad_pmem)) { pmem_clear_poison(pmem, pmem_off, len); - memcpy_to_pmem(pmem_addr, mem + off, len); + write_pmem(pmem_addr, page, off, len); } } - kunmap_atomic(mem); return rc; } diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 4c0ac4abb629..6af5e629140c 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -70,7 +70,7 @@ static int nvdimm_map_flush(struct device *dev, struct nvdimm *nvdimm, int dimm, int nd_region_activate(struct nd_region *nd_region) { - int i, num_flush = 0; + int i, j, num_flush = 0; struct nd_region_data *ndrd; struct device *dev = &nd_region->dev; size_t flush_data_size = sizeof(void *); @@ -107,6 +107,21 @@ int nd_region_activate(struct nd_region *nd_region) return rc; } + /* + * Clear out entries that are duplicates. This should prevent the + * extra flushings. + */ + for (i = 0; i < nd_region->ndr_mappings - 1; i++) { + /* ignore if NULL already */ + if (!ndrd_get_flush_wpq(ndrd, i, 0)) + continue; + + for (j = i + 1; j < nd_region->ndr_mappings; j++) + if (ndrd_get_flush_wpq(ndrd, i, 0) == + ndrd_get_flush_wpq(ndrd, j, 0)) + ndrd_set_flush_wpq(ndrd, j, 0, NULL); + } + return 0; } @@ -298,9 +313,8 @@ resource_size_t nd_region_available_dpa(struct nd_region *nd_region) blk_max_overlap = overlap; goto retry; } - } else if (is_nd_blk(&nd_region->dev)) { - available += nd_blk_available_dpa(nd_mapping); - } + } else if (is_nd_blk(&nd_region->dev)) + available += nd_blk_available_dpa(nd_region); } return available; @@ -491,6 +505,17 @@ u64 nd_region_interleave_set_cookie(struct nd_region *nd_region) return 0; } +void nd_mapping_free_labels(struct nd_mapping *nd_mapping) +{ + struct nd_label_ent *label_ent, *e; + + WARN_ON(!mutex_is_locked(&nd_mapping->lock)); + list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list) { + list_del(&label_ent->list); + kfree(label_ent); + } +} + /* * Upon successful probe/remove, take/release a reference on the * associated interleave set (if present), and plant new btt + namespace @@ -511,8 +536,10 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus, struct nvdimm_drvdata *ndd = nd_mapping->ndd; struct nvdimm *nvdimm = nd_mapping->nvdimm; - kfree(nd_mapping->labels); - nd_mapping->labels = NULL; + mutex_lock(&nd_mapping->lock); + nd_mapping_free_labels(nd_mapping); + mutex_unlock(&nd_mapping->lock); + put_ndd(ndd); nd_mapping->ndd = NULL; if (ndd) @@ -522,11 +549,12 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus, if (is_nd_pmem(dev)) return; } - if (dev->parent && is_nd_blk(dev->parent) && probe) { + if (dev->parent && (is_nd_blk(dev->parent) || is_nd_pmem(dev->parent)) + && probe) { nd_region = to_nd_region(dev->parent); nvdimm_bus_lock(dev); if (nd_region->ns_seed == dev) - nd_region_create_blk_seed(nd_region); + nd_region_create_ns_seed(nd_region); nvdimm_bus_unlock(dev); } if (is_nd_btt(dev) && probe) { @@ -536,23 +564,30 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus, nvdimm_bus_lock(dev); if (nd_region->btt_seed == dev) nd_region_create_btt_seed(nd_region); - if (nd_region->ns_seed == &nd_btt->ndns->dev && - is_nd_blk(dev->parent)) - nd_region_create_blk_seed(nd_region); + if (nd_region->ns_seed == &nd_btt->ndns->dev) + nd_region_create_ns_seed(nd_region); nvdimm_bus_unlock(dev); } if (is_nd_pfn(dev) && probe) { + struct nd_pfn *nd_pfn = to_nd_pfn(dev); + nd_region = to_nd_region(dev->parent); nvdimm_bus_lock(dev); if (nd_region->pfn_seed == dev) nd_region_create_pfn_seed(nd_region); + if (nd_region->ns_seed == &nd_pfn->ndns->dev) + nd_region_create_ns_seed(nd_region); nvdimm_bus_unlock(dev); } if (is_nd_dax(dev) && probe) { + struct nd_dax *nd_dax = to_nd_dax(dev); + nd_region = to_nd_region(dev->parent); nvdimm_bus_lock(dev); if (nd_region->dax_seed == dev) nd_region_create_dax_seed(nd_region); + if (nd_region->ns_seed == &nd_dax->nd_pfn.ndns->dev) + nd_region_create_ns_seed(nd_region); nvdimm_bus_unlock(dev); } } @@ -759,10 +794,10 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, int ro = 0; for (i = 0; i < ndr_desc->num_mappings; i++) { - struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i]; - struct nvdimm *nvdimm = nd_mapping->nvdimm; + struct nd_mapping_desc *mapping = &ndr_desc->mapping[i]; + struct nvdimm *nvdimm = mapping->nvdimm; - if ((nd_mapping->start | nd_mapping->size) % SZ_4K) { + if ((mapping->start | mapping->size) % SZ_4K) { dev_err(&nvdimm_bus->dev, "%s: %s mapping%d is not 4K aligned\n", caller, dev_name(&nvdimm->dev), i); @@ -813,11 +848,15 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, ndl->count = 0; } - memcpy(nd_region->mapping, ndr_desc->nd_mapping, - sizeof(struct nd_mapping) * ndr_desc->num_mappings); for (i = 0; i < ndr_desc->num_mappings; i++) { - struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i]; - struct nvdimm *nvdimm = nd_mapping->nvdimm; + struct nd_mapping_desc *mapping = &ndr_desc->mapping[i]; + struct nvdimm *nvdimm = mapping->nvdimm; + + nd_region->mapping[i].nvdimm = nvdimm; + nd_region->mapping[i].start = mapping->start; + nd_region->mapping[i].size = mapping->size; + INIT_LIST_HEAD(&nd_region->mapping[i].labels); + mutex_init(&nd_region->mapping[i].lock); get_device(&nvdimm->dev); } |