From e7638488434415aa478e78435cac8f0365737638 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 16 May 2018 11:46:08 -0700 Subject: mm: introduce MEMORY_DEVICE_FS_DAX and CONFIG_DEV_PAGEMAP_OPS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for fixing dax-dma-vs-unmap issues, filesystems need to be able to rely on the fact that they will get wakeups on dev_pagemap page-idle events. Introduce MEMORY_DEVICE_FS_DAX and generic_dax_page_free() as common indicator / infrastructure for dax filesytems to require. With this change there are no users of the MEMORY_DEVICE_HOST designation, so remove it. The HMM sub-system extended dev_pagemap to arrange a callback when a dev_pagemap managed page is freed. Since a dev_pagemap page is free / idle when its reference count is 1 it requires an additional branch to check the page-type at put_page() time. Given put_page() is a hot-path we do not want to incur that check if HMM is not in use, so a static branch is used to avoid that overhead when not necessary. Now, the FS_DAX implementation wants to reuse this mechanism for receiving dev_pagemap ->page_free() callbacks. Rework the HMM-specific static-key into a generic mechanism that either HMM or FS_DAX code paths can enable. For ARCH=um builds, and any other arch that lacks ZONE_DEVICE support, care must be taken to compile out the DEV_PAGEMAP_OPS infrastructure. However, we still need to support FS_DAX in the FS_DAX_LIMITED case implemented by the s390/dcssblk driver. Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Michal Hocko Reported-by: kbuild test robot Reported-by: Thomas Meyer Reported-by: Dave Jiang Cc: "Jérôme Glisse" Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Dan Williams --- drivers/dax/super.c | 14 +++++++++++--- drivers/nvdimm/pfn_devs.c | 2 -- drivers/nvdimm/pmem.c | 25 +++++++++++++++++++++++++ 3 files changed, 36 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 2b2332b605e4..bf8bfaf5596f 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -86,6 +86,7 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize) { struct block_device *bdev = sb->s_bdev; struct dax_device *dax_dev; + bool dax_enabled = false; pgoff_t pgoff; int err, id; void *kaddr; @@ -134,14 +135,21 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize) * on being able to do (page_address(pfn_to_page())). */ WARN_ON(IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API)); + dax_enabled = true; } else if (pfn_t_devmap(pfn)) { - /* pass */; - } else { + struct dev_pagemap *pgmap; + + pgmap = get_dev_pagemap(pfn_t_to_pfn(pfn), NULL); + if (pgmap && pgmap->type == MEMORY_DEVICE_FS_DAX) + dax_enabled = true; + put_dev_pagemap(pgmap); + } + + if (!dax_enabled) { pr_debug("VFS (%s): error: dax support not enabled\n", sb->s_id); return -EOPNOTSUPP; } - return 0; } EXPORT_SYMBOL_GPL(__bdev_dax_supported); diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 30b08791597d..3f7ad5bc443e 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -561,8 +561,6 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap) res->start += start_pad; res->end -= end_trunc; - pgmap->type = MEMORY_DEVICE_HOST; - if (nd_pfn->mode == PFN_MODE_RAM) { if (offset < SZ_8K) return -EINVAL; diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 9d714926ecf5..06b41ec9f1b3 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -294,6 +294,27 @@ static void pmem_release_disk(void *__pmem) put_disk(pmem->disk); } +static void pmem_release_pgmap_ops(void *__pgmap) +{ + dev_pagemap_put_ops(); +} + +static void fsdax_pagefree(struct page *page, void *data) +{ + wake_up_var(&page->_refcount); +} + +static int setup_pagemap_fsdax(struct device *dev, struct dev_pagemap *pgmap) +{ + dev_pagemap_get_ops(); + if (devm_add_action_or_reset(dev, pmem_release_pgmap_ops, pgmap)) + return -ENOMEM; + pgmap->type = MEMORY_DEVICE_FS_DAX; + pgmap->page_free = fsdax_pagefree; + + return 0; +} + static int pmem_attach_disk(struct device *dev, struct nd_namespace_common *ndns) { @@ -353,6 +374,8 @@ static int pmem_attach_disk(struct device *dev, pmem->pfn_flags = PFN_DEV; pmem->pgmap.ref = &q->q_usage_counter; if (is_nd_pfn(dev)) { + if (setup_pagemap_fsdax(dev, &pmem->pgmap)) + return -ENOMEM; addr = devm_memremap_pages(dev, &pmem->pgmap); pfn_sb = nd_pfn->pfn_sb; pmem->data_offset = le64_to_cpu(pfn_sb->dataoff); @@ -364,6 +387,8 @@ static int pmem_attach_disk(struct device *dev, } else if (pmem_should_map_pages(dev)) { memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res)); pmem->pgmap.altmap_valid = false; + if (setup_pagemap_fsdax(dev, &pmem->pgmap)) + return -ENOMEM; addr = devm_memremap_pages(dev, &pmem->pgmap); pmem->pfn_flags |= PFN_MAP; memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res)); -- cgit v1.2.3 From b3a9a0c36e1f7b9e2e6cf965c2bb973624f2b3b9 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 2 May 2018 06:46:33 -0700 Subject: dax: Introduce a ->copy_to_iter dax operation Similar to the ->copy_from_iter() operation, a platform may want to deploy an architecture or device specific routine for handling reads from a dax_device like /dev/pmemX. On x86 this routine will point to a machine check safe version of copy_to_iter(). For now, add the plumbing to device-mapper and the dax core. Cc: Ross Zwisler Cc: Mike Snitzer Cc: Christoph Hellwig Signed-off-by: Dan Williams --- drivers/dax/super.c | 10 ++++++++++ drivers/md/dm-linear.c | 16 ++++++++++++++++ drivers/md/dm-log-writes.c | 15 +++++++++++++++ drivers/md/dm-stripe.c | 21 +++++++++++++++++++++ drivers/md/dm.c | 25 +++++++++++++++++++++++++ drivers/nvdimm/pmem.c | 7 +++++++ drivers/s390/block/dcssblk.c | 7 +++++++ fs/dax.c | 3 ++- include/linux/dax.h | 5 +++++ include/linux/device-mapper.h | 5 +++-- 10 files changed, 111 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 2b2332b605e4..31b839113399 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -282,6 +282,16 @@ size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, } EXPORT_SYMBOL_GPL(dax_copy_from_iter); +size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, + size_t bytes, struct iov_iter *i) +{ + if (!dax_alive(dax_dev)) + return 0; + + return dax_dev->ops->copy_to_iter(dax_dev, pgoff, addr, bytes, i); +} +EXPORT_SYMBOL_GPL(dax_copy_to_iter); + #ifdef CONFIG_ARCH_HAS_PMEM_API void arch_wb_cache_pmem(void *addr, size_t size); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 775c06d953b7..d10964d41fd7 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -185,9 +185,24 @@ static size_t linear_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff, return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i); } +static size_t linear_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + struct linear_c *lc = ti->private; + struct block_device *bdev = lc->dev->bdev; + struct dax_device *dax_dev = lc->dev->dax_dev; + sector_t dev_sector, sector = pgoff * PAGE_SECTORS; + + dev_sector = linear_map_sector(ti, sector); + if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), &pgoff)) + return 0; + return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i); +} + #else #define linear_dax_direct_access NULL #define linear_dax_copy_from_iter NULL +#define linear_dax_copy_to_iter NULL #endif static struct target_type linear_target = { @@ -204,6 +219,7 @@ static struct target_type linear_target = { .iterate_devices = linear_iterate_devices, .direct_access = linear_dax_direct_access, .dax_copy_from_iter = linear_dax_copy_from_iter, + .dax_copy_to_iter = linear_dax_copy_to_iter, }; int __init dm_linear_init(void) diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index c90c7c08a77f..9ea2b0291f20 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -962,9 +962,23 @@ static size_t log_writes_dax_copy_from_iter(struct dm_target *ti, dax_copy: return dax_copy_from_iter(lc->dev->dax_dev, pgoff, addr, bytes, i); } + +static size_t log_writes_dax_copy_to_iter(struct dm_target *ti, + pgoff_t pgoff, void *addr, size_t bytes, + struct iov_iter *i) +{ + struct log_writes_c *lc = ti->private; + sector_t sector = pgoff * PAGE_SECTORS; + + if (bdev_dax_pgoff(lc->dev->bdev, sector, ALIGN(bytes, PAGE_SIZE), &pgoff)) + return 0; + return dax_copy_to_iter(lc->dev->dax_dev, pgoff, addr, bytes, i); +} + #else #define log_writes_dax_direct_access NULL #define log_writes_dax_copy_from_iter NULL +#define log_writes_dax_copy_to_iter NULL #endif static struct target_type log_writes_target = { @@ -982,6 +996,7 @@ static struct target_type log_writes_target = { .io_hints = log_writes_io_hints, .direct_access = log_writes_dax_direct_access, .dax_copy_from_iter = log_writes_dax_copy_from_iter, + .dax_copy_to_iter = log_writes_dax_copy_to_iter, }; static int __init dm_log_writes_init(void) diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index fe7fb9b1aec3..8547d7594338 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -354,9 +354,29 @@ static size_t stripe_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff, return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i); } +static size_t stripe_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + sector_t dev_sector, sector = pgoff * PAGE_SECTORS; + struct stripe_c *sc = ti->private; + struct dax_device *dax_dev; + struct block_device *bdev; + uint32_t stripe; + + stripe_map_sector(sc, sector, &stripe, &dev_sector); + dev_sector += sc->stripe[stripe].physical_start; + dax_dev = sc->stripe[stripe].dev->dax_dev; + bdev = sc->stripe[stripe].dev->bdev; + + if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), &pgoff)) + return 0; + return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i); +} + #else #define stripe_dax_direct_access NULL #define stripe_dax_copy_from_iter NULL +#define stripe_dax_copy_to_iter NULL #endif /* @@ -478,6 +498,7 @@ static struct target_type stripe_target = { .io_hints = stripe_io_hints, .direct_access = stripe_dax_direct_access, .dax_copy_from_iter = stripe_dax_copy_from_iter, + .dax_copy_to_iter = stripe_dax_copy_to_iter, }; int __init dm_stripe_init(void) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0a7b0107ca78..6752f1c25258 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1089,6 +1089,30 @@ static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, return ret; } +static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + struct mapped_device *md = dax_get_private(dax_dev); + sector_t sector = pgoff * PAGE_SECTORS; + struct dm_target *ti; + long ret = 0; + int srcu_idx; + + ti = dm_dax_get_live_target(md, sector, &srcu_idx); + + if (!ti) + goto out; + if (!ti->type->dax_copy_to_iter) { + ret = copy_to_iter(addr, bytes, i); + goto out; + } + ret = ti->type->dax_copy_to_iter(ti, pgoff, addr, bytes, i); + out: + dm_put_live_table(md, srcu_idx); + + return ret; +} + /* * A target may call dm_accept_partial_bio only from the map routine. It is * allowed for all bio types except REQ_PREFLUSH and REQ_OP_ZONE_RESET. @@ -3134,6 +3158,7 @@ static const struct block_device_operations dm_blk_dops = { static const struct dax_operations dm_dax_ops = { .direct_access = dm_dax_direct_access, .copy_from_iter = dm_dax_copy_from_iter, + .copy_to_iter = dm_dax_copy_to_iter, }; /* diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index e023d6aa22b5..1b8ab48365de 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -264,9 +264,16 @@ static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, return copy_from_iter_flushcache(addr, bytes, i); } +static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + return copy_to_iter(addr, bytes, i); +} + static const struct dax_operations pmem_dax_ops = { .direct_access = pmem_dax_direct_access, .copy_from_iter = pmem_copy_from_iter, + .copy_to_iter = pmem_copy_to_iter, }; static const struct attribute_group *pmem_attribute_groups[] = { diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 0a312e450207..29024492b8ed 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -51,9 +51,16 @@ static size_t dcssblk_dax_copy_from_iter(struct dax_device *dax_dev, return copy_from_iter(addr, bytes, i); } +static size_t dcssblk_dax_copy_to_iter(struct dax_device *dax_dev, + pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) +{ + return copy_to_iter(addr, bytes, i); +} + static const struct dax_operations dcssblk_dax_ops = { .direct_access = dcssblk_dax_direct_access, .copy_from_iter = dcssblk_dax_copy_from_iter, + .copy_to_iter = dcssblk_dax_copy_to_iter, }; struct dcssblk_dev_info { diff --git a/fs/dax.c b/fs/dax.c index aaec72ded1b6..a64afdf7ec0d 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1057,7 +1057,8 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr, map_len, iter); else - map_len = copy_to_iter(kaddr, map_len, iter); + map_len = dax_copy_to_iter(dax_dev, pgoff, kaddr, + map_len, iter); if (map_len <= 0) { ret = map_len ? map_len : -EFAULT; break; diff --git a/include/linux/dax.h b/include/linux/dax.h index f9eb22ad341e..a43b396fb336 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -20,6 +20,9 @@ struct dax_operations { /* copy_from_iter: required operation for fs-dax direct-i/o */ size_t (*copy_from_iter)(struct dax_device *, pgoff_t, void *, size_t, struct iov_iter *); + /* copy_to_iter: required operation for fs-dax direct-i/o */ + size_t (*copy_to_iter)(struct dax_device *, pgoff_t, void *, size_t, + struct iov_iter *); }; extern struct attribute_group dax_attribute_group; @@ -118,6 +121,8 @@ long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn); size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); +size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, + size_t bytes, struct iov_iter *i); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size); ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 31fef7c34185..6fb0808e87c8 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -133,7 +133,7 @@ typedef int (*dm_busy_fn) (struct dm_target *ti); */ typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn); -typedef size_t (*dm_dax_copy_from_iter_fn)(struct dm_target *ti, pgoff_t pgoff, +typedef size_t (*dm_dax_copy_iter_fn)(struct dm_target *ti, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); #define PAGE_SECTORS (PAGE_SIZE / 512) @@ -184,7 +184,8 @@ struct target_type { dm_iterate_devices_fn iterate_devices; dm_io_hints_fn io_hints; dm_dax_direct_access_fn direct_access; - dm_dax_copy_from_iter_fn dax_copy_from_iter; + dm_dax_copy_iter_fn dax_copy_from_iter; + dm_dax_copy_iter_fn dax_copy_to_iter; /* For internal device-mapper use. */ struct list_head list; -- cgit v1.2.3 From 6dfdb2b6d877d654c8c7b59d7166f4d672fba4e8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 2 May 2018 13:41:02 -0700 Subject: pmem: Switch to copy_to_iter_mcsafe() Use the machine check safe version of copy_to_iter() for the ->copy_to_iter() operation published by the pmem driver. Signed-off-by: Dan Williams --- drivers/nvdimm/pmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 1b8ab48365de..6d3da8c92868 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -267,7 +267,7 @@ static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) { - return copy_to_iter(addr, bytes, i); + return copy_to_iter_mcsafe(addr, bytes, i); } static const struct dax_operations pmem_dax_ops = { -- cgit v1.2.3 From 254a4cd50b9fe2291a12b8902e08e56dcc4e9b10 Mon Sep 17 00:00:00 2001 From: Robert Elliott Date: Thu, 31 May 2018 18:36:36 -0500 Subject: linvdimm, pmem: Preserve read-only setting for pmem devices The pmem driver does not honor a forced read-only setting for very long: $ blockdev --setro /dev/pmem0 $ blockdev --getro /dev/pmem0 1 followed by various commands like these: $ blockdev --rereadpt /dev/pmem0 or $ mkfs.ext4 /dev/pmem0 results in this in the kernel serial log: nd_pmem namespace0.0: region0 read-write, marking pmem0 read-write with the read-only setting lost: $ blockdev --getro /dev/pmem0 0 That's from bus.c nvdimm_revalidate_disk(), which always applies the setting from nd_region (which is initially based on the ACPI NFIT NVDIMM state flags not_armed bit). In contrast, commit 20bd1d026aac ("scsi: sd: Keep disk read-only when re-reading partition") fixed this issue for SCSI devices to preserve the previous setting if it was set to read-only. This patch modifies bus.c to preserve any previous read-only setting. It also eliminates the kernel serial log print except for cases where read-write is changed to read-only, so it doesn't print read-only to read-only non-changes. Cc: Fixes: 581388209405 ("libnvdimm, nfit: handle unarmed dimms, mark namespaces read-only") Signed-off-by: Robert Elliott Signed-off-by: Dan Williams --- drivers/nvdimm/bus.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index a64023690cad..b9e0d30e317a 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -566,14 +566,18 @@ int nvdimm_revalidate_disk(struct gendisk *disk) { struct device *dev = disk_to_dev(disk)->parent; struct nd_region *nd_region = to_nd_region(dev->parent); - const char *pol = nd_region->ro ? "only" : "write"; + int disk_ro = get_disk_ro(disk); - if (nd_region->ro == get_disk_ro(disk)) + /* + * Upgrade to read-only if the region is read-only preserve as + * read-only if the disk is already read-only. + */ + if (disk_ro || nd_region->ro == disk_ro) return 0; - dev_info(dev, "%s read-%s, marking %s read-%s\n", - dev_name(&nd_region->dev), pol, disk->disk_name, pol); - set_disk_ro(disk, nd_region->ro); + dev_info(dev, "%s read-only, marking %s read-only\n", + dev_name(&nd_region->dev), disk->disk_name); + set_disk_ro(disk, 1); return 0; -- cgit v1.2.3 From 3f46833df94eec3b7057181898e02cb9e4a49074 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 1 Jun 2018 14:10:58 -0700 Subject: libnvdimm: Debug probe times Instrument nvdimm_bus_probe() to emit timestamps for the start and end of libnvdimm device probing. This is useful for identifying sources of libnvdimm sub-system initialization latency. Signed-off-by: Dan Williams --- drivers/nvdimm/bus.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index b9e0d30e317a..27902a8799b1 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -100,6 +100,9 @@ static int nvdimm_bus_probe(struct device *dev) if (!try_module_get(provider)) return -ENXIO; + dev_dbg(&nvdimm_bus->dev, "START: %s.probe(%s)\n", + dev->driver->name, dev_name(dev)); + nvdimm_bus_probe_start(nvdimm_bus); rc = nd_drv->probe(dev); if (rc == 0) @@ -108,7 +111,7 @@ static int nvdimm_bus_probe(struct device *dev) nd_region_disable(nvdimm_bus, dev); nvdimm_bus_probe_end(nvdimm_bus); - dev_dbg(&nvdimm_bus->dev, "%s.probe(%s) = %d\n", dev->driver->name, + dev_dbg(&nvdimm_bus->dev, "END: %s.probe(%s) = %d\n", dev->driver->name, dev_name(dev), rc); if (rc != 0) -- cgit v1.2.3 From d76401ade0bb6ab0a70dea317ec115d5425880cf Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 2 Jun 2018 11:43:39 -0700 Subject: libnvdimm, e820: Register all pmem resources There is currently a mismatch between the resources that will trigger the e820_pmem driver to register/load and the resources that will actually be surfaced as pmem ranges. register_e820_pmem() uses walk_iomem_res_desc() which includes children and siblings. In contrast, e820_pmem_probe() only considers top level resources. For example the following resource tree results in the driver being loaded, but no resources being registered: 398000000000-39bfffffffff : PCI Bus 0000:ae 39be00000000-39bf07ffffff : PCI Bus 0000:af 39be00000000-39beffffffff : 0000:af:00.0 39be10000000-39beffffffff : Persistent Memory (legacy) Fix this up to allow definitions of "legacy" pmem ranges anywhere in system-physical address space. Not that it is a recommended or safe to define a pmem range in PCI space, but it is useful for debug / experimentation, and the restriction on being a top-level resource was arbitrary. Cc: Christoph Hellwig Signed-off-by: Dan Williams --- drivers/nvdimm/e820.c | 41 ++++++++++++++++++++++------------------- kernel/resource.c | 1 + 2 files changed, 23 insertions(+), 19 deletions(-) (limited to 'drivers') diff --git a/drivers/nvdimm/e820.c b/drivers/nvdimm/e820.c index 6f9a6ffd7cde..521eaf53a52a 100644 --- a/drivers/nvdimm/e820.c +++ b/drivers/nvdimm/e820.c @@ -38,12 +38,27 @@ static int e820_range_to_nid(resource_size_t addr) } #endif +static int e820_register_one(struct resource *res, void *data) +{ + struct nd_region_desc ndr_desc; + struct nvdimm_bus *nvdimm_bus = data; + + memset(&ndr_desc, 0, sizeof(ndr_desc)); + ndr_desc.res = res; + ndr_desc.attr_groups = e820_pmem_region_attribute_groups; + ndr_desc.numa_node = e820_range_to_nid(res->start); + set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags); + if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc)) + return -ENXIO; + return 0; +} + static int e820_pmem_probe(struct platform_device *pdev) { static struct nvdimm_bus_descriptor nd_desc; struct device *dev = &pdev->dev; struct nvdimm_bus *nvdimm_bus; - struct resource *p; + int rc = -ENXIO; nd_desc.attr_groups = e820_pmem_attribute_groups; nd_desc.provider_name = "e820"; @@ -53,27 +68,15 @@ static int e820_pmem_probe(struct platform_device *pdev) goto err; platform_set_drvdata(pdev, nvdimm_bus); - for (p = iomem_resource.child; p ; p = p->sibling) { - struct nd_region_desc ndr_desc; - - if (p->desc != IORES_DESC_PERSISTENT_MEMORY_LEGACY) - continue; - - memset(&ndr_desc, 0, sizeof(ndr_desc)); - ndr_desc.res = p; - ndr_desc.attr_groups = e820_pmem_region_attribute_groups; - ndr_desc.numa_node = e820_range_to_nid(p->start); - set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags); - if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc)) - goto err; - } - + rc = walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY, + IORESOURCE_MEM, 0, -1, nvdimm_bus, e820_register_one); + if (rc) + goto err; return 0; - - err: +err: nvdimm_bus_unregister(nvdimm_bus); dev_err(dev, "failed to register legacy persistent memory ranges\n"); - return -ENXIO; + return rc; } static struct platform_driver e820_pmem_driver = { diff --git a/kernel/resource.c b/kernel/resource.c index 2af6c03858b9..b85f59e8a4b8 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -448,6 +448,7 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, return __walk_iomem_res_desc(&res, desc, false, arg, func); } +EXPORT_SYMBOL_GPL(walk_iomem_res_desc); /* * This function calls the @func callback against all memory ranges of type -- cgit v1.2.3 From d4dd70915ea82b7196108c1f0102bc09295c4513 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 3 Jun 2018 12:49:15 -0700 Subject: acpi, nfit: Remove ecc_unit_size The "Clear Error Unit" may be smaller than the ECC unit size on some devices. For example, poison may be tracked at 64-byte alignment even though the ECC unit is larger. Unless / until the ACPI specification provides a non-ambiguous way to communicate this property do not expose this to userspace. Software that had been using this property must already be prepared for the case where the property is not provided on older kernels, so it is safe to remove this attribute. Signed-off-by: Dan Williams --- Documentation/ABI/removed/sysfs-bus-nfit | 17 +++++++++++++++++ Documentation/ABI/testing/sysfs-bus-nfit | 19 ------------------- drivers/acpi/nfit/core.c | 11 ----------- 3 files changed, 17 insertions(+), 30 deletions(-) create mode 100644 Documentation/ABI/removed/sysfs-bus-nfit (limited to 'drivers') diff --git a/Documentation/ABI/removed/sysfs-bus-nfit b/Documentation/ABI/removed/sysfs-bus-nfit new file mode 100644 index 000000000000..ae8c1ca53828 --- /dev/null +++ b/Documentation/ABI/removed/sysfs-bus-nfit @@ -0,0 +1,17 @@ +What: /sys/bus/nd/devices/regionX/nfit/ecc_unit_size +Date: Aug, 2017 +KernelVersion: v4.14 (Removed v4.18) +Contact: linux-nvdimm@lists.01.org +Description: + (RO) Size of a write request to a DIMM that will not incur a + read-modify-write cycle at the memory controller. + + When the nfit driver initializes it runs an ARS (Address Range + Scrub) operation across every pmem range. Part of that process + involves determining the ARS capabilities of a given address + range. One of the capabilities that is reported is the 'Clear + Uncorrectable Error Range Length Unit Size' (see: ACPI 6.2 + section 9.20.7.4 Function Index 1 - Query ARS Capabilities). + This property indicates the boundary at which the NVDIMM may + need to perform read-modify-write cycles to maintain ECC (Error + Correcting Code) blocks. diff --git a/Documentation/ABI/testing/sysfs-bus-nfit b/Documentation/ABI/testing/sysfs-bus-nfit index 619eb8ca0f99..a1cb44dcb908 100644 --- a/Documentation/ABI/testing/sysfs-bus-nfit +++ b/Documentation/ABI/testing/sysfs-bus-nfit @@ -212,22 +212,3 @@ Description: range. Used by NVDIMM Region Mapping Structure to uniquely refer to this structure. Value of 0 is reserved and not used as an index. - - -What: /sys/bus/nd/devices/regionX/nfit/ecc_unit_size -Date: Aug, 2017 -KernelVersion: v4.14 -Contact: linux-nvdimm@lists.01.org -Description: - (RO) Size of a write request to a DIMM that will not incur a - read-modify-write cycle at the memory controller. - - When the nfit driver initializes it runs an ARS (Address Range - Scrub) operation across every pmem range. Part of that process - involves determining the ARS capabilities of a given address - range. One of the capabilities that is reported is the 'Clear - Uncorrectable Error Range Length Unit Size' (see: ACPI 6.2 - section 9.20.7.4 Function Index 1 - Query ARS Capabilities). - This property indicates the boundary at which the NVDIMM may - need to perform read-modify-write cycles to maintain ECC (Error - Correcting Code) blocks. diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index e2235ed3e4be..b87252bf4571 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -1978,19 +1978,8 @@ static ssize_t range_index_show(struct device *dev, } static DEVICE_ATTR_RO(range_index); -static ssize_t ecc_unit_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct nd_region *nd_region = to_nd_region(dev); - struct nfit_spa *nfit_spa = nd_region_provider_data(nd_region); - - return sprintf(buf, "%d\n", nfit_spa->clear_err_unit); -} -static DEVICE_ATTR_RO(ecc_unit_size); - static struct attribute *acpi_nfit_region_attributes[] = { &dev_attr_range_index.attr, - &dev_attr_ecc_unit_size.attr, NULL, }; -- cgit v1.2.3 From d2d6364dcbf6affb6f52cdae668c59117703c661 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 6 Jun 2018 10:45:12 -0600 Subject: libnvdimm, pmem: Complete REQ_FLUSH => REQ_PREFLUSH Complete the move from REQ_FLUSH to REQ_PREFLUSH that apparently started way back in v4.8. Signed-off-by: Ross Zwisler Signed-off-by: Dan Williams --- drivers/nvdimm/pmem.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 9d714926ecf5..252adfab1e47 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -164,11 +164,6 @@ static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page, return rc; } -/* account for REQ_FLUSH rename, replace with REQ_PREFLUSH after v4.8-rc1 */ -#ifndef REQ_FLUSH -#define REQ_FLUSH REQ_PREFLUSH -#endif - static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) { blk_status_t rc = 0; @@ -179,7 +174,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) struct pmem_device *pmem = q->queuedata; struct nd_region *nd_region = to_region(pmem); - if (bio->bi_opf & REQ_FLUSH) + if (bio->bi_opf & REQ_PREFLUSH) nvdimm_flush(nd_region); do_acct = nd_iostat_start(bio, &start); -- cgit v1.2.3 From ce7f11a230d5b7165480b96c0cc7a90358b5b5e2 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 6 Jun 2018 10:45:13 -0600 Subject: libnvdimm, pmem: Unconditionally deep flush on *sync Prior to this commit we would only do a "deep flush" (have nvdimm_flush() write to each of the flush hints for a region) in response to an msync/fsync/sync call if the nvdimm_has_cache() returned true at the time we were setting up the request queue. This happens due to the write cache value passed in to blk_queue_write_cache(), which then causes the block layer to send down BIOs with REQ_FUA and REQ_PREFLUSH set. We do have a "write_cache" sysfs entry for namespaces, i.e.: /sys/bus/nd/devices/pfn0.1/block/pmem0/dax/write_cache which can be used to control whether or not the kernel thinks a given namespace has a write cache, but this didn't modify the deep flush behavior that we set up when the driver was initialized. Instead, it only modified whether or not DAX would flush CPU caches via dax_flush() in response to *sync calls. Simplify this by making the *sync deep flush always happen, regardless of the write cache setting of a namespace. The DAX CPU cache flushing will still be controlled the write_cache setting of the namespace. Cc: Fixes: 5fdf8e5ba566 ("libnvdimm: re-enable deep flush for pmem devices via fsync()") Signed-off-by: Ross Zwisler Signed-off-by: Dan Williams --- drivers/nvdimm/pmem.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 252adfab1e47..97b4c39a9267 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -294,7 +294,7 @@ static int pmem_attach_disk(struct device *dev, { struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); struct nd_region *nd_region = to_nd_region(dev->parent); - int nid = dev_to_node(dev), fua, wbc; + int nid = dev_to_node(dev), fua; struct resource *res = &nsio->res; struct resource bb_res; struct nd_pfn *nd_pfn = NULL; @@ -330,7 +330,6 @@ static int pmem_attach_disk(struct device *dev, dev_warn(dev, "unable to guarantee persistence of writes\n"); fua = 0; } - wbc = nvdimm_has_cache(nd_region); if (!devm_request_mem_region(dev, res->start, resource_size(res), dev_name(&ndns->dev))) { @@ -377,7 +376,7 @@ static int pmem_attach_disk(struct device *dev, return PTR_ERR(addr); pmem->virt_addr = addr; - blk_queue_write_cache(q, wbc, fua); + blk_queue_write_cache(q, true, fua); blk_queue_make_request(q, pmem_make_request); blk_queue_physical_block_size(q, PAGE_SIZE); blk_queue_logical_block_size(q, pmem_sector_size(ndns)); @@ -408,7 +407,7 @@ static int pmem_attach_disk(struct device *dev, put_disk(disk); return -ENOMEM; } - dax_write_cache(dax_dev, wbc); + dax_write_cache(dax_dev, nvdimm_has_cache(nd_region)); pmem->dax_dev = dax_dev; gendev = disk_to_dev(disk); -- cgit v1.2.3 From 546eb0317cfa3c4f9e1d9ab892766d65d7f78fad Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 6 Jun 2018 10:45:15 -0600 Subject: libnvdimm, pmem: Do not flush power-fail protected CPU caches This commit: 5fdf8e5ba566 ("libnvdimm: re-enable deep flush for pmem devices via fsync()") intended to make sure that deep flush was always available even on platforms which support a power-fail protected CPU cache. An unintended side effect of this change was that we also lost the ability to skip flushing CPU caches on those power-fail protected CPU cache. Fix this by skipping the low level cache flushing in dax_flush() if we have CPU caches which are power-fail protected. The user can still override this behavior by manually setting the write_cache state of a namespace. See libndctl's ndctl_namespace_write_cache_is_enabled(), ndctl_namespace_enable_write_cache() and ndctl_namespace_disable_write_cache() functions. Cc: Fixes: 5fdf8e5ba566 ("libnvdimm: re-enable deep flush for pmem devices via fsync()") Signed-off-by: Ross Zwisler Signed-off-by: Dan Williams --- drivers/nvdimm/region_devs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index a612be6f019d..ec3543b83330 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -1132,7 +1132,8 @@ EXPORT_SYMBOL_GPL(nvdimm_has_flush); int nvdimm_has_cache(struct nd_region *nd_region) { - return is_nd_pmem(&nd_region->dev); + return is_nd_pmem(&nd_region->dev) && + !test_bit(ND_REGION_PERSIST_CACHE, &nd_region->flags); } EXPORT_SYMBOL_GPL(nvdimm_has_cache); -- cgit v1.2.3 From 808c340be17dc77131fcdf9ad1eb34452d650da1 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 6 Jun 2018 10:45:14 -0600 Subject: dax: Use dax_write_cache* helpers Use dax_write_cache() and dax_write_cache_enabled() instead of open coding the bit operations. Signed-off-by: Ross Zwisler Signed-off-by: Dan Williams --- drivers/dax/super.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 2b2332b605e4..c2c46f96b18c 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -182,8 +182,7 @@ static ssize_t write_cache_show(struct device *dev, if (!dax_dev) return -ENXIO; - rc = sprintf(buf, "%d\n", !!test_bit(DAXDEV_WRITE_CACHE, - &dax_dev->flags)); + rc = sprintf(buf, "%d\n", !!dax_write_cache_enabled(dax_dev)); put_dax(dax_dev); return rc; } @@ -201,10 +200,8 @@ static ssize_t write_cache_store(struct device *dev, if (rc) len = rc; - else if (write_cache) - set_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); else - clear_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); + dax_write_cache(dax_dev, write_cache); put_dax(dax_dev); return len; @@ -286,7 +283,7 @@ EXPORT_SYMBOL_GPL(dax_copy_from_iter); void arch_wb_cache_pmem(void *addr, size_t size); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) { - if (unlikely(!test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags))) + if (unlikely(!dax_write_cache_enabled(dax_dev))) return; arch_wb_cache_pmem(addr, size); -- cgit v1.2.3