summaryrefslogtreecommitdiffstats
path: root/drivers/dax
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-01-15 19:37:06 +0100
committerLinus Torvalds <torvalds@linux-foundation.org>2022-01-15 19:37:06 +0100
commitf56caedaf94f9ced5dbfcdb0060a3e788d2078af (patch)
treee213532d1b3d32f9f0e81948f3b23804baff287d /drivers/dax
parentMerge tag 'xfs-5.17-merge-3' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux (diff)
parentmm/damon: hide kernel pointer from tracepoint event (diff)
downloadlinux-f56caedaf94f9ced5dbfcdb0060a3e788d2078af.tar.xz
linux-f56caedaf94f9ced5dbfcdb0060a3e788d2078af.zip
Merge branch 'akpm' (patches from Andrew)
Merge misc updates from Andrew Morton: "146 patches. Subsystems affected by this patch series: kthread, ia64, scripts, ntfs, squashfs, ocfs2, vfs, and mm (slab-generic, slab, kmemleak, dax, kasan, debug, pagecache, gup, shmem, frontswap, memremap, memcg, selftests, pagemap, dma, vmalloc, memory-failure, hugetlb, userfaultfd, vmscan, mempolicy, oom-kill, hugetlbfs, migration, thp, ksm, page-poison, percpu, rmap, zswap, zram, cleanups, hmm, and damon)" * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (146 commits) mm/damon: hide kernel pointer from tracepoint event mm/damon/vaddr: hide kernel pointer from damon_va_three_regions() failure log mm/damon/vaddr: use pr_debug() for damon_va_three_regions() failure logging mm/damon/dbgfs: remove an unnecessary variable mm/damon: move the implementation of damon_insert_region to damon.h mm/damon: add access checking for hugetlb pages Docs/admin-guide/mm/damon/usage: update for schemes statistics mm/damon/dbgfs: support all DAMOS stats Docs/admin-guide/mm/damon/reclaim: document statistics parameters mm/damon/reclaim: provide reclamation statistics mm/damon/schemes: account how many times quota limit has exceeded mm/damon/schemes: account scheme actions that successfully applied mm/damon: remove a mistakenly added comment for a future feature Docs/admin-guide/mm/damon/usage: update for kdamond_pid and (mk|rm)_contexts Docs/admin-guide/mm/damon/usage: mention tracepoint at the beginning Docs/admin-guide/mm/damon/usage: remove redundant information Docs/admin-guide/mm/damon/usage: update for scheme quotas and watermarks mm/damon: convert macro functions to static inline functions mm/damon: modify damon_rand() macro to static inline function mm/damon: move damon_rand() definition into damon.h ...
Diffstat (limited to 'drivers/dax')
-rw-r--r--drivers/dax/bus.c32
-rw-r--r--drivers/dax/bus.h1
-rw-r--r--drivers/dax/device.c126
3 files changed, 109 insertions, 50 deletions
diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index ee4568ef757c..1dad813ee4a6 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -127,11 +127,35 @@ ATTRIBUTE_GROUPS(dax_drv);
static int dax_bus_match(struct device *dev, struct device_driver *drv);
+/*
+ * Static dax regions are regions created by an external subsystem (e.g.
+ * nvdimm) where a single range is assigned. Their boundaries are defined by
+ * the external subsystem and are usually limited to one physical memory range.
+ * For example, for PMEM they are usually defined by NVDIMM Namespace
+ * boundaries (i.e. a single contiguous range).
+ *
+ * On dynamic dax regions, the assigned region can be partitioned by dax core
+ * into multiple subdivisions. A subdivision is represented as one
+ * /dev/daxN.M device composed of one or more potentially discontiguous ranges.
+ *
+ * When allocating a dax region, drivers must set whether it's static
+ * (IORESOURCE_DAX_STATIC). On static dax devices, the @pgmap is pre-assigned
+ * to dax core when calling devm_create_dev_dax(), whereas in dynamic dax
+ * devices it is NULL but afterwards allocated by dax core on device ->probe().
+ * Care is needed to make sure that dynamic dax devices are torn down with a
+ * cleared @pgmap field (see kill_dev_dax()).
+ */
static bool is_static(struct dax_region *dax_region)
{
return (dax_region->res.flags & IORESOURCE_DAX_STATIC) != 0;
}
+bool static_dev_dax(struct dev_dax *dev_dax)
+{
+ return is_static(dev_dax->region);
+}
+EXPORT_SYMBOL_GPL(static_dev_dax);
+
static u64 dev_dax_size(struct dev_dax *dev_dax)
{
u64 size = 0;
@@ -361,6 +385,14 @@ void kill_dev_dax(struct dev_dax *dev_dax)
kill_dax(dax_dev);
unmap_mapping_range(inode->i_mapping, 0, 0, 1);
+
+ /*
+ * Dynamic dax regions have the pgmap allocated via devm_kzalloc()
+ * and thus freed by devm. Clear the pgmap to not have stale pgmap
+ * ranges on probe() from previous reconfigurations of region devices.
+ */
+ if (!static_dev_dax(dev_dax))
+ dev_dax->pgmap = NULL;
}
EXPORT_SYMBOL_GPL(kill_dev_dax);
diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h
index 381cec9ff05c..fbb940293d6d 100644
--- a/drivers/dax/bus.h
+++ b/drivers/dax/bus.h
@@ -39,6 +39,7 @@ int __dax_driver_register(struct dax_device_driver *dax_drv,
__dax_driver_register(driver, THIS_MODULE, KBUILD_MODNAME)
void dax_driver_unregister(struct dax_device_driver *dax_drv);
void kill_dev_dax(struct dev_dax *dev_dax);
+bool static_dev_dax(struct dev_dax *dev_dax);
/*
* While run_dax() is potentially a generic operation that could be
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index e58d597f0415..d33a0613ed0c 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -73,11 +73,39 @@ __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
return -1;
}
+/*
+ * dax_set_mapping() - record the file mapping on the pages backing a fault
+ * @vmf: fault being handled; provides the vma, its file and the fault address
+ * @pfn: first pfn of the range being mapped for this fault
+ * @fault_size: size in bytes of the mapping being installed
+ *
+ * Every visited page that has no ->mapping yet is given the file's
+ * address_space and the index computed by linear_page_index() for the start
+ * of the faulting range, plus the page's position within that range.
+ */
+static void dax_set_mapping(struct vm_fault *vmf, pfn_t pfn,
+			      unsigned long fault_size)
+{
+	unsigned long i, nr_pages = fault_size / PAGE_SIZE;
+	struct file *filp = vmf->vma->vm_file;
+	struct dev_dax *dev_dax = filp->private_data;
+	pgoff_t pgoff;
+
+	/* mapping is only set on the head */
+	if (dev_dax->pgmap->vmemmap_shift)
+		nr_pages = 1;
+
+	/*
+	 * The fault address may lie anywhere inside the fault_size-aligned
+	 * range being mapped, so round it DOWN to the start of that range.
+	 * Rounding up (ALIGN()) would store the index of the following
+	 * range whenever the address is not already aligned.
+	 */
+	pgoff = linear_page_index(vmf->vma,
+			vmf->address & ~(fault_size - 1));
+
+	for (i = 0; i < nr_pages; i++) {
+		struct page *page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
+
+		page = compound_head(page);
+		/* leave pages that already carry a mapping untouched */
+		if (page->mapping)
+			continue;
+
+		page->mapping = filp->f_mapping;
+		page->index = pgoff + i;
+	}
+}
+
static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
- struct vm_fault *vmf, pfn_t *pfn)
+ struct vm_fault *vmf)
{
struct device *dev = &dev_dax->dev;
phys_addr_t phys;
+ pfn_t pfn;
unsigned int fault_size = PAGE_SIZE;
if (check_vma(dev_dax, vmf->vma, __func__))
@@ -98,18 +126,21 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
return VM_FAULT_SIGBUS;
}
- *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
+ pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
- return vmf_insert_mixed(vmf->vma, vmf->address, *pfn);
+ dax_set_mapping(vmf, pfn, fault_size);
+
+ return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
}
static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
- struct vm_fault *vmf, pfn_t *pfn)
+ struct vm_fault *vmf)
{
unsigned long pmd_addr = vmf->address & PMD_MASK;
struct device *dev = &dev_dax->dev;
phys_addr_t phys;
pgoff_t pgoff;
+ pfn_t pfn;
unsigned int fault_size = PMD_SIZE;
if (check_vma(dev_dax, vmf->vma, __func__))
@@ -138,19 +169,22 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
return VM_FAULT_SIGBUS;
}
- *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
+ pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
- return vmf_insert_pfn_pmd(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE);
+ dax_set_mapping(vmf, pfn, fault_size);
+
+ return vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
}
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
- struct vm_fault *vmf, pfn_t *pfn)
+ struct vm_fault *vmf)
{
unsigned long pud_addr = vmf->address & PUD_MASK;
struct device *dev = &dev_dax->dev;
phys_addr_t phys;
pgoff_t pgoff;
+ pfn_t pfn;
unsigned int fault_size = PUD_SIZE;
@@ -180,13 +214,15 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
return VM_FAULT_SIGBUS;
}
- *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
+ pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
- return vmf_insert_pfn_pud(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE);
+ dax_set_mapping(vmf, pfn, fault_size);
+
+ return vmf_insert_pfn_pud(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
}
#else
static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
- struct vm_fault *vmf, pfn_t *pfn)
+ struct vm_fault *vmf)
{
return VM_FAULT_FALLBACK;
}
@@ -196,10 +232,8 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf,
enum page_entry_size pe_size)
{
struct file *filp = vmf->vma->vm_file;
- unsigned long fault_size;
vm_fault_t rc = VM_FAULT_SIGBUS;
int id;
- pfn_t pfn;
struct dev_dax *dev_dax = filp->private_data;
dev_dbg(&dev_dax->dev, "%s: %s (%#lx - %#lx) size = %d\n", current->comm,
@@ -209,43 +243,18 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf,
id = dax_read_lock();
switch (pe_size) {
case PE_SIZE_PTE:
- fault_size = PAGE_SIZE;
- rc = __dev_dax_pte_fault(dev_dax, vmf, &pfn);
+ rc = __dev_dax_pte_fault(dev_dax, vmf);
break;
case PE_SIZE_PMD:
- fault_size = PMD_SIZE;
- rc = __dev_dax_pmd_fault(dev_dax, vmf, &pfn);
+ rc = __dev_dax_pmd_fault(dev_dax, vmf);
break;
case PE_SIZE_PUD:
- fault_size = PUD_SIZE;
- rc = __dev_dax_pud_fault(dev_dax, vmf, &pfn);
+ rc = __dev_dax_pud_fault(dev_dax, vmf);
break;
default:
rc = VM_FAULT_SIGBUS;
}
- if (rc == VM_FAULT_NOPAGE) {
- unsigned long i;
- pgoff_t pgoff;
-
- /*
- * In the device-dax case the only possibility for a
- * VM_FAULT_NOPAGE result is when device-dax capacity is
- * mapped. No need to consider the zero page, or racing
- * conflicting mappings.
- */
- pgoff = linear_page_index(vmf->vma, vmf->address
- & ~(fault_size - 1));
- for (i = 0; i < fault_size / PAGE_SIZE; i++) {
- struct page *page;
-
- page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
- if (page->mapping)
- continue;
- page->mapping = filp->f_mapping;
- page->index = pgoff + i;
- }
- }
dax_read_unlock(id);
return rc;
@@ -398,17 +407,34 @@ int dev_dax_probe(struct dev_dax *dev_dax)
void *addr;
int rc, i;
- pgmap = dev_dax->pgmap;
- if (dev_WARN_ONCE(dev, pgmap && dev_dax->nr_range > 1,
- "static pgmap / multi-range device conflict\n"))
- return -EINVAL;
+ if (static_dev_dax(dev_dax)) {
+ if (dev_dax->nr_range > 1) {
+ dev_warn(dev,
+ "static pgmap / multi-range device conflict\n");
+ return -EINVAL;
+ }
- if (!pgmap) {
- pgmap = devm_kzalloc(dev, sizeof(*pgmap) + sizeof(struct range)
- * (dev_dax->nr_range - 1), GFP_KERNEL);
+ pgmap = dev_dax->pgmap;
+ } else {
+ if (dev_dax->pgmap) {
+ dev_warn(dev,
+ "dynamic-dax with pre-populated page map\n");
+ return -EINVAL;
+ }
+
+ pgmap = devm_kzalloc(dev,
+ struct_size(pgmap, ranges, dev_dax->nr_range - 1),
+ GFP_KERNEL);
if (!pgmap)
return -ENOMEM;
+
pgmap->nr_range = dev_dax->nr_range;
+ dev_dax->pgmap = pgmap;
+
+ for (i = 0; i < dev_dax->nr_range; i++) {
+ struct range *range = &dev_dax->ranges[i].range;
+ pgmap->ranges[i] = *range;
+ }
}
for (i = 0; i < dev_dax->nr_range; i++) {
@@ -420,12 +446,12 @@ int dev_dax_probe(struct dev_dax *dev_dax)
i, range->start, range->end);
return -EBUSY;
}
- /* don't update the range for static pgmap */
- if (!dev_dax->pgmap)
- pgmap->ranges[i] = *range;
}
pgmap->type = MEMORY_DEVICE_GENERIC;
+ if (dev_dax->align > PAGE_SIZE)
+ pgmap->vmemmap_shift =
+ order_base_2(dev_dax->align >> PAGE_SHIFT);
addr = devm_memremap_pages(dev, pgmap);
if (IS_ERR(addr))
return PTR_ERR(addr);