diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-04-02 02:57:52 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-04-02 02:57:52 +0200 |
commit | 50a5de895dbe5df947b3a695777db5b2c313e065 (patch) | |
tree | 4eb38b5b7a9c842ec3b262788ca17b19bfaf9015 | |
parent | Merge tag 'xarray-5.7' of git://git.infradead.org/users/willy/linux-dax (diff) | |
parent | mm/hmm: return error for non-vma snapshots (diff) | |
download | linux-50a5de895dbe5df947b3a695777db5b2c313e065.tar.xz linux-50a5de895dbe5df947b3a695777db5b2c313e065.zip |
Merge tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull hmm updates from Jason Gunthorpe:
"This series focuses on corner case bug fixes and general clarity
improvements to hmm_range_fault(). It arose from a review of
hmm_range_fault() by Christoph, Ralph and myself.
hmm_range_fault() is being used by these 'SVM' style drivers to
non-destructively read the page tables. It is very similar to
get_user_pages() except that the output is an array of PFNs and
per-pfn flags, and it has various modes of reading.
This is necessary before RDMA ODP can be converted, as we don't want
to have weird corner case regressions; that conversion is still a
forward-looking item. Ralph has a nice tester for this routine, but it is
waiting for feedback from the selftests maintainers.
Summary:
- 9 bug fixes
- Allow pgmap to track the 'owner' of a DEVICE_PRIVATE - in this case
the owner tells the driver if it can understand the DEVICE_PRIVATE
page or not. Use this to resolve a bug in nouveau where it could
touch DEVICE_PRIVATE pages from other drivers.
- Remove a bunch of dead, redundant or unused code and flags
- Clarity improvements to hmm_range_fault()"
* tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (25 commits)
mm/hmm: return error for non-vma snapshots
mm/hmm: do not set pfns when returning an error code
mm/hmm: do not unconditionally set pfns when returning EBUSY
mm/hmm: use device_private_entry_to_pfn()
mm/hmm: remove HMM_FAULT_SNAPSHOT
mm/hmm: remove unused code and tidy comments
mm/hmm: return the fault type from hmm_pte_need_fault()
mm/hmm: remove pgmap checking for devmap pages
mm/hmm: check the device private page owner in hmm_range_fault()
mm: simplify device private page handling in hmm_range_fault
mm: handle multiple owners of device private pages in migrate_vma
memremap: add an owner field to struct dev_pagemap
mm: merge hmm_vma_do_fault into hmm_vma_walk_hole_
mm/hmm: don't handle the non-fault case in hmm_vma_walk_hole_()
mm/hmm: simplify hmm_vma_walk_hugetlb_entry()
mm/hmm: remove the unused HMM_FAULT_ALLOW_RETRY flag
mm/hmm: don't provide a stub for hmm_range_fault()
mm/hmm: do not check pmd_protnone twice in hmm_vma_handle_pmd()
mm/hmm: add missing call to hmm_pte_need_fault in HMM_PFN_SPECIAL handling
mm/hmm: return -EFAULT when setting HMM_PFN_ERROR on requested valid pages
...
-rw-r--r-- | Documentation/vm/hmm.rst | 12 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_hv_uvmem.c | 3 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 | ||||
-rw-r--r-- | drivers/gpu/drm/nouveau/nouveau_dmem.c | 19 | ||||
-rw-r--r-- | drivers/gpu/drm/nouveau/nouveau_svm.c | 3 | ||||
-rw-r--r-- | include/linux/hmm.h | 125 | ||||
-rw-r--r-- | include/linux/memremap.h | 4 | ||||
-rw-r--r-- | include/linux/migrate.h | 8 | ||||
-rw-r--r-- | mm/hmm.c | 470 | ||||
-rw-r--r-- | mm/memremap.c | 4 | ||||
-rw-r--r-- | mm/migrate.c | 9 |
11 files changed, 227 insertions, 433 deletions
diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index 95fec5968362..4e3e9362afeb 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -161,13 +161,11 @@ device must complete the update before the driver callback returns. When the device driver wants to populate a range of virtual addresses, it can use:: - long hmm_range_fault(struct hmm_range *range, unsigned int flags); + long hmm_range_fault(struct hmm_range *range); -With the HMM_RANGE_SNAPSHOT flag, it will only fetch present CPU page table -entries and will not trigger a page fault on missing or non-present entries. -Without that flag, it does trigger a page fault on missing or read-only entries -if write access is requested (see below). Page faults use the generic mm page -fault code path just like a CPU page fault. +It will trigger a page fault on missing or read-only entries if write access is +requested (see below). Page faults use the generic mm page fault code path just +like a CPU page fault. Both functions copy CPU page table entries into their pfns array argument. Each entry in that array corresponds to an address in the virtual range. 
HMM @@ -197,7 +195,7 @@ The usage pattern is:: again: range.notifier_seq = mmu_interval_read_begin(&interval_sub); down_read(&mm->mmap_sem); - ret = hmm_range_fault(&range, HMM_RANGE_SNAPSHOT); + ret = hmm_range_fault(&range); if (ret) { up_read(&mm->mmap_sem); if (ret == -EBUSY) diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 79b1202b1c62..f44f6b27950f 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -563,6 +563,7 @@ kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start, mig.end = end; mig.src = &src_pfn; mig.dst = &dst_pfn; + mig.src_owner = &kvmppc_uvmem_pgmap; mutex_lock(&kvm->arch.uvmem_lock); /* The requested page is already paged-out, nothing to do */ @@ -779,6 +780,8 @@ int kvmppc_uvmem_init(void) kvmppc_uvmem_pgmap.type = MEMORY_DEVICE_PRIVATE; kvmppc_uvmem_pgmap.res = *res; kvmppc_uvmem_pgmap.ops = &kvmppc_uvmem_ops; + /* just one global instance: */ + kvmppc_uvmem_pgmap.owner = &kvmppc_uvmem_pgmap; addr = memremap_pages(&kvmppc_uvmem_pgmap, NUMA_NO_NODE); if (IS_ERR(addr)) { ret = PTR_ERR(addr); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 9f44ba7d9d97..6309ff72bd78 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -770,7 +770,6 @@ struct amdgpu_ttm_tt { static const uint64_t hmm_range_flags[HMM_PFN_FLAG_MAX] = { (1 << 0), /* HMM_PFN_VALID */ (1 << 1), /* HMM_PFN_WRITE */ - 0 /* HMM_PFN_DEVICE_PRIVATE */ }; static const uint64_t hmm_range_values[HMM_PFN_VALUE_MAX] = { @@ -851,7 +850,7 @@ retry: range->notifier_seq = mmu_interval_read_begin(&bo->notifier); down_read(&mm->mmap_sem); - r = hmm_range_fault(range, 0); + r = hmm_range_fault(range); up_read(&mm->mmap_sem); if (unlikely(r <= 0)) { /* diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 0ad5d87b5a8e..ad89e09a0be3 100644 --- 
a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -28,6 +28,7 @@ #include <nvif/class.h> #include <nvif/object.h> +#include <nvif/if000c.h> #include <nvif/if500b.h> #include <nvif/if900b.h> @@ -176,6 +177,7 @@ static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf) .end = vmf->address + PAGE_SIZE, .src = &src, .dst = &dst, + .src_owner = drm->dev, }; /* @@ -526,6 +528,7 @@ nouveau_dmem_init(struct nouveau_drm *drm) drm->dmem->pagemap.type = MEMORY_DEVICE_PRIVATE; drm->dmem->pagemap.res = *res; drm->dmem->pagemap.ops = &nouveau_dmem_pagemap_ops; + drm->dmem->pagemap.owner = drm->dev; if (IS_ERR(devm_memremap_pages(device, &drm->dmem->pagemap))) goto out_free; @@ -669,12 +672,6 @@ out: return ret; } -static inline bool -nouveau_dmem_page(struct nouveau_drm *drm, struct page *page) -{ - return is_device_private_page(page) && drm->dmem == page_to_dmem(page); -} - void nouveau_dmem_convert_pfn(struct nouveau_drm *drm, struct hmm_range *range) @@ -690,18 +687,12 @@ nouveau_dmem_convert_pfn(struct nouveau_drm *drm, if (page == NULL) continue; - if (!(range->pfns[i] & range->flags[HMM_PFN_DEVICE_PRIVATE])) { + if (!is_device_private_page(page)) continue; - } - - if (!nouveau_dmem_page(drm, page)) { - WARN(1, "Some unknown device memory !\n"); - range->pfns[i] = 0; - continue; - } addr = nouveau_dmem_page_addr(page); range->pfns[i] &= ((1UL << range->pfn_shift) - 1); range->pfns[i] |= (addr >> PAGE_SHIFT) << range->pfn_shift; + range->pfns[i] |= NVIF_VMM_PFNMAP_V0_VRAM; } } diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index df9bf1fd1bc0..e3797b2d4d17 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -367,7 +367,6 @@ static const u64 nouveau_svm_pfn_flags[HMM_PFN_FLAG_MAX] = { [HMM_PFN_VALID ] = NVIF_VMM_PFNMAP_V0_V, [HMM_PFN_WRITE ] = NVIF_VMM_PFNMAP_V0_W, - [HMM_PFN_DEVICE_PRIVATE] = NVIF_VMM_PFNMAP_V0_VRAM, }; static const u64 @@ 
-541,7 +540,7 @@ static int nouveau_range_fault(struct nouveau_svmm *svmm, range.default_flags = 0; range.pfn_flags_mask = -1UL; down_read(&mm->mmap_sem); - ret = hmm_range_fault(&range, 0); + ret = hmm_range_fault(&range); up_read(&mm->mmap_sem); if (ret <= 0) { if (ret == 0 || ret == -EBUSY) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index ddf9f7144c43..7475051100c7 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -3,58 +3,8 @@ * Copyright 2013 Red Hat Inc. * * Authors: Jérôme Glisse <jglisse@redhat.com> - */ -/* - * Heterogeneous Memory Management (HMM) - * - * See Documentation/vm/hmm.rst for reasons and overview of what HMM is and it - * is for. Here we focus on the HMM API description, with some explanation of - * the underlying implementation. - * - * Short description: HMM provides a set of helpers to share a virtual address - * space between CPU and a device, so that the device can access any valid - * address of the process (while still obeying memory protection). HMM also - * provides helpers to migrate process memory to device memory, and back. Each - * set of functionality (address space mirroring, and migration to and from - * device memory) can be used independently of the other. - * - * - * HMM address space mirroring API: - * - * Use HMM address space mirroring if you want to mirror a range of the CPU - * page tables of a process into a device page table. Here, "mirror" means "keep - * synchronized". Prerequisites: the device must provide the ability to write- - * protect its page tables (at PAGE_SIZE granularity), and must be able to - * recover from the resulting potential page faults. 
- * - * HMM guarantees that at any point in time, a given virtual address points to - * either the same memory in both CPU and device page tables (that is: CPU and - * device page tables each point to the same pages), or that one page table (CPU - * or device) points to no entry, while the other still points to the old page - * for the address. The latter case happens when the CPU page table update - * happens first, and then the update is mirrored over to the device page table. - * This does not cause any issue, because the CPU page table cannot start - * pointing to a new page until the device page table is invalidated. - * - * HMM uses mmu_notifiers to monitor the CPU page tables, and forwards any - * updates to each device driver that has registered a mirror. It also provides - * some API calls to help with taking a snapshot of the CPU page table, and to - * synchronize with any updates that might happen concurrently. * - * - * HMM migration to and from device memory: - * - * HMM provides a set of helpers to hotplug device memory as ZONE_DEVICE, with - * a new MEMORY_DEVICE_PRIVATE type. This provides a struct page for each page - * of the device memory, and allows the device driver to manage its memory - * using those struct pages. Having struct pages for device memory makes - * migration easier. Because that memory is not addressable by the CPU it must - * never be pinned to the device; in other words, any CPU page fault can always - * cause the device memory to be migrated (copied/moved) back to regular memory. - * - * A new migrate helper (migrate_vma()) has been added (see mm/migrate.c) that - * allows use of a device DMA engine to perform the copy operation between - * regular system memory and device memory. + * See Documentation/vm/hmm.rst for reasons and overview of what HMM is. */ #ifndef LINUX_HMM_H #define LINUX_HMM_H @@ -74,7 +24,6 @@ * Flags: * HMM_PFN_VALID: pfn is valid. It has, at least, read permission. 
* HMM_PFN_WRITE: CPU page table has write permission set - * HMM_PFN_DEVICE_PRIVATE: private device memory (ZONE_DEVICE) * * The driver provides a flags array for mapping page protections to device * PTE bits. If the driver valid bit for an entry is bit 3, @@ -86,7 +35,6 @@ enum hmm_pfn_flag_e { HMM_PFN_VALID = 0, HMM_PFN_WRITE, - HMM_PFN_DEVICE_PRIVATE, HMM_PFN_FLAG_MAX }; @@ -122,9 +70,6 @@ enum hmm_pfn_value_e { * * @notifier: a mmu_interval_notifier that includes the start/end * @notifier_seq: result of mmu_interval_read_begin() - * @hmm: the core HMM structure this range is active against - * @vma: the vm area struct for the range - * @list: all range lock are on a list * @start: range virtual start address (inclusive) * @end: range virtual end address (exclusive) * @pfns: array of pfns (big enough for the range) @@ -132,8 +77,8 @@ enum hmm_pfn_value_e { * @values: pfn value for some special case (none, special, error, ...) * @default_flags: default flags for the range (write, read, ... 
see hmm doc) * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter - * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT) - * @valid: pfns array did not change since it has been fill by an HMM function + * @pfn_shift: pfn shift value (should be <= PAGE_SHIFT) + * @dev_private_owner: owner of device private pages */ struct hmm_range { struct mmu_interval_notifier *notifier; @@ -146,6 +91,7 @@ struct hmm_range { uint64_t default_flags; uint64_t pfn_flags_mask; uint8_t pfn_shift; + void *dev_private_owner; }; /* @@ -172,70 +118,9 @@ static inline struct page *hmm_device_entry_to_page(const struct hmm_range *rang } /* - * hmm_device_entry_to_pfn() - return pfn value store in a device entry - * @range: range use to decode device entry value - * @entry: device entry to extract pfn from - * Return: pfn value if device entry is valid, -1UL otherwise - */ -static inline unsigned long -hmm_device_entry_to_pfn(const struct hmm_range *range, uint64_t pfn) -{ - if (pfn == range->values[HMM_PFN_NONE]) - return -1UL; - if (pfn == range->values[HMM_PFN_ERROR]) - return -1UL; - if (pfn == range->values[HMM_PFN_SPECIAL]) - return -1UL; - if (!(pfn & range->flags[HMM_PFN_VALID])) - return -1UL; - return (pfn >> range->pfn_shift); -} - -/* - * hmm_device_entry_from_page() - create a valid device entry for a page - * @range: range use to encode HMM pfn value - * @page: page for which to create the device entry - * Return: valid device entry for the page - */ -static inline uint64_t hmm_device_entry_from_page(const struct hmm_range *range, - struct page *page) -{ - return (page_to_pfn(page) << range->pfn_shift) | - range->flags[HMM_PFN_VALID]; -} - -/* - * hmm_device_entry_from_pfn() - create a valid device entry value from pfn - * @range: range use to encode HMM pfn value - * @pfn: pfn value for which to create the device entry - * Return: valid device entry for the pfn - */ -static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range, - 
unsigned long pfn) -{ - return (pfn << range->pfn_shift) | - range->flags[HMM_PFN_VALID]; -} - -/* - * Retry fault if non-blocking, drop mmap_sem and return -EAGAIN in that case. - */ -#define HMM_FAULT_ALLOW_RETRY (1 << 0) - -/* Don't fault in missing PTEs, just snapshot the current state. */ -#define HMM_FAULT_SNAPSHOT (1 << 1) - -#ifdef CONFIG_HMM_MIRROR -/* * Please see Documentation/vm/hmm.rst for how to use the range API. */ -long hmm_range_fault(struct hmm_range *range, unsigned int flags); -#else -static inline long hmm_range_fault(struct hmm_range *range, unsigned int flags) -{ - return -EOPNOTSUPP; -} -#endif +long hmm_range_fault(struct hmm_range *range); /* * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 6fefb09af7c3..60d97e8fd3c0 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -103,6 +103,9 @@ struct dev_pagemap_ops { * @type: memory type: see MEMORY_* in memory_hotplug.h * @flags: PGMAP_* flags to specify defailed behavior * @ops: method table + * @owner: an opaque pointer identifying the entity that manages this + * instance. Used by various helpers to make sure that no + * foreign ZONE_DEVICE memory is accessed. */ struct dev_pagemap { struct vmem_altmap altmap; @@ -113,6 +116,7 @@ struct dev_pagemap { enum memory_type type; unsigned int flags; const struct dev_pagemap_ops *ops; + void *owner; }; static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 72120061b7d4..3e546cbf03dd 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -196,6 +196,14 @@ struct migrate_vma { unsigned long npages; unsigned long start; unsigned long end; + + /* + * Set to the owner value also stored in page->pgmap->owner for + * migrating out of device private memory. If set only device + * private pages with this owner are migrated. 
If not set + * device private pages are not migrated at all. + */ + void *src_owner; }; int migrate_vma_setup(struct migrate_vma *args); @@ -28,41 +28,25 @@ struct hmm_vma_walk { struct hmm_range *range; - struct dev_pagemap *pgmap; unsigned long last; - unsigned int flags; }; -static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, - bool write_fault, uint64_t *pfn) -{ - unsigned int flags = FAULT_FLAG_REMOTE; - struct hmm_vma_walk *hmm_vma_walk = walk->private; - struct hmm_range *range = hmm_vma_walk->range; - struct vm_area_struct *vma = walk->vma; - vm_fault_t ret; - - if (!vma) - goto err; - - if (hmm_vma_walk->flags & HMM_FAULT_ALLOW_RETRY) - flags |= FAULT_FLAG_ALLOW_RETRY; - if (write_fault) - flags |= FAULT_FLAG_WRITE; - - ret = handle_mm_fault(vma, addr, flags); - if (ret & VM_FAULT_RETRY) { - /* Note, handle_mm_fault did up_read(&mm->mmap_sem)) */ - return -EAGAIN; - } - if (ret & VM_FAULT_ERROR) - goto err; - - return -EBUSY; +enum { + HMM_NEED_FAULT = 1 << 0, + HMM_NEED_WRITE_FAULT = 1 << 1, + HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT, +}; -err: - *pfn = range->values[HMM_PFN_ERROR]; - return -EFAULT; +/* + * hmm_device_entry_from_pfn() - create a valid device entry value from pfn + * @range: range use to encode HMM pfn value + * @pfn: pfn value for which to create the device entry + * Return: valid device entry for the pfn + */ +static uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range, + unsigned long pfn) +{ + return (pfn << range->pfn_shift) | range->flags[HMM_PFN_VALID]; } static int hmm_pfns_fill(unsigned long addr, unsigned long end, @@ -79,56 +63,43 @@ static int hmm_pfns_fill(unsigned long addr, unsigned long end, } /* - * hmm_vma_walk_hole_() - handle a range lacking valid pmd or pte(s) + * hmm_vma_fault() - fault in a range lacking valid pmd or pte(s) * @addr: range virtual start address (inclusive) * @end: range virtual end address (exclusive) - * @fault: should we fault or not ? 
- * @write_fault: write fault ? + * @required_fault: HMM_NEED_* flags * @walk: mm_walk structure - * Return: 0 on success, -EBUSY after page fault, or page fault error + * Return: -EBUSY after page fault, or page fault error * * This function will be called whenever pmd_none() or pte_none() returns true, * or whenever there is no page directory covering the virtual address range. */ -static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, - bool fault, bool write_fault, - struct mm_walk *walk) +static int hmm_vma_fault(unsigned long addr, unsigned long end, + unsigned int required_fault, struct mm_walk *walk) { struct hmm_vma_walk *hmm_vma_walk = walk->private; - struct hmm_range *range = hmm_vma_walk->range; - uint64_t *pfns = range->pfns; - unsigned long i; + struct vm_area_struct *vma = walk->vma; + unsigned int fault_flags = FAULT_FLAG_REMOTE; + WARN_ON_ONCE(!required_fault); hmm_vma_walk->last = addr; - i = (addr - range->start) >> PAGE_SHIFT; - - if (write_fault && walk->vma && !(walk->vma->vm_flags & VM_WRITE)) - return -EPERM; - - for (; addr < end; addr += PAGE_SIZE, i++) { - pfns[i] = range->values[HMM_PFN_NONE]; - if (fault || write_fault) { - int ret; - ret = hmm_vma_do_fault(walk, addr, write_fault, - &pfns[i]); - if (ret != -EBUSY) - return ret; - } + if (required_fault & HMM_NEED_WRITE_FAULT) { + if (!(vma->vm_flags & VM_WRITE)) + return -EPERM; + fault_flags |= FAULT_FLAG_WRITE; } - return (fault || write_fault) ? 
-EBUSY : 0; + for (; addr < end; addr += PAGE_SIZE) + if (handle_mm_fault(vma, addr, fault_flags) & VM_FAULT_ERROR) + return -EFAULT; + return -EBUSY; } -static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, - uint64_t pfns, uint64_t cpu_flags, - bool *fault, bool *write_fault) +static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, + uint64_t pfns, uint64_t cpu_flags) { struct hmm_range *range = hmm_vma_walk->range; - if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) - return; - /* * So we not only consider the individual per page request we also * consider the default flags requested for the range. The API can @@ -143,46 +114,44 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, /* We aren't ask to do anything ... */ if (!(pfns & range->flags[HMM_PFN_VALID])) - return; - /* If this is device memory then only fault if explicitly requested */ - if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) { - /* Do we fault on device memory ? */ - if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) { - *write_fault = pfns & range->flags[HMM_PFN_WRITE]; - *fault = true; - } - return; - } + return 0; - /* If CPU page table is not valid then we need to fault */ - *fault = !(cpu_flags & range->flags[HMM_PFN_VALID]); /* Need to write fault ? 
*/ if ((pfns & range->flags[HMM_PFN_WRITE]) && - !(cpu_flags & range->flags[HMM_PFN_WRITE])) { - *write_fault = true; - *fault = true; - } + !(cpu_flags & range->flags[HMM_PFN_WRITE])) + return HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT; + + /* If CPU page table is not valid then we need to fault */ + if (!(cpu_flags & range->flags[HMM_PFN_VALID])) + return HMM_NEED_FAULT; + return 0; } -static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, - const uint64_t *pfns, unsigned long npages, - uint64_t cpu_flags, bool *fault, - bool *write_fault) +static unsigned int +hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, + const uint64_t *pfns, unsigned long npages, + uint64_t cpu_flags) { + struct hmm_range *range = hmm_vma_walk->range; + unsigned int required_fault = 0; unsigned long i; - if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) { - *fault = *write_fault = false; - return; - } + /* + * If the default flags do not request to fault pages, and the mask does + * not allow for individual pages to be faulted, then + * hmm_pte_need_fault() will always return 0. 
+ */ + if (!((range->default_flags | range->pfn_flags_mask) & + range->flags[HMM_PFN_VALID])) + return 0; - *fault = *write_fault = false; for (i = 0; i < npages; ++i) { - hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags, - fault, write_fault); - if ((*write_fault)) - return; + required_fault |= + hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags); + if (required_fault == HMM_NEED_ALL_BITS) + return required_fault; } + return required_fault; } static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, @@ -190,16 +159,23 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - bool fault, write_fault; + unsigned int required_fault; unsigned long i, npages; uint64_t *pfns; i = (addr - range->start) >> PAGE_SHIFT; npages = (end - addr) >> PAGE_SHIFT; pfns = &range->pfns[i]; - hmm_range_need_fault(hmm_vma_walk, pfns, npages, - 0, &fault, &write_fault); - return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); + required_fault = hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0); + if (!walk->vma) { + if (required_fault) + return -EFAULT; + return hmm_pfns_fill(addr, end, range, HMM_PFN_ERROR); + } + if (required_fault) + return hmm_vma_fault(addr, end, required_fault, walk); + hmm_vma_walk->last = addr; + return hmm_pfns_fill(addr, end, range, HMM_PFN_NONE); } static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) @@ -218,31 +194,19 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; unsigned long pfn, npages, i; - bool fault, write_fault; + unsigned int required_fault; uint64_t cpu_flags; npages = (end - addr) >> PAGE_SHIFT; cpu_flags = pmd_to_hmm_pfn_flags(range, pmd); - hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags, - &fault, &write_fault); - - if (pmd_protnone(pmd) || 
fault || write_fault) - return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); + required_fault = + hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags); + if (required_fault) + return hmm_vma_fault(addr, end, required_fault, walk); pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) { - if (pmd_devmap(pmd)) { - hmm_vma_walk->pgmap = get_dev_pagemap(pfn, - hmm_vma_walk->pgmap); - if (unlikely(!hmm_vma_walk->pgmap)) - return -EBUSY; - } + for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags; - } - if (hmm_vma_walk->pgmap) { - put_dev_pagemap(hmm_vma_walk->pgmap); - hmm_vma_walk->pgmap = NULL; - } hmm_vma_walk->last = end; return 0; } @@ -252,6 +216,14 @@ int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, unsigned long end, uint64_t *pfns, pmd_t pmd); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline bool hmm_is_device_private_entry(struct hmm_range *range, + swp_entry_t entry) +{ + return is_device_private_entry(entry) && + device_private_entry_to_page(entry)->pgmap->owner == + range->dev_private_owner; +} + static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) { if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte)) @@ -267,102 +239,81 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - bool fault, write_fault; + unsigned int required_fault; uint64_t cpu_flags; pte_t pte = *ptep; uint64_t orig_pfn = *pfn; - *pfn = range->values[HMM_PFN_NONE]; - fault = write_fault = false; - if (pte_none(pte)) { - hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0, - &fault, &write_fault); - if (fault || write_fault) + required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0); + if (required_fault) goto fault; + *pfn = range->values[HMM_PFN_NONE]; return 0; } if 
(!pte_present(pte)) { swp_entry_t entry = pte_to_swp_entry(pte); - if (!non_swap_entry(entry)) { - cpu_flags = pte_to_hmm_pfn_flags(range, pte); - hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, - &fault, &write_fault); - if (fault || write_fault) - goto fault; - return 0; - } - /* - * This is a special swap entry, ignore migration, use - * device and report anything else as error. + * Never fault in device private pages pages, but just report + * the PFN even if not present. */ - if (is_device_private_entry(entry)) { - cpu_flags = range->flags[HMM_PFN_VALID] | - range->flags[HMM_PFN_DEVICE_PRIVATE]; - cpu_flags |= is_write_device_private_entry(entry) ? - range->flags[HMM_PFN_WRITE] : 0; - hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, - &fault, &write_fault); - if (fault || write_fault) - goto fault; + if (hmm_is_device_private_entry(range, entry)) { *pfn = hmm_device_entry_from_pfn(range, - swp_offset(entry)); - *pfn |= cpu_flags; + device_private_entry_to_pfn(entry)); + *pfn |= range->flags[HMM_PFN_VALID]; + if (is_write_device_private_entry(entry)) + *pfn |= range->flags[HMM_PFN_WRITE]; return 0; } - if (is_migration_entry(entry)) { - if (fault || write_fault) { - pte_unmap(ptep); - hmm_vma_walk->last = addr; - migration_entry_wait(walk->mm, pmdp, addr); - return -EBUSY; - } + required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0); + if (!required_fault) { + *pfn = range->values[HMM_PFN_NONE]; return 0; } + if (!non_swap_entry(entry)) + goto fault; + + if (is_migration_entry(entry)) { + pte_unmap(ptep); + hmm_vma_walk->last = addr; + migration_entry_wait(walk->mm, pmdp, addr); + return -EBUSY; + } + /* Report error for everything else */ - *pfn = range->values[HMM_PFN_ERROR]; + pte_unmap(ptep); return -EFAULT; - } else { - cpu_flags = pte_to_hmm_pfn_flags(range, pte); - hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, - &fault, &write_fault); } - if (fault || write_fault) + cpu_flags = pte_to_hmm_pfn_flags(range, pte); + required_fault 
= hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags); + if (required_fault) goto fault; - if (pte_devmap(pte)) { - hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte), - hmm_vma_walk->pgmap); - if (unlikely(!hmm_vma_walk->pgmap)) - return -EBUSY; - } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) { - if (!is_zero_pfn(pte_pfn(pte))) { - *pfn = range->values[HMM_PFN_SPECIAL]; + /* + * Since each architecture defines a struct page for the zero page, just + * fall through and treat it like a normal page. + */ + if (pte_special(pte) && !is_zero_pfn(pte_pfn(pte))) { + if (hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0)) { + pte_unmap(ptep); return -EFAULT; } - /* - * Since each architecture defines a struct page for the zero - * page, just fall through and treat it like a normal page. - */ + *pfn = range->values[HMM_PFN_SPECIAL]; + return 0; } *pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags; return 0; fault: - if (hmm_vma_walk->pgmap) { - put_dev_pagemap(hmm_vma_walk->pgmap); - hmm_vma_walk->pgmap = NULL; - } pte_unmap(ptep); /* Fault any virtual address we were asked to fault */ - return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); + return hmm_vma_fault(addr, end, required_fault, walk); } static int hmm_vma_walk_pmd(pmd_t *pmdp, @@ -372,8 +323,9 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - uint64_t *pfns = range->pfns; - unsigned long addr = start, i; + uint64_t *pfns = &range->pfns[(start - range->start) >> PAGE_SHIFT]; + unsigned long npages = (end - start) >> PAGE_SHIFT; + unsigned long addr = start; pte_t *ptep; pmd_t pmd; @@ -383,24 +335,19 @@ again: return hmm_vma_walk_hole(start, end, -1, walk); if (thp_migration_supported() && is_pmd_migration_entry(pmd)) { - bool fault, write_fault; - unsigned long npages; - uint64_t *pfns; - - i = (addr - range->start) >> PAGE_SHIFT; - npages = (end - addr) >> PAGE_SHIFT; - 
pfns = &range->pfns[i]; - - hmm_range_need_fault(hmm_vma_walk, pfns, npages, - 0, &fault, &write_fault); - if (fault || write_fault) { + if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0)) { hmm_vma_walk->last = addr; pmd_migration_entry_wait(walk->mm, pmdp); return -EBUSY; } - return 0; - } else if (!pmd_present(pmd)) + return hmm_pfns_fill(start, end, range, HMM_PFN_NONE); + } + + if (!pmd_present(pmd)) { + if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0)) + return -EFAULT; return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); + } if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) { /* @@ -417,8 +364,7 @@ again: if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) goto again; - i = (addr - range->start) >> PAGE_SHIFT; - return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd); + return hmm_vma_handle_pmd(walk, addr, end, pfns, pmd); } /* @@ -427,31 +373,23 @@ again: * entry pointing to pte directory or it is a bad pmd that will not * recover. */ - if (pmd_bad(pmd)) + if (pmd_bad(pmd)) { + if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0)) + return -EFAULT; return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); + } ptep = pte_offset_map(pmdp, addr); - i = (addr - range->start) >> PAGE_SHIFT; - for (; addr < end; addr += PAGE_SIZE, ptep++, i++) { + for (; addr < end; addr += PAGE_SIZE, ptep++, pfns++) { int r; - r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]); + r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, pfns); if (r) { - /* hmm_vma_handle_pte() did unmap pte directory */ + /* hmm_vma_handle_pte() did pte_unmap() */ hmm_vma_walk->last = addr; return r; } } - if (hmm_vma_walk->pgmap) { - /* - * We do put_dev_pagemap() here and not in hmm_vma_handle_pte() - * so that we can leverage get_dev_pagemap() optimization which - * will not re-take a reference on a pgmap if we already have - * one. 
- */ - put_dev_pagemap(hmm_vma_walk->pgmap); - hmm_vma_walk->pgmap = NULL; - } pte_unmap(ptep - 1); hmm_vma_walk->last = addr; @@ -487,18 +425,18 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, pud = READ_ONCE(*pudp); if (pud_none(pud)) { - ret = hmm_vma_walk_hole(start, end, -1, walk); - goto out_unlock; + spin_unlock(ptl); + return hmm_vma_walk_hole(start, end, -1, walk); } if (pud_huge(pud) && pud_devmap(pud)) { unsigned long i, npages, pfn; + unsigned int required_fault; uint64_t *pfns, cpu_flags; - bool fault, write_fault; if (!pud_present(pud)) { - ret = hmm_vma_walk_hole(start, end, -1, walk); - goto out_unlock; + spin_unlock(ptl); + return hmm_vma_walk_hole(start, end, -1, walk); } i = (addr - range->start) >> PAGE_SHIFT; @@ -506,29 +444,17 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, pfns = &range->pfns[i]; cpu_flags = pud_to_hmm_pfn_flags(range, pud); - hmm_range_need_fault(hmm_vma_walk, pfns, npages, - cpu_flags, &fault, &write_fault); - if (fault || write_fault) { - ret = hmm_vma_walk_hole_(addr, end, fault, - write_fault, walk); - goto out_unlock; + required_fault = hmm_range_need_fault(hmm_vma_walk, pfns, + npages, cpu_flags); + if (required_fault) { + spin_unlock(ptl); + return hmm_vma_fault(addr, end, required_fault, walk); } pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - for (i = 0; i < npages; ++i, ++pfn) { - hmm_vma_walk->pgmap = get_dev_pagemap(pfn, - hmm_vma_walk->pgmap); - if (unlikely(!hmm_vma_walk->pgmap)) { - ret = -EBUSY; - goto out_unlock; - } + for (i = 0; i < npages; ++i, ++pfn) pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags; - } - if (hmm_vma_walk->pgmap) { - put_dev_pagemap(hmm_vma_walk->pgmap); - hmm_vma_walk->pgmap = NULL; - } hmm_vma_walk->last = end; goto out_unlock; } @@ -554,24 +480,20 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma 
= walk->vma; uint64_t orig_pfn, cpu_flags; - bool fault, write_fault; + unsigned int required_fault; spinlock_t *ptl; pte_t entry; - int ret = 0; ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte); entry = huge_ptep_get(pte); i = (start - range->start) >> PAGE_SHIFT; orig_pfn = range->pfns[i]; - range->pfns[i] = range->values[HMM_PFN_NONE]; cpu_flags = pte_to_hmm_pfn_flags(range, entry); - fault = write_fault = false; - hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, - &fault, &write_fault); - if (fault || write_fault) { - ret = -ENOENT; - goto unlock; + required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags); + if (required_fault) { + spin_unlock(ptl); + return hmm_vma_fault(addr, end, required_fault, walk); } pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT); @@ -579,14 +501,8 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags; hmm_vma_walk->last = end; - -unlock: spin_unlock(ptl); - - if (ret == -ENOENT) - return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); - - return ret; + return 0; } #else #define hmm_vma_walk_hugetlb_entry NULL @@ -599,40 +515,32 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end, struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; - /* - * Skip vma ranges that don't have struct page backing them or - * map I/O devices directly. - */ - if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) - return -EFAULT; + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) && + vma->vm_flags & VM_READ) + return 0; /* + * vma ranges that don't have struct page backing them or map I/O + * devices directly cannot be handled by hmm_range_fault(). + * * If the vma does not allow read access, then assume that it does not - * allow write access either. HMM does not support architectures - * that allow write without read. + * allow write access either. 
HMM does not support architectures that + * allow write without read. + * + * If a fault is requested for an unsupported range then it is a hard + * failure. */ - if (!(vma->vm_flags & VM_READ)) { - bool fault, write_fault; - - /* - * Check to see if a fault is requested for any page in the - * range. - */ - hmm_range_need_fault(hmm_vma_walk, range->pfns + - ((start - range->start) >> PAGE_SHIFT), - (end - start) >> PAGE_SHIFT, - 0, &fault, &write_fault); - if (fault || write_fault) - return -EFAULT; - - hmm_pfns_fill(start, end, range, HMM_PFN_NONE); - hmm_vma_walk->last = end; + if (hmm_range_need_fault(hmm_vma_walk, + range->pfns + + ((start - range->start) >> PAGE_SHIFT), + (end - start) >> PAGE_SHIFT, 0)) + return -EFAULT; - /* Skip this vma and continue processing the next vma. */ - return 1; - } + hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); + hmm_vma_walk->last = end; - return 0; + /* Skip this vma and continue processing the next vma. */ + return 1; } static const struct mm_walk_ops hmm_walk_ops = { @@ -645,8 +553,7 @@ static const struct mm_walk_ops hmm_walk_ops = { /** * hmm_range_fault - try to fault some address in a virtual address range - * @range: range being faulted - * @flags: HMM_FAULT_* flags + * @range: argument structure * * Return: the number of valid pages in range->pfns[] (from range start * address), which may be zero. On error one of the following status codes @@ -657,26 +564,19 @@ static const struct mm_walk_ops hmm_walk_ops = { * -ENOMEM: Out of memory. * -EPERM: Invalid permission (e.g., asking for write and range is read * only). - * -EAGAIN: A page fault needs to be retried and mmap_sem was dropped. * -EBUSY: The range has been invalidated and the caller needs to wait for * the invalidation to finish. - * -EFAULT: Invalid (i.e., either no valid vma or it is illegal to access - * that range) number of valid pages in range->pfns[] (from - * range start address). 
- * - * This is similar to a regular CPU page fault except that it will not trigger - * any memory migration if the memory being faulted is not accessible by CPUs - * and caller does not ask for migration. + * -EFAULT: A page was requested to be valid and could not be made valid + * ie it has no backing VMA or it is illegal to access * - * On error, for one virtual address in the range, the function will mark the - * corresponding HMM pfn entry with an error flag. + * This is similar to get_user_pages(), except that it can read the page tables + * without mutating them (ie causing faults). */ -long hmm_range_fault(struct hmm_range *range, unsigned int flags) +long hmm_range_fault(struct hmm_range *range) { struct hmm_vma_walk hmm_vma_walk = { .range = range, .last = range->start, - .flags = flags, }; struct mm_struct *mm = range->notifier->mm; int ret; diff --git a/mm/memremap.c b/mm/memremap.c index 09b5b7adc773..9b2c97ceb775 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -181,6 +181,10 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) WARN(1, "Missing migrate_to_ram method\n"); return ERR_PTR(-EINVAL); } + if (!pgmap->owner) { + WARN(1, "Missing owner\n"); + return ERR_PTR(-EINVAL); + } break; case MEMORY_DEVICE_FS_DAX: if (!IS_ENABLED(CONFIG_ZONE_DEVICE) || diff --git a/mm/migrate.c b/mm/migrate.c index b1092876e537..7605d2c23433 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2241,7 +2241,7 @@ again: arch_enter_lazy_mmu_mode(); for (; addr < end; addr += PAGE_SIZE, ptep++) { - unsigned long mpfn, pfn; + unsigned long mpfn = 0, pfn; struct page *page; swp_entry_t entry; pte_t pte; @@ -2255,8 +2255,6 @@ again: } if (!pte_present(pte)) { - mpfn = 0; - /* * Only care about unaddressable device page special * page table entry. 
Other special swap entries are not @@ -2267,11 +2265,16 @@ again: goto next; page = device_private_entry_to_page(entry); + if (page->pgmap->owner != migrate->src_owner) + goto next; + mpfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE; if (is_write_device_private_entry(entry)) mpfn |= MIGRATE_PFN_WRITE; } else { + if (migrate->src_owner) + goto next; pfn = pte_pfn(pte); if (is_zero_pfn(pfn)) { mpfn = MIGRATE_PFN_MIGRATE; |