summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-10-14 21:28:43 +0200
committerLinus Torvalds <torvalds@linux-foundation.org>2022-10-14 21:28:43 +0200
commit5e714bf1713b4b096d20ec75c13880b7086964bd (patch)
tree7742ba7cc03302f59fefe54bc105dc347a57803e
parentMerge tag 'parisc-for-6.1-1' of git://git.kernel.org/pub/scm/linux/kernel/git... (diff)
parenthighmem: fix kmap_to_page() for kmap_local_page() addresses (diff)
downloadlinux-5e714bf1713b4b096d20ec75c13880b7086964bd.tar.xz
linux-5e714bf1713b4b096d20ec75c13880b7086964bd.zip
Merge tag 'mm-stable-2022-10-13' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull more MM updates from Andrew Morton: - fix a race which causes page refcounting errors in ZONE_DEVICE pages (Alistair Popple) - fix userfaultfd test harness instability (Peter Xu) - various other patches in MM, mainly fixes * tag 'mm-stable-2022-10-13' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (29 commits) highmem: fix kmap_to_page() for kmap_local_page() addresses mm/page_alloc: fix incorrect PGFREE and PGALLOC for high-order page mm/selftest: uffd: explain the write missing fault check mm/hugetlb: use hugetlb_pte_stable in migration race check mm/hugetlb: fix race condition of uffd missing/minor handling zram: always expose rw_page LoongArch: update local TLB if PTE entry exists mm: use update_mmu_tlb() on the second thread kasan: fix array-bounds warnings in tests hmm-tests: add test for migrate_device_range() nouveau/dmem: evict device private memory during release nouveau/dmem: refactor nouveau_dmem_fault_copy_one() mm/migrate_device.c: add migrate_device_range() mm/migrate_device.c: refactor migrate_vma and migrate_deivce_coherent_page() mm/memremap.c: take a pgmap reference on page allocation mm: free device private pages have zero refcount mm/memory.c: fix race when faulting a device private page mm/damon: use damon_sz_region() in appropriate place mm/damon: move sz_damon_region to damon_sz_region lib/test_meminit: add checks for the allocation functions ...
-rw-r--r--arch/loongarch/include/asm/pgtable.h3
-rw-r--r--arch/powerpc/kvm/book3s_hv_uvmem.c21
-rw-r--r--drivers/block/zram/zram_drv.c26
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_migrate.c19
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_migrate.h2
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_svm.c11
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_dmem.c108
-rw-r--r--fs/ext4/verity.c3
-rw-r--r--fs/f2fs/verity.c3
-rw-r--r--include/linux/damon.h6
-rw-r--r--include/linux/memremap.h1
-rw-r--r--include/linux/migrate.h15
-rw-r--r--include/linux/sched.h2
-rw-r--r--lib/test_hmm.c129
-rw-r--r--lib/test_hmm_uapi.h1
-rw-r--r--lib/test_meminit.c21
-rw-r--r--mm/compaction.c1
-rw-r--r--mm/damon/core.c26
-rw-r--r--mm/damon/vaddr.c4
-rw-r--r--mm/highmem.c43
-rw-r--r--mm/hugetlb.c72
-rw-r--r--mm/kasan/kasan_test.c9
-rw-r--r--mm/memory.c20
-rw-r--r--mm/memremap.c30
-rw-r--r--mm/migrate.c34
-rw-r--r--mm/migrate_device.c239
-rw-r--r--mm/mmap.c28
-rw-r--r--mm/mmu_gather.c10
-rw-r--r--mm/mprotect.c2
-rw-r--r--mm/page_alloc.c12
-rw-r--r--tools/testing/selftests/vm/hmm-tests.c49
-rw-r--r--tools/testing/selftests/vm/userfaultfd.c22
32 files changed, 719 insertions, 253 deletions
diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h
index 8ea57e2f0e04..946704bee599 100644
--- a/arch/loongarch/include/asm/pgtable.h
+++ b/arch/loongarch/include/asm/pgtable.h
@@ -412,6 +412,9 @@ static inline void update_mmu_cache(struct vm_area_struct *vma,
__update_tlb(vma, address, ptep);
}
+#define __HAVE_ARCH_UPDATE_MMU_TLB
+#define update_mmu_tlb update_mmu_cache
+
static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp)
{
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 598006301620..e2f11f9c3f2a 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -508,10 +508,10 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
static int __kvmppc_svm_page_out(struct vm_area_struct *vma,
unsigned long start,
unsigned long end, unsigned long page_shift,
- struct kvm *kvm, unsigned long gpa)
+ struct kvm *kvm, unsigned long gpa, struct page *fault_page)
{
unsigned long src_pfn, dst_pfn = 0;
- struct migrate_vma mig;
+ struct migrate_vma mig = { 0 };
struct page *dpage, *spage;
struct kvmppc_uvmem_page_pvt *pvt;
unsigned long pfn;
@@ -525,6 +525,7 @@ static int __kvmppc_svm_page_out(struct vm_area_struct *vma,
mig.dst = &dst_pfn;
mig.pgmap_owner = &kvmppc_uvmem_pgmap;
mig.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
+ mig.fault_page = fault_page;
/* The requested page is already paged-out, nothing to do */
if (!kvmppc_gfn_is_uvmem_pfn(gpa >> page_shift, kvm, NULL))
@@ -580,12 +581,14 @@ out_finalize:
static inline int kvmppc_svm_page_out(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
unsigned long page_shift,
- struct kvm *kvm, unsigned long gpa)
+ struct kvm *kvm, unsigned long gpa,
+ struct page *fault_page)
{
int ret;
mutex_lock(&kvm->arch.uvmem_lock);
- ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa);
+ ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa,
+ fault_page);
mutex_unlock(&kvm->arch.uvmem_lock);
return ret;
@@ -634,7 +637,7 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *slot,
pvt->remove_gfn = true;
if (__kvmppc_svm_page_out(vma, addr, addr + PAGE_SIZE,
- PAGE_SHIFT, kvm, pvt->gpa))
+ PAGE_SHIFT, kvm, pvt->gpa, NULL))
pr_err("Can't page out gpa:0x%lx addr:0x%lx\n",
pvt->gpa, addr);
} else {
@@ -715,7 +718,7 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
dpage = pfn_to_page(uvmem_pfn);
dpage->zone_device_data = pvt;
- lock_page(dpage);
+ zone_device_page_init(dpage);
return dpage;
out_clear:
spin_lock(&kvmppc_uvmem_bitmap_lock);
@@ -736,7 +739,7 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma,
bool pagein)
{
unsigned long src_pfn, dst_pfn = 0;
- struct migrate_vma mig;
+ struct migrate_vma mig = { 0 };
struct page *spage;
unsigned long pfn;
struct page *dpage;
@@ -994,7 +997,7 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct vm_fault *vmf)
if (kvmppc_svm_page_out(vmf->vma, vmf->address,
vmf->address + PAGE_SIZE, PAGE_SHIFT,
- pvt->kvm, pvt->gpa))
+ pvt->kvm, pvt->gpa, vmf->page))
return VM_FAULT_SIGBUS;
else
return 0;
@@ -1065,7 +1068,7 @@ kvmppc_h_svm_page_out(struct kvm *kvm, unsigned long gpa,
if (!vma || vma->vm_start > start || vma->vm_end < end)
goto out;
- if (!kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa))
+ if (!kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa, NULL))
ret = H_SUCCESS;
out:
mmap_read_unlock(kvm->mm);
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 7c74d8cba44f..966aab902d19 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -52,9 +52,6 @@ static unsigned int num_devices = 1;
static size_t huge_class_size;
static const struct block_device_operations zram_devops;
-#ifdef CONFIG_ZRAM_WRITEBACK
-static const struct block_device_operations zram_wb_devops;
-#endif
static void zram_free_page(struct zram *zram, size_t index);
static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
@@ -546,17 +543,6 @@ static ssize_t backing_dev_store(struct device *dev,
zram->backing_dev = backing_dev;
zram->bitmap = bitmap;
zram->nr_pages = nr_pages;
- /*
- * With writeback feature, zram does asynchronous IO so it's no longer
- * synchronous device so let's remove synchronous io flag. Othewise,
- * upper layer(e.g., swap) could wait IO completion rather than
- * (submit and return), which will cause system sluggish.
- * Furthermore, when the IO function returns(e.g., swap_readpage),
- * upper layer expects IO was done so it could deallocate the page
- * freely but in fact, IO is going on so finally could cause
- * use-after-free when the IO is really done.
- */
- zram->disk->fops = &zram_wb_devops;
up_write(&zram->init_lock);
pr_info("setup backing device %s\n", file_name);
@@ -1270,6 +1256,9 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
struct bio_vec bvec;
zram_slot_unlock(zram, index);
+ /* A null bio means rw_page was used, we must fallback to bio */
+ if (!bio)
+ return -EOPNOTSUPP;
bvec.bv_page = page;
bvec.bv_len = PAGE_SIZE;
@@ -1856,15 +1845,6 @@ static const struct block_device_operations zram_devops = {
.owner = THIS_MODULE
};
-#ifdef CONFIG_ZRAM_WRITEBACK
-static const struct block_device_operations zram_wb_devops = {
- .open = zram_open,
- .submit_bio = zram_submit_bio,
- .swap_slot_free_notify = zram_slot_free_notify,
- .owner = THIS_MODULE
-};
-#endif
-
static DEVICE_ATTR_WO(compact);
static DEVICE_ATTR_RW(disksize);
static DEVICE_ATTR_RO(initstate);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index c70c026c9a93..2797029bd500 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -223,7 +223,7 @@ svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn)
page = pfn_to_page(pfn);
svm_range_bo_ref(prange->svm_bo);
page->zone_device_data = prange->svm_bo;
- lock_page(page);
+ zone_device_page_init(page);
}
static void
@@ -410,7 +410,7 @@ svm_migrate_vma_to_vram(struct amdgpu_device *adev, struct svm_range *prange,
uint64_t npages = (end - start) >> PAGE_SHIFT;
struct kfd_process_device *pdd;
struct dma_fence *mfence = NULL;
- struct migrate_vma migrate;
+ struct migrate_vma migrate = { 0 };
unsigned long cpages = 0;
dma_addr_t *scratch;
void *buf;
@@ -666,7 +666,7 @@ out_oom:
static long
svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
struct vm_area_struct *vma, uint64_t start, uint64_t end,
- uint32_t trigger)
+ uint32_t trigger, struct page *fault_page)
{
struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms);
uint64_t npages = (end - start) >> PAGE_SHIFT;
@@ -674,7 +674,7 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
unsigned long cpages = 0;
struct kfd_process_device *pdd;
struct dma_fence *mfence = NULL;
- struct migrate_vma migrate;
+ struct migrate_vma migrate = { 0 };
dma_addr_t *scratch;
void *buf;
int r = -ENOMEM;
@@ -697,6 +697,7 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
migrate.src = buf;
migrate.dst = migrate.src + npages;
+ migrate.fault_page = fault_page;
scratch = (dma_addr_t *)(migrate.dst + npages);
kfd_smi_event_migration_start(adev->kfd.dev, p->lead_thread->pid,
@@ -764,7 +765,7 @@ out:
* 0 - OK, otherwise error code
*/
int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm,
- uint32_t trigger)
+ uint32_t trigger, struct page *fault_page)
{
struct amdgpu_device *adev;
struct vm_area_struct *vma;
@@ -805,7 +806,8 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm,
}
next = min(vma->vm_end, end);
- r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next, trigger);
+ r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next, trigger,
+ fault_page);
if (r < 0) {
pr_debug("failed %ld to migrate prange %p\n", r, prange);
break;
@@ -849,7 +851,7 @@ svm_migrate_vram_to_vram(struct svm_range *prange, uint32_t best_loc,
pr_debug("from gpu 0x%x to gpu 0x%x\n", prange->actual_loc, best_loc);
do {
- r = svm_migrate_vram_to_ram(prange, mm, trigger);
+ r = svm_migrate_vram_to_ram(prange, mm, trigger, NULL);
if (r)
return r;
} while (prange->actual_loc && --retries);
@@ -950,7 +952,8 @@ static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf)
}
r = svm_migrate_vram_to_ram(prange, vmf->vma->vm_mm,
- KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU);
+ KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU,
+ vmf->page);
if (r)
pr_debug("failed %d migrate svms 0x%p range 0x%p [0x%lx 0x%lx]\n",
r, prange->svms, prange, prange->start, prange->last);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
index b3f0754b32fa..a5d7e6d22264 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
@@ -43,7 +43,7 @@ enum MIGRATION_COPY_DIR {
int svm_migrate_to_vram(struct svm_range *prange, uint32_t best_loc,
struct mm_struct *mm, uint32_t trigger);
int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm,
- uint32_t trigger);
+ uint32_t trigger, struct page *fault_page);
unsigned long
svm_migrate_addr_to_pfn(struct amdgpu_device *adev, unsigned long addr);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index f5913ba22174..64fdf63093a0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -2913,13 +2913,15 @@ retry_write_locked:
*/
if (prange->actual_loc)
r = svm_migrate_vram_to_ram(prange, mm,
- KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU);
+ KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU,
+ NULL);
else
r = 0;
}
} else {
r = svm_migrate_vram_to_ram(prange, mm,
- KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU);
+ KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU,
+ NULL);
}
if (r) {
pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n",
@@ -3278,7 +3280,8 @@ svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange,
return 0;
if (!best_loc) {
- r = svm_migrate_vram_to_ram(prange, mm, KFD_MIGRATE_TRIGGER_PREFETCH);
+ r = svm_migrate_vram_to_ram(prange, mm,
+ KFD_MIGRATE_TRIGGER_PREFETCH, NULL);
*migrated = !r;
return r;
}
@@ -3339,7 +3342,7 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work)
mutex_lock(&prange->migrate_mutex);
do {
r = svm_migrate_vram_to_ram(prange, mm,
- KFD_MIGRATE_TRIGGER_TTM_EVICTION);
+ KFD_MIGRATE_TRIGGER_TTM_EVICTION, NULL);
} while (!r && prange->actual_loc && --retries);
if (!r && prange->actual_loc)
diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 16356611b5b9..5fe209107246 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -139,44 +139,24 @@ static void nouveau_dmem_fence_done(struct nouveau_fence **fence)
}
}
-static vm_fault_t nouveau_dmem_fault_copy_one(struct nouveau_drm *drm,
- struct vm_fault *vmf, struct migrate_vma *args,
- dma_addr_t *dma_addr)
+static int nouveau_dmem_copy_one(struct nouveau_drm *drm, struct page *spage,
+ struct page *dpage, dma_addr_t *dma_addr)
{
struct device *dev = drm->dev->dev;
- struct page *dpage, *spage;
- struct nouveau_svmm *svmm;
-
- spage = migrate_pfn_to_page(args->src[0]);
- if (!spage || !(args->src[0] & MIGRATE_PFN_MIGRATE))
- return 0;
- dpage = alloc_page_vma(GFP_HIGHUSER, vmf->vma, vmf->address);
- if (!dpage)
- return VM_FAULT_SIGBUS;
lock_page(dpage);
*dma_addr = dma_map_page(dev, dpage, 0, PAGE_SIZE, DMA_BIDIRECTIONAL);
if (dma_mapping_error(dev, *dma_addr))
- goto error_free_page;
+ return -EIO;
- svmm = spage->zone_device_data;
- mutex_lock(&svmm->mutex);
- nouveau_svmm_invalidate(svmm, args->start, args->end);
if (drm->dmem->migrate.copy_func(drm, 1, NOUVEAU_APER_HOST, *dma_addr,
- NOUVEAU_APER_VRAM, nouveau_dmem_page_addr(spage)))
- goto error_dma_unmap;
- mutex_unlock(&svmm->mutex);
+ NOUVEAU_APER_VRAM, nouveau_dmem_page_addr(spage))) {
+ dma_unmap_page(dev, *dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
+ return -EIO;
+ }
- args->dst[0] = migrate_pfn(page_to_pfn(dpage));
return 0;
-
-error_dma_unmap:
- mutex_unlock(&svmm->mutex);
- dma_unmap_page(dev, *dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
-error_free_page:
- __free_page(dpage);
- return VM_FAULT_SIGBUS;
}
static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf)
@@ -184,9 +164,11 @@ static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf)
struct nouveau_drm *drm = page_to_drm(vmf->page);
struct nouveau_dmem *dmem = drm->dmem;
struct nouveau_fence *fence;
+ struct nouveau_svmm *svmm;
+ struct page *spage, *dpage;
unsigned long src = 0, dst = 0;
dma_addr_t dma_addr = 0;
- vm_fault_t ret;
+ vm_fault_t ret = 0;
struct migrate_vma args = {
.vma = vmf->vma,
.start = vmf->address,
@@ -207,9 +189,25 @@ static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf)
if (!args.cpages)
return 0;
- ret = nouveau_dmem_fault_copy_one(drm, vmf, &args, &dma_addr);
- if (ret || dst == 0)
+ spage = migrate_pfn_to_page(src);
+ if (!spage || !(src & MIGRATE_PFN_MIGRATE))
+ goto done;
+
+ dpage = alloc_page_vma(GFP_HIGHUSER, vmf->vma, vmf->address);
+ if (!dpage)
+ goto done;
+
+ dst = migrate_pfn(page_to_pfn(dpage));
+
+ svmm = spage->zone_device_data;
+ mutex_lock(&svmm->mutex);
+ nouveau_svmm_invalidate(svmm, args.start, args.end);
+ ret = nouveau_dmem_copy_one(drm, spage, dpage, &dma_addr);
+ mutex_unlock(&svmm->mutex);
+ if (ret) {
+ ret = VM_FAULT_SIGBUS;
goto done;
+ }
nouveau_fence_new(dmem->migrate.chan, false, &fence);
migrate_vma_pages(&args);
@@ -326,7 +324,7 @@ nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm)
return NULL;
}
- lock_page(page);
+ zone_device_page_init(page);
return page;
}
@@ -369,6 +367,52 @@ nouveau_dmem_suspend(struct nouveau_drm *drm)
mutex_unlock(&drm->dmem->mutex);
}
+/*
+ * Evict all pages mapping a chunk.
+ */
+static void
+nouveau_dmem_evict_chunk(struct nouveau_dmem_chunk *chunk)
+{
+ unsigned long i, npages = range_len(&chunk->pagemap.range) >> PAGE_SHIFT;
+ unsigned long *src_pfns, *dst_pfns;
+ dma_addr_t *dma_addrs;
+ struct nouveau_fence *fence;
+
+ src_pfns = kcalloc(npages, sizeof(*src_pfns), GFP_KERNEL);
+ dst_pfns = kcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL);
+ dma_addrs = kcalloc(npages, sizeof(*dma_addrs), GFP_KERNEL);
+
+ migrate_device_range(src_pfns, chunk->pagemap.range.start >> PAGE_SHIFT,
+ npages);
+
+ for (i = 0; i < npages; i++) {
+ if (src_pfns[i] & MIGRATE_PFN_MIGRATE) {
+ struct page *dpage;
+
+ /*
+ * _GFP_NOFAIL because the GPU is going away and there
+ * is nothing sensible we can do if we can't copy the
+ * data back.
+ */
+ dpage = alloc_page(GFP_HIGHUSER | __GFP_NOFAIL);
+ dst_pfns[i] = migrate_pfn(page_to_pfn(dpage));
+ nouveau_dmem_copy_one(chunk->drm,
+ migrate_pfn_to_page(src_pfns[i]), dpage,
+ &dma_addrs[i]);
+ }
+ }
+
+ nouveau_fence_new(chunk->drm->dmem->migrate.chan, false, &fence);
+ migrate_device_pages(src_pfns, dst_pfns, npages);
+ nouveau_dmem_fence_done(&fence);
+ migrate_device_finalize(src_pfns, dst_pfns, npages);
+ kfree(src_pfns);
+ kfree(dst_pfns);
+ for (i = 0; i < npages; i++)
+ dma_unmap_page(chunk->drm->dev->dev, dma_addrs[i], PAGE_SIZE, DMA_BIDIRECTIONAL);
+ kfree(dma_addrs);
+}
+
void
nouveau_dmem_fini(struct nouveau_drm *drm)
{
@@ -380,8 +424,10 @@ nouveau_dmem_fini(struct nouveau_drm *drm)
mutex_lock(&drm->dmem->mutex);
list_for_each_entry_safe(chunk, tmp, &drm->dmem->chunks, list) {
+ nouveau_dmem_evict_chunk(chunk);
nouveau_bo_unpin(chunk->bo);
nouveau_bo_ref(NULL, &chunk->bo);
+ WARN_ON(chunk->callocated);
list_del(&chunk->list);
memunmap_pages(&chunk->pagemap);
release_mem_region(chunk->pagemap.range.start,
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index 20cadfb740dc..3c640bd7ecae 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -363,13 +363,14 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode,
pgoff_t index,
unsigned long num_ra_pages)
{
- DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
struct page *page;
index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT;
page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
if (!page || !PageUptodate(page)) {
+ DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
+
if (page)
put_page(page);
else if (num_ra_pages > 1)
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index f0805e51b3fe..c352fff88a5e 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -258,13 +258,14 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode,
pgoff_t index,
unsigned long num_ra_pages)
{
- DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
struct page *page;
index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT;
page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
if (!page || !PageUptodate(page)) {
+ DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
+
if (page)
put_page(page);
else if (num_ra_pages > 1)
diff --git a/include/linux/damon.h b/include/linux/damon.h
index ed5470f50bab..620ada094c3b 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -484,6 +484,12 @@ static inline struct damon_region *damon_first_region(struct damon_target *t)
return list_first_entry(&t->regions_list, struct damon_region, list);
}
+static inline unsigned long damon_sz_region(struct damon_region *r)
+{
+ return r->ar.end - r->ar.start;
+}
+
+
#define damon_for_each_region(r, t) \
list_for_each_entry(r, &t->regions_list, list)
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index c3b4cc84877b..7fcaf3180a5b 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -187,6 +187,7 @@ static inline bool folio_is_device_coherent(const struct folio *folio)
}
#ifdef CONFIG_ZONE_DEVICE
+void zone_device_page_init(struct page *page);
void *memremap_pages(struct dev_pagemap *pgmap, int nid);
void memunmap_pages(struct dev_pagemap *pgmap);
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 704a04f5a074..3ef77f52a4f0 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -62,6 +62,8 @@ extern const char *migrate_reason_names[MR_TYPES];
#ifdef CONFIG_MIGRATION
extern void putback_movable_pages(struct list_head *l);
+int migrate_folio_extra(struct address_space *mapping, struct folio *dst,
+ struct folio *src, enum migrate_mode mode, int extra_count);
int migrate_folio(struct address_space *mapping, struct folio *dst,
struct folio *src, enum migrate_mode mode);
extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
@@ -197,11 +199,24 @@ struct migrate_vma {
*/
void *pgmap_owner;
unsigned long flags;
+
+ /*
+ * Set to vmf->page if this is being called to migrate a page as part of
+ * a migrate_to_ram() callback.
+ */
+ struct page *fault_page;
};
int migrate_vma_setup(struct migrate_vma *args);
void migrate_vma_pages(struct migrate_vma *migrate);
void migrate_vma_finalize(struct migrate_vma *migrate);
+int migrate_device_range(unsigned long *src_pfns, unsigned long start,
+ unsigned long npages);
+void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
+ unsigned long npages);
+void migrate_device_finalize(unsigned long *src_pfns,
+ unsigned long *dst_pfns, unsigned long npages);
+
#endif /* CONFIG_MIGRATION */
#endif /* _LINUX_MIGRATE_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 77f68f8b795c..ffb6eb55cd13 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -870,8 +870,6 @@ struct task_struct {
struct mm_struct *mm;
struct mm_struct *active_mm;
- /* Per-thread vma caching: */
-
#ifdef SPLIT_RSS_COUNTING
struct task_rss_stat rss_stat;
#endif
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 6a33f6b1b465..67e6f83fe0f8 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -100,6 +100,7 @@ struct dmirror {
struct dmirror_chunk {
struct dev_pagemap pagemap;
struct dmirror_device *mdevice;
+ bool remove;
};
/*
@@ -192,11 +193,15 @@ static int dmirror_fops_release(struct inode *inode, struct file *filp)
return 0;
}
+static struct dmirror_chunk *dmirror_page_to_chunk(struct page *page)
+{
+ return container_of(page->pgmap, struct dmirror_chunk, pagemap);
+}
+
static struct dmirror_device *dmirror_page_to_device(struct page *page)
{
- return container_of(page->pgmap, struct dmirror_chunk,
- pagemap)->mdevice;
+ return dmirror_page_to_chunk(page)->mdevice;
}
static int dmirror_do_fault(struct dmirror *dmirror, struct hmm_range *range)
@@ -627,8 +632,8 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
goto error;
}
+ zone_device_page_init(dpage);
dpage->zone_device_data = rpage;
- lock_page(dpage);
return dpage;
error:
@@ -907,7 +912,7 @@ static int dmirror_migrate_to_system(struct dmirror *dmirror,
struct vm_area_struct *vma;
unsigned long src_pfns[64] = { 0 };
unsigned long dst_pfns[64] = { 0 };
- struct migrate_vma args;
+ struct migrate_vma args = { 0 };
unsigned long next;
int ret;
@@ -968,7 +973,7 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror,
unsigned long src_pfns[64] = { 0 };
unsigned long dst_pfns[64] = { 0 };
struct dmirror_bounce bounce;
- struct migrate_vma args;
+ struct migrate_vma args = { 0 };
unsigned long next;
int ret;
@@ -1218,6 +1223,85 @@ static int dmirror_snapshot(struct dmirror *dmirror,
return ret;
}
+static void dmirror_device_evict_chunk(struct dmirror_chunk *chunk)
+{
+ unsigned long start_pfn = chunk->pagemap.range.start >> PAGE_SHIFT;
+ unsigned long end_pfn = chunk->pagemap.range.end >> PAGE_SHIFT;
+ unsigned long npages = end_pfn - start_pfn + 1;
+ unsigned long i;
+ unsigned long *src_pfns;
+ unsigned long *dst_pfns;
+
+ src_pfns = kcalloc(npages, sizeof(*src_pfns), GFP_KERNEL);
+ dst_pfns = kcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL);
+
+ migrate_device_range(src_pfns, start_pfn, npages);
+ for (i = 0; i < npages; i++) {
+ struct page *dpage, *spage;
+
+ spage = migrate_pfn_to_page(src_pfns[i]);
+ if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+
+ if (WARN_ON(!is_device_private_page(spage) &&
+ !is_device_coherent_page(spage)))
+ continue;
+ spage = BACKING_PAGE(spage);
+ dpage = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_NOFAIL);
+ lock_page(dpage);
+ copy_highpage(dpage, spage);
+ dst_pfns[i] = migrate_pfn(page_to_pfn(dpage));
+ if (src_pfns[i] & MIGRATE_PFN_WRITE)
+ dst_pfns[i] |= MIGRATE_PFN_WRITE;
+ }
+ migrate_device_pages(src_pfns, dst_pfns, npages);
+ migrate_device_finalize(src_pfns, dst_pfns, npages);
+ kfree(src_pfns);
+ kfree(dst_pfns);
+}
+
+/* Removes free pages from the free list so they can't be re-allocated */
+static void dmirror_remove_free_pages(struct dmirror_chunk *devmem)
+{
+ struct dmirror_device *mdevice = devmem->mdevice;
+ struct page *page;
+
+ for (page = mdevice->free_pages; page; page = page->zone_device_data)
+ if (dmirror_page_to_chunk(page) == devmem)
+ mdevice->free_pages = page->zone_device_data;
+}
+
+static void dmirror_device_remove_chunks(struct dmirror_device *mdevice)
+{
+ unsigned int i;
+
+ mutex_lock(&mdevice->devmem_lock);
+ if (mdevice->devmem_chunks) {
+ for (i = 0; i < mdevice->devmem_count; i++) {
+ struct dmirror_chunk *devmem =
+ mdevice->devmem_chunks[i];
+
+ spin_lock(&mdevice->lock);
+ devmem->remove = true;
+ dmirror_remove_free_pages(devmem);
+ spin_unlock(&mdevice->lock);
+
+ dmirror_device_evict_chunk(devmem);
+ memunmap_pages(&devmem->pagemap);
+ if (devmem->pagemap.type == MEMORY_DEVICE_PRIVATE)
+ release_mem_region(devmem->pagemap.range.start,
+ range_len(&devmem->pagemap.range));
+ kfree(devmem);
+ }
+ mdevice->devmem_count = 0;
+ mdevice->devmem_capacity = 0;
+ mdevice->free_pages = NULL;
+ kfree(mdevice->devmem_chunks);
+ mdevice->devmem_chunks = NULL;
+ }
+ mutex_unlock(&mdevice->devmem_lock);
+}
+
static long dmirror_fops_unlocked_ioctl(struct file *filp,
unsigned int command,
unsigned long arg)
@@ -1272,6 +1356,11 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp,
ret = dmirror_snapshot(dmirror, &cmd);
break;
+ case HMM_DMIRROR_RELEASE:
+ dmirror_device_remove_chunks(dmirror->mdevice);
+ ret = 0;
+ break;
+
default:
return -EINVAL;
}
@@ -1326,15 +1415,19 @@ static void dmirror_devmem_free(struct page *page)
mdevice = dmirror_page_to_device(page);
spin_lock(&mdevice->lock);
- mdevice->cfree++;
- page->zone_device_data = mdevice->free_pages;
- mdevice->free_pages = page;
+
+ /* Return page to our allocator if not freeing the chunk */
+ if (!dmirror_page_to_chunk(page)->remove) {
+ mdevice->cfree++;
+ page->zone_device_data = mdevice->free_pages;
+ mdevice->free_pages = page;
+ }
spin_unlock(&mdevice->lock);
}
static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
{
- struct migrate_vma args;
+ struct migrate_vma args = { 0 };
unsigned long src_pfns = 0;
unsigned long dst_pfns = 0;
struct page *rpage;
@@ -1357,6 +1450,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
args.dst = &dst_pfns;
args.pgmap_owner = dmirror->mdevice;
args.flags = dmirror_select_device(dmirror);
+ args.fault_page = vmf->page;
if (migrate_vma_setup(&args))
return VM_FAULT_SIGBUS;
@@ -1407,22 +1501,7 @@ static int dmirror_device_init(struct dmirror_device *mdevice, int id)
static void dmirror_device_remove(struct dmirror_device *mdevice)
{
- unsigned int i;
-
- if (mdevice->devmem_chunks) {
- for (i = 0; i < mdevice->devmem_count; i++) {
- struct dmirror_chunk *devmem =
- mdevice->devmem_chunks[i];
-
- memunmap_pages(&devmem->pagemap);
- if (devmem->pagemap.type == MEMORY_DEVICE_PRIVATE)
- release_mem_region(devmem->pagemap.range.start,
- range_len(&devmem->pagemap.range));
- kfree(devmem);
- }
- kfree(mdevice->devmem_chunks);
- }
-
+ dmirror_device_remove_chunks(mdevice);
cdev_device_del(&mdevice->cdevice, &mdevice->device);
}
diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h
index e31d58c9034a..8c818a2cf4f6 100644
--- a/lib/test_hmm_uapi.h
+++ b/lib/test_hmm_uapi.h
@@ -36,6 +36,7 @@ struct hmm_dmirror_cmd {
#define HMM_DMIRROR_SNAPSHOT _IOWR('H', 0x04, struct hmm_dmirror_cmd)
#define HMM_DMIRROR_EXCLUSIVE _IOWR('H', 0x05, struct hmm_dmirror_cmd)
#define HMM_DMIRROR_CHECK_EXCLUSIVE _IOWR('H', 0x06, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_RELEASE _IOWR('H', 0x07, struct hmm_dmirror_cmd)
/*
* Values returned in hmm_dmirror_cmd.ptr for HMM_DMIRROR_SNAPSHOT.
diff --git a/lib/test_meminit.c b/lib/test_meminit.c
index c95db11a6906..60e1984c060f 100644
--- a/lib/test_meminit.c
+++ b/lib/test_meminit.c
@@ -67,17 +67,24 @@ static int __init do_alloc_pages_order(int order, int *total_failures)
size_t size = PAGE_SIZE << order;
page = alloc_pages(GFP_KERNEL, order);
+ if (!page)
+ goto err;
buf = page_address(page);
fill_with_garbage(buf, size);
__free_pages(page, order);
page = alloc_pages(GFP_KERNEL, order);
+ if (!page)
+ goto err;
buf = page_address(page);
if (count_nonzero_bytes(buf, size))
(*total_failures)++;
fill_with_garbage(buf, size);
__free_pages(page, order);
return 1;
+err:
+ (*total_failures)++;
+ return 1;
}
/* Test the page allocator by calling alloc_pages with different orders. */
@@ -100,15 +107,22 @@ static int __init do_kmalloc_size(size_t size, int *total_failures)
void *buf;
buf = kmalloc(size, GFP_KERNEL);
+ if (!buf)
+ goto err;
fill_with_garbage(buf, size);
kfree(buf);
buf = kmalloc(size, GFP_KERNEL);
+ if (!buf)
+ goto err;
if (count_nonzero_bytes(buf, size))
(*total_failures)++;
fill_with_garbage(buf, size);
kfree(buf);
return 1;
+err:
+ (*total_failures)++;
+ return 1;
}
/* Test vmalloc() with given parameters. */
@@ -117,15 +131,22 @@ static int __init do_vmalloc_size(size_t size, int *total_failures)
void *buf;
buf = vmalloc(size);
+ if (!buf)
+ goto err;
fill_with_garbage(buf, size);
vfree(buf);
buf = vmalloc(size);
+ if (!buf)
+ goto err;
if (count_nonzero_bytes(buf, size))
(*total_failures)++;
fill_with_garbage(buf, size);
vfree(buf);
return 1;
+err:
+ (*total_failures)++;
+ return 1;
}
/* Test kmalloc()/vmalloc() by allocating objects of different sizes. */
diff --git a/mm/compaction.c b/mm/compaction.c
index 2dd02c4683c4..c51f7f545afe 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1847,7 +1847,6 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
pfn = cc->zone->zone_start_pfn;
cc->fast_search_fail = 0;
found_block = true;
- set_pageblock_skip(freepage);
break;
}
}
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 8e1ab38d0f1f..36d098d06c55 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -491,7 +491,7 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx)
damon_for_each_target(t, ctx) {
damon_for_each_region(r, t)
- sz += r->ar.end - r->ar.start;
+ sz += damon_sz_region(r);
}
if (ctx->attrs.min_nr_regions)
@@ -674,7 +674,7 @@ static bool __damos_valid_target(struct damon_region *r, struct damos *s)
{
unsigned long sz;
- sz = r->ar.end - r->ar.start;
+ sz = damon_sz_region(r);
return s->pattern.min_sz_region <= sz &&
sz <= s->pattern.max_sz_region &&
s->pattern.min_nr_accesses <= r->nr_accesses &&
@@ -702,7 +702,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
damon_for_each_scheme(s, c) {
struct damos_quota *quota = &s->quota;
- unsigned long sz = r->ar.end - r->ar.start;
+ unsigned long sz = damon_sz_region(r);
struct timespec64 begin, end;
unsigned long sz_applied = 0;
@@ -731,14 +731,14 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
sz = ALIGN_DOWN(quota->charge_addr_from -
r->ar.start, DAMON_MIN_REGION);
if (!sz) {
- if (r->ar.end - r->ar.start <=
- DAMON_MIN_REGION)
+ if (damon_sz_region(r) <=
+ DAMON_MIN_REGION)
continue;
sz = DAMON_MIN_REGION;
}
damon_split_region_at(t, r, sz);
r = damon_next_region(r);
- sz = r->ar.end - r->ar.start;
+ sz = damon_sz_region(r);
}
quota->charge_target_from = NULL;
quota->charge_addr_from = 0;
@@ -843,8 +843,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
continue;
score = c->ops.get_scheme_score(
c, t, r, s);
- quota->histogram[score] +=
- r->ar.end - r->ar.start;
+ quota->histogram[score] += damon_sz_region(r);
if (score > max_score)
max_score = score;
}
@@ -865,18 +864,13 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
}
}
-static inline unsigned long sz_damon_region(struct damon_region *r)
-{
- return r->ar.end - r->ar.start;
-}
-
/*
* Merge two adjacent regions into one region
*/
static void damon_merge_two_regions(struct damon_target *t,
struct damon_region *l, struct damon_region *r)
{
- unsigned long sz_l = sz_damon_region(l), sz_r = sz_damon_region(r);
+ unsigned long sz_l = damon_sz_region(l), sz_r = damon_sz_region(r);
l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) /
(sz_l + sz_r);
@@ -905,7 +899,7 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres,
if (prev && prev->ar.end == r->ar.start &&
abs(prev->nr_accesses - r->nr_accesses) <= thres &&
- sz_damon_region(prev) + sz_damon_region(r) <= sz_limit)
+ damon_sz_region(prev) + damon_sz_region(r) <= sz_limit)
damon_merge_two_regions(t, prev, r);
else
prev = r;
@@ -963,7 +957,7 @@ static void damon_split_regions_of(struct damon_target *t, int nr_subs)
int i;
damon_for_each_region_safe(r, next, t) {
- sz_region = r->ar.end - r->ar.start;
+ sz_region = damon_sz_region(r);
for (i = 0; i < nr_subs - 1 &&
sz_region > 2 * DAMON_MIN_REGION; i++) {
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index ea94e0b2c311..15f03df66db6 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -72,7 +72,7 @@ static int damon_va_evenly_split_region(struct damon_target *t,
return -EINVAL;
orig_end = r->ar.end;
- sz_orig = r->ar.end - r->ar.start;
+ sz_orig = damon_sz_region(r);
sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION);
if (!sz_piece)
@@ -618,7 +618,7 @@ static unsigned long damos_madvise(struct damon_target *target,
{
struct mm_struct *mm;
unsigned long start = PAGE_ALIGN(r->ar.start);
- unsigned long len = PAGE_ALIGN(r->ar.end - r->ar.start);
+ unsigned long len = PAGE_ALIGN(damon_sz_region(r));
unsigned long applied;
mm = damon_get_mm(target);
diff --git a/mm/highmem.c b/mm/highmem.c
index c707d7202d5f..db251e77f98f 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -30,6 +30,17 @@
#include <asm/tlbflush.h>
#include <linux/vmalloc.h>
+#ifdef CONFIG_KMAP_LOCAL
+static inline int kmap_local_calc_idx(int idx)
+{
+ return idx + KM_MAX_IDX * smp_processor_id();
+}
+
+#ifndef arch_kmap_local_map_idx
+#define arch_kmap_local_map_idx(idx, pfn) kmap_local_calc_idx(idx)
+#endif
+#endif /* CONFIG_KMAP_LOCAL */
+
/*
* Virtual_count is not a pure "count".
* 0 means that it is not mapped, and has not been mapped
@@ -142,12 +153,29 @@ pte_t *pkmap_page_table;
struct page *__kmap_to_page(void *vaddr)
{
+ unsigned long base = (unsigned long) vaddr & PAGE_MASK;
+ struct kmap_ctrl *kctrl = &current->kmap_ctrl;
unsigned long addr = (unsigned long)vaddr;
+ int i;
+
+ /* kmap() mappings */
+ if (WARN_ON_ONCE(addr >= PKMAP_ADDR(0) &&
+ addr < PKMAP_ADDR(LAST_PKMAP)))
+ return pte_page(pkmap_page_table[PKMAP_NR(addr)]);
- if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) {
- int i = PKMAP_NR(addr);
+ /* kmap_local_page() mappings */
+ if (WARN_ON_ONCE(base >= __fix_to_virt(FIX_KMAP_END) &&
+ base < __fix_to_virt(FIX_KMAP_BEGIN))) {
+ for (i = 0; i < kctrl->idx; i++) {
+ unsigned long base_addr;
+ int idx;
- return pte_page(pkmap_page_table[i]);
+ idx = arch_kmap_local_map_idx(i, pte_pfn(pteval));
+ base_addr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+
+ if (base_addr == base)
+ return pte_page(kctrl->pteval[i]);
+ }
}
return virt_to_page(vaddr);
@@ -462,10 +490,6 @@ static inline void kmap_local_idx_pop(void)
# define arch_kmap_local_post_unmap(vaddr) do { } while (0)
#endif
-#ifndef arch_kmap_local_map_idx
-#define arch_kmap_local_map_idx(idx, pfn) kmap_local_calc_idx(idx)
-#endif
-
#ifndef arch_kmap_local_unmap_idx
#define arch_kmap_local_unmap_idx(idx, vaddr) kmap_local_calc_idx(idx)
#endif
@@ -494,11 +518,6 @@ static inline bool kmap_high_unmap_local(unsigned long vaddr)
return false;
}
-static inline int kmap_local_calc_idx(int idx)
-{
- return idx + KM_MAX_IDX * smp_processor_id();
-}
-
static pte_t *__kmap_pte;
static pte_t *kmap_get_pte(unsigned long vaddr, int idx)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 57b7b0b5d9eb..b586cdd75930 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5096,6 +5096,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
* unmapped and its refcount is dropped, so just clear pte here.
*/
if (unlikely(!pte_present(pte))) {
+#ifdef CONFIG_PTE_MARKER_UFFD_WP
/*
* If the pte was wr-protected by uffd-wp in any of the
* swap forms, meanwhile the caller does not want to
@@ -5107,6 +5108,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
set_huge_pte_at(mm, address, ptep,
make_pte_marker(PTE_MARKER_UFFD_WP));
else
+#endif
huge_pte_clear(mm, address, ptep, sz);
spin_unlock(ptl);
continue;
@@ -5135,11 +5137,13 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
if (huge_pte_dirty(pte))
set_page_dirty(page);
+#ifdef CONFIG_PTE_MARKER_UFFD_WP
/* Leave a uffd-wp pte marker if needed */
if (huge_pte_uffd_wp(pte) &&
!(zap_flags & ZAP_FLAG_DROP_MARKER))
set_huge_pte_at(mm, address, ptep,
make_pte_marker(PTE_MARKER_UFFD_WP));
+#endif
hugetlb_count_sub(pages_per_huge_page(h), mm);
page_remove_rmap(page, vma, true);
@@ -5531,6 +5535,23 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
return handle_userfault(&vmf, reason);
}
+/*
+ * Recheck pte with pgtable lock. Returns true if pte didn't change, or
+ * false if pte changed or is changing.
+ */
+static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm,
+ pte_t *ptep, pte_t old_pte)
+{
+ spinlock_t *ptl;
+ bool same;
+
+ ptl = huge_pte_lock(h, mm, ptep);
+ same = pte_same(huge_ptep_get(ptep), old_pte);
+ spin_unlock(ptl);
+
+ return same;
+}
+
static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
struct vm_area_struct *vma,
struct address_space *mapping, pgoff_t idx,
@@ -5571,10 +5592,33 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
if (idx >= size)
goto out;
/* Check for page in userfault range */
- if (userfaultfd_missing(vma))
- return hugetlb_handle_userfault(vma, mapping, idx,
- flags, haddr, address,
- VM_UFFD_MISSING);
+ if (userfaultfd_missing(vma)) {
+ /*
+ * Since hugetlb_no_page() was examining pte
+ * without pgtable lock, we need to re-test under
+ * lock because the pte may not be stable and could
+ * have changed from under us. Try to detect
+ * either changed or during-changing ptes and retry
+ * properly when needed.
+ *
+ * Note that userfaultfd is actually fine with
+ * false positives (e.g. caused by pte changed),
+ * but not wrong logical events (e.g. caused by
+ * reading a pte during changing). The latter can
+ * confuse the userspace, so the strictness is very
+ * much preferred. E.g., MISSING event should
+ * never happen on the page after UFFDIO_COPY has
+ * correctly installed the page and returned.
+ */
+ if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
+ ret = 0;
+ goto out;
+ }
+
+ return hugetlb_handle_userfault(vma, mapping, idx, flags,
+ haddr, address,
+ VM_UFFD_MISSING);
+ }
page = alloc_huge_page(vma, haddr, 0);
if (IS_ERR(page)) {
@@ -5590,11 +5634,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* here. Before returning error, get ptl and make
* sure there really is no pte entry.
*/
- ptl = huge_pte_lock(h, mm, ptep);
- ret = 0;
- if (huge_pte_none(huge_ptep_get(ptep)))
+ if (hugetlb_pte_stable(h, mm, ptep, old_pte))
ret = vmf_error(PTR_ERR(page));
- spin_unlock(ptl);
+ else
+ ret = 0;
goto out;
}
clear_huge_page(page, address, pages_per_huge_page(h));
@@ -5640,9 +5683,14 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
if (userfaultfd_minor(vma)) {
unlock_page(page);
put_page(page);
- return hugetlb_handle_userfault(vma, mapping, idx,
- flags, haddr, address,
- VM_UFFD_MINOR);
+ /* See comment in userfaultfd_missing() block above */
+ if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
+ ret = 0;
+ goto out;
+ }
+ return hugetlb_handle_userfault(vma, mapping, idx, flags,
+ haddr, address,
+ VM_UFFD_MINOR);
}
}
@@ -6804,7 +6852,7 @@ void hugetlb_vma_lock_release(struct kref *kref)
kfree(vma_lock);
}
-void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock)
+static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock)
{
struct vm_area_struct *vma = vma_lock->vma;
diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c
index f25692def781..57e4c72aa8bd 100644
--- a/mm/kasan/kasan_test.c
+++ b/mm/kasan/kasan_test.c
@@ -295,6 +295,9 @@ static void krealloc_more_oob_helper(struct kunit *test,
ptr2 = krealloc(ptr1, size2, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
+ /* Suppress -Warray-bounds warnings. */
+ OPTIMIZER_HIDE_VAR(ptr2);
+
/* All offsets up to size2 must be accessible. */
ptr2[size1 - 1] = 'x';
ptr2[size1] = 'x';
@@ -327,6 +330,9 @@ static void krealloc_less_oob_helper(struct kunit *test,
ptr2 = krealloc(ptr1, size2, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
+ /* Suppress -Warray-bounds warnings. */
+ OPTIMIZER_HIDE_VAR(ptr2);
+
/* Must be accessible for all modes. */
ptr2[size2 - 1] = 'x';
@@ -540,13 +546,14 @@ static void kmalloc_memmove_invalid_size(struct kunit *test)
{
char *ptr;
size_t size = 64;
- volatile size_t invalid_size = size;
+ size_t invalid_size = size;
ptr = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
memset((char *)ptr, 0, 64);
OPTIMIZER_HIDE_VAR(ptr);
+ OPTIMIZER_HIDE_VAR(invalid_size);
KUNIT_EXPECT_KASAN_FAIL(test,
memmove((char *)ptr, (char *)ptr + 4, invalid_size));
kfree(ptr);
diff --git a/mm/memory.c b/mm/memory.c
index df678fa30cdb..f88c351aecd4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1393,10 +1393,12 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
unsigned long addr, pte_t *pte,
struct zap_details *details, pte_t pteval)
{
+#ifdef CONFIG_PTE_MARKER_UFFD_WP
if (zap_drop_file_uffd_wp(details))
return;
pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
+#endif
}
static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -3748,7 +3750,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
ret = remove_device_exclusive_entry(vmf);
} else if (is_device_private_entry(entry)) {
vmf->page = pfn_swap_entry_to_page(entry);
- ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
+ spin_unlock(vmf->ptl);
+ goto out;
+ }
+
+ /*
+ * Get a page reference while we know the page can't be
+ * freed.
+ */
+ get_page(vmf->page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ vmf->page->pgmap->ops->migrate_to_ram(vmf);
+ put_page(vmf->page);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else if (is_swapin_error_entry(entry)) {
@@ -4118,7 +4134,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
if (!pte_none(*vmf->pte)) {
- update_mmu_cache(vma, vmf->address, vmf->pte);
+ update_mmu_tlb(vma, vmf->address, vmf->pte);
goto release;
}
diff --git a/mm/memremap.c b/mm/memremap.c
index 25029a474d30..421bec3a29ee 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -138,8 +138,11 @@ void memunmap_pages(struct dev_pagemap *pgmap)
int i;
percpu_ref_kill(&pgmap->ref);
- for (i = 0; i < pgmap->nr_range; i++)
- percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i));
+ if (pgmap->type != MEMORY_DEVICE_PRIVATE &&
+ pgmap->type != MEMORY_DEVICE_COHERENT)
+ for (i = 0; i < pgmap->nr_range; i++)
+ percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i));
+
wait_for_completion(&pgmap->done);
for (i = 0; i < pgmap->nr_range; i++)
@@ -264,7 +267,9 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
PHYS_PFN(range->start),
PHYS_PFN(range_len(range)), pgmap);
- percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id));
+ if (pgmap->type != MEMORY_DEVICE_PRIVATE &&
+ pgmap->type != MEMORY_DEVICE_COHERENT)
+ percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id));
return 0;
err_add_memory:
@@ -502,11 +507,28 @@ void free_zone_device_page(struct page *page)
page->mapping = NULL;
page->pgmap->ops->page_free(page);
+ if (page->pgmap->type != MEMORY_DEVICE_PRIVATE &&
+ page->pgmap->type != MEMORY_DEVICE_COHERENT)
+ /*
+ * Reset the page count to 1 to prepare for handing out the page
+ * again.
+ */
+ set_page_count(page, 1);
+ else
+ put_dev_pagemap(page->pgmap);
+}
+
+void zone_device_page_init(struct page *page)
+{
/*
- * Reset the page count to 1 to prepare for handing out the page again.
+ * Drivers shouldn't be allocating pages after calling
+ * memunmap_pages().
*/
+ WARN_ON_ONCE(!percpu_ref_tryget_live(&page->pgmap->ref));
set_page_count(page, 1);
+ lock_page(page);
}
+EXPORT_SYMBOL_GPL(zone_device_page_init);
#ifdef CONFIG_FS_DAX
bool __put_devmap_managed_page_refs(struct page *page, int refs)
diff --git a/mm/migrate.c b/mm/migrate.c
index c228afba0963..1379e1912772 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -625,6 +625,25 @@ EXPORT_SYMBOL(folio_migrate_copy);
* Migration functions
***********************************************************/
+int migrate_folio_extra(struct address_space *mapping, struct folio *dst,
+ struct folio *src, enum migrate_mode mode, int extra_count)
+{
+ int rc;
+
+ BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */
+
+ rc = folio_migrate_mapping(mapping, dst, src, extra_count);
+
+ if (rc != MIGRATEPAGE_SUCCESS)
+ return rc;
+
+ if (mode != MIGRATE_SYNC_NO_COPY)
+ folio_migrate_copy(dst, src);
+ else
+ folio_migrate_flags(dst, src);
+ return MIGRATEPAGE_SUCCESS;
+}
+
/**
* migrate_folio() - Simple folio migration.
* @mapping: The address_space containing the folio.
@@ -640,20 +659,7 @@ EXPORT_SYMBOL(folio_migrate_copy);
int migrate_folio(struct address_space *mapping, struct folio *dst,
struct folio *src, enum migrate_mode mode)
{
- int rc;
-
- BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */
-
- rc = folio_migrate_mapping(mapping, dst, src, 0);
-
- if (rc != MIGRATEPAGE_SUCCESS)
- return rc;
-
- if (mode != MIGRATE_SYNC_NO_COPY)
- folio_migrate_copy(dst, src);
- else
- folio_migrate_flags(dst, src);
- return MIGRATEPAGE_SUCCESS;
+ return migrate_folio_extra(mapping, dst, src, mode, 0);
}
EXPORT_SYMBOL(migrate_folio);
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 5ab6ab9d2ed8..6fa682eef7a0 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -325,14 +325,14 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
* folio_migrate_mapping(), except that here we allow migration of a
* ZONE_DEVICE page.
*/
-static bool migrate_vma_check_page(struct page *page)
+static bool migrate_vma_check_page(struct page *page, struct page *fault_page)
{
/*
* One extra ref because caller holds an extra reference, either from
* isolate_lru_page() for a regular page, or migrate_vma_collect() for
* a device page.
*/
- int extra = 1;
+ int extra = 1 + (page == fault_page);
/*
* FIXME support THP (transparent huge page), it is bit more complex to
@@ -357,26 +357,20 @@ static bool migrate_vma_check_page(struct page *page)
}
/*
- * migrate_vma_unmap() - replace page mapping with special migration pte entry
- * @migrate: migrate struct containing all migration information
- *
- * Isolate pages from the LRU and replace mappings (CPU page table pte) with a
- * special migration pte entry and check if it has been pinned. Pinned pages are
- * restored because we cannot migrate them.
- *
- * This is the last step before we call the device driver callback to allocate
- * destination memory and copy contents of original page over to new page.
+ * Unmaps pages for migration. Returns number of unmapped pages.
*/
-static void migrate_vma_unmap(struct migrate_vma *migrate)
+static unsigned long migrate_device_unmap(unsigned long *src_pfns,
+ unsigned long npages,
+ struct page *fault_page)
{
- const unsigned long npages = migrate->npages;
unsigned long i, restore = 0;
bool allow_drain = true;
+ unsigned long unmapped = 0;
lru_add_drain();
for (i = 0; i < npages; i++) {
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ struct page *page = migrate_pfn_to_page(src_pfns[i]);
struct folio *folio;
if (!page)
@@ -391,8 +385,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
}
if (isolate_lru_page(page)) {
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
- migrate->cpages--;
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
restore++;
continue;
}
@@ -405,34 +398,55 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
if (folio_mapped(folio))
try_to_migrate(folio, 0);
- if (page_mapped(page) || !migrate_vma_check_page(page)) {
+ if (page_mapped(page) ||
+ !migrate_vma_check_page(page, fault_page)) {
if (!is_zone_device_page(page)) {
get_page(page);
putback_lru_page(page);
}
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
- migrate->cpages--;
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
restore++;
continue;
}
+
+ unmapped++;
}
for (i = 0; i < npages && restore; i++) {
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ struct page *page = migrate_pfn_to_page(src_pfns[i]);
struct folio *folio;
- if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
+ if (!page || (src_pfns[i] & MIGRATE_PFN_MIGRATE))
continue;
folio = page_folio(page);
remove_migration_ptes(folio, folio, false);
- migrate->src[i] = 0;
+ src_pfns[i] = 0;
folio_unlock(folio);
folio_put(folio);
restore--;
}
+
+ return unmapped;
+}
+
+/*
+ * migrate_vma_unmap() - replace page mapping with special migration pte entry
+ * @migrate: migrate struct containing all migration information
+ *
+ * Isolate pages from the LRU and replace mappings (CPU page table pte) with a
+ * special migration pte entry and check if it has been pinned. Pinned pages are
+ * restored because we cannot migrate them.
+ *
+ * This is the last step before we call the device driver callback to allocate
+ * destination memory and copy contents of original page over to new page.
+ */
+static void migrate_vma_unmap(struct migrate_vma *migrate)
+{
+ migrate->cpages = migrate_device_unmap(migrate->src, migrate->npages,
+ migrate->fault_page);
}
/**
@@ -517,6 +531,8 @@ int migrate_vma_setup(struct migrate_vma *args)
return -EINVAL;
if (!args->src || !args->dst)
return -EINVAL;
+ if (args->fault_page && !is_device_private_page(args->fault_page))
+ return -EINVAL;
memset(args->src, 0, sizeof(*args->src) * nr_pages);
args->cpages = 0;
@@ -677,42 +693,38 @@ abort:
*src &= ~MIGRATE_PFN_MIGRATE;
}
-/**
- * migrate_vma_pages() - migrate meta-data from src page to dst page
- * @migrate: migrate struct containing all migration information
- *
- * This migrates struct page meta-data from source struct page to destination
- * struct page. This effectively finishes the migration from source page to the
- * destination page.
- */
-void migrate_vma_pages(struct migrate_vma *migrate)
+static void __migrate_device_pages(unsigned long *src_pfns,
+ unsigned long *dst_pfns, unsigned long npages,
+ struct migrate_vma *migrate)
{
- const unsigned long npages = migrate->npages;
- const unsigned long start = migrate->start;
struct mmu_notifier_range range;
- unsigned long addr, i;
+ unsigned long i;
bool notified = false;
- for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
- struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ for (i = 0; i < npages; i++) {
+ struct page *newpage = migrate_pfn_to_page(dst_pfns[i]);
+ struct page *page = migrate_pfn_to_page(src_pfns[i]);
struct address_space *mapping;
int r;
if (!newpage) {
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
continue;
}
if (!page) {
+ unsigned long addr;
+
+ if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+
/*
* The only time there is no vma is when called from
* migrate_device_coherent_page(). However this isn't
* called if the page could not be unmapped.
*/
- VM_BUG_ON(!migrate->vma);
- if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
- continue;
+ VM_BUG_ON(!migrate);
+ addr = migrate->start + i*PAGE_SIZE;
if (!notified) {
notified = true;
@@ -723,7 +735,7 @@ void migrate_vma_pages(struct migrate_vma *migrate)
mmu_notifier_invalidate_range_start(&range);
}
migrate_vma_insert_page(migrate, addr, newpage,
- &migrate->src[i]);
+ &src_pfns[i]);
continue;
}
@@ -736,21 +748,26 @@ void migrate_vma_pages(struct migrate_vma *migrate)
* device private or coherent memory.
*/
if (mapping) {
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
continue;
}
} else if (is_zone_device_page(newpage)) {
/*
* Other types of ZONE_DEVICE page are not supported.
*/
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
continue;
}
- r = migrate_folio(mapping, page_folio(newpage),
- page_folio(page), MIGRATE_SYNC_NO_COPY);
+ if (migrate && migrate->fault_page == page)
+ r = migrate_folio_extra(mapping, page_folio(newpage),
+ page_folio(page),
+ MIGRATE_SYNC_NO_COPY, 1);
+ else
+ r = migrate_folio(mapping, page_folio(newpage),
+ page_folio(page), MIGRATE_SYNC_NO_COPY);
if (r != MIGRATEPAGE_SUCCESS)
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
}
/*
@@ -761,28 +778,56 @@ void migrate_vma_pages(struct migrate_vma *migrate)
if (notified)
mmu_notifier_invalidate_range_only_end(&range);
}
-EXPORT_SYMBOL(migrate_vma_pages);
/**
- * migrate_vma_finalize() - restore CPU page table entry
+ * migrate_device_pages() - migrate meta-data from src page to dst page
+ * @src_pfns: src_pfns returned from migrate_device_range()
+ * @dst_pfns: array of pfns allocated by the driver to migrate memory to
+ * @npages: number of pages in the range
+ *
+ * Equivalent to migrate_vma_pages(). This is called to migrate struct page
+ * meta-data from source struct page to destination.
+ */
+void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
+ unsigned long npages)
+{
+ __migrate_device_pages(src_pfns, dst_pfns, npages, NULL);
+}
+EXPORT_SYMBOL(migrate_device_pages);
+
+/**
+ * migrate_vma_pages() - migrate meta-data from src page to dst page
* @migrate: migrate struct containing all migration information
*
- * This replaces the special migration pte entry with either a mapping to the
- * new page if migration was successful for that page, or to the original page
- * otherwise.
+ * This migrates struct page meta-data from source struct page to destination
+ * struct page. This effectively finishes the migration from source page to the
+ * destination page.
+ */
+void migrate_vma_pages(struct migrate_vma *migrate)
+{
+ __migrate_device_pages(migrate->src, migrate->dst, migrate->npages, migrate);
+}
+EXPORT_SYMBOL(migrate_vma_pages);
+
+/*
+ * migrate_device_finalize() - complete page migration
+ * @src_pfns: src_pfns returned from migrate_device_range()
+ * @dst_pfns: array of pfns allocated by the driver to migrate memory to
+ * @npages: number of pages in the range
*
- * This also unlocks the pages and puts them back on the lru, or drops the extra
- * refcount, for device pages.
+ * Completes migration of the page by removing special migration entries.
+ * Drivers must ensure copying of page data is complete and visible to the CPU
+ * before calling this.
*/
-void migrate_vma_finalize(struct migrate_vma *migrate)
+void migrate_device_finalize(unsigned long *src_pfns,
+ unsigned long *dst_pfns, unsigned long npages)
{
- const unsigned long npages = migrate->npages;
unsigned long i;
for (i = 0; i < npages; i++) {
struct folio *dst, *src;
- struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ struct page *newpage = migrate_pfn_to_page(dst_pfns[i]);
+ struct page *page = migrate_pfn_to_page(src_pfns[i]);
if (!page) {
if (newpage) {
@@ -792,7 +837,7 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
continue;
}
- if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
+ if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
if (newpage) {
unlock_page(newpage);
put_page(newpage);
@@ -819,8 +864,72 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
}
}
}
+EXPORT_SYMBOL(migrate_device_finalize);
+
+/**
+ * migrate_vma_finalize() - restore CPU page table entry
+ * @migrate: migrate struct containing all migration information
+ *
+ * This replaces the special migration pte entry with either a mapping to the
+ * new page if migration was successful for that page, or to the original page
+ * otherwise.
+ *
+ * This also unlocks the pages and puts them back on the lru, or drops the extra
+ * refcount, for device pages.
+ */
+void migrate_vma_finalize(struct migrate_vma *migrate)
+{
+ migrate_device_finalize(migrate->src, migrate->dst, migrate->npages);
+}
EXPORT_SYMBOL(migrate_vma_finalize);
+/**
+ * migrate_device_range() - migrate device private pfns to normal memory.
+ * @src_pfns: array large enough to hold migrating source device private pfns.
+ * @start: starting pfn in the range to migrate.
+ * @npages: number of pages to migrate.
+ *
+ * migrate_vma_setup() is similar in concept to migrate_vma_setup() except that
+ * instead of looking up pages based on virtual address mappings a range of
+ * device pfns that should be migrated to system memory is used instead.
+ *
+ * This is useful when a driver needs to free device memory but doesn't know the
+ * virtual mappings of every page that may be in device memory. For example this
+ * is often the case when a driver is being unloaded or unbound from a device.
+ *
+ * Like migrate_vma_setup() this function will take a reference and lock any
+ * migrating pages that aren't free before unmapping them. Drivers may then
+ * allocate destination pages and start copying data from the device to CPU
+ * memory before calling migrate_device_pages().
+ */
+int migrate_device_range(unsigned long *src_pfns, unsigned long start,
+ unsigned long npages)
+{
+ unsigned long i, pfn;
+
+ for (pfn = start, i = 0; i < npages; pfn++, i++) {
+ struct page *page = pfn_to_page(pfn);
+
+ if (!get_page_unless_zero(page)) {
+ src_pfns[i] = 0;
+ continue;
+ }
+
+ if (!trylock_page(page)) {
+ src_pfns[i] = 0;
+ put_page(page);
+ continue;
+ }
+
+ src_pfns[i] = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+ }
+
+ migrate_device_unmap(src_pfns, npages, NULL);
+
+ return 0;
+}
+EXPORT_SYMBOL(migrate_device_range);
+
/*
* Migrate a device coherent page back to normal memory. The caller should have
* a reference on page which will be copied to the new page if migration is
@@ -829,25 +938,19 @@ EXPORT_SYMBOL(migrate_vma_finalize);
int migrate_device_coherent_page(struct page *page)
{
unsigned long src_pfn, dst_pfn = 0;
- struct migrate_vma args;
struct page *dpage;
WARN_ON_ONCE(PageCompound(page));
lock_page(page);
src_pfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE;
- args.src = &src_pfn;
- args.dst = &dst_pfn;
- args.cpages = 1;
- args.npages = 1;
- args.vma = NULL;
/*
* We don't have a VMA and don't need to walk the page tables to find
* the source page. So call migrate_vma_unmap() directly to unmap the
* page as migrate_vma_setup() will fail if args.vma == NULL.
*/
- migrate_vma_unmap(&args);
+ migrate_device_unmap(&src_pfn, 1, NULL);
if (!(src_pfn & MIGRATE_PFN_MIGRATE))
return -EBUSY;
@@ -857,10 +960,10 @@ int migrate_device_coherent_page(struct page *page)
dst_pfn = migrate_pfn(page_to_pfn(dpage));
}
- migrate_vma_pages(&args);
+ migrate_device_pages(&src_pfn, &dst_pfn, 1);
if (src_pfn & MIGRATE_PFN_MIGRATE)
copy_highpage(dpage, page);
- migrate_vma_finalize(&args);
+ migrate_device_finalize(&src_pfn, &dst_pfn, 1);
if (src_pfn & MIGRATE_PFN_MIGRATE)
return 0;
diff --git a/mm/mmap.c b/mm/mmap.c
index 6e447544f07d..bf2122af94e7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2673,7 +2673,7 @@ cannot_expand:
if (!arch_validate_flags(vma->vm_flags)) {
error = -EINVAL;
if (file)
- goto unmap_and_free_vma;
+ goto close_and_free_vma;
else
goto free_vma;
}
@@ -2742,6 +2742,9 @@ expanded:
validate_mm(mm);
return addr;
+close_and_free_vma:
+ if (vma->vm_ops && vma->vm_ops->close)
+ vma->vm_ops->close(vma);
unmap_and_free_vma:
fput(vma->vm_file);
vma->vm_file = NULL;
@@ -2942,17 +2945,18 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
if (vma &&
(!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)) &&
((vma->vm_flags & ~VM_SOFTDIRTY) == flags)) {
- mas->index = vma->vm_start;
- mas->last = addr + len - 1;
- vma_adjust_trans_huge(vma, addr, addr + len, 0);
+ mas_set_range(mas, vma->vm_start, addr + len - 1);
+ if (mas_preallocate(mas, vma, GFP_KERNEL))
+ return -ENOMEM;
+
+ vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
if (vma->anon_vma) {
anon_vma_lock_write(vma->anon_vma);
anon_vma_interval_tree_pre_update_vma(vma);
}
vma->vm_end = addr + len;
vma->vm_flags |= VM_SOFTDIRTY;
- if (mas_store_gfp(mas, vma, GFP_KERNEL))
- goto mas_expand_failed;
+ mas_store_prealloc(mas, vma);
if (vma->anon_vma) {
anon_vma_interval_tree_post_update_vma(vma);
@@ -2993,13 +2997,6 @@ mas_store_fail:
vma_alloc_fail:
vm_unacct_memory(len >> PAGE_SHIFT);
return -ENOMEM;
-
-mas_expand_failed:
- if (vma->anon_vma) {
- anon_vma_interval_tree_post_update_vma(vma);
- anon_vma_unlock_write(vma->anon_vma);
- }
- return -ENOMEM;
}
int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
@@ -3240,6 +3237,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
out_vma_link:
if (new_vma->vm_ops && new_vma->vm_ops->close)
new_vma->vm_ops->close(new_vma);
+
+ if (new_vma->vm_file)
+ fput(new_vma->vm_file);
+
+ unlink_anon_vmas(new_vma);
out_free_mempol:
mpol_put(vma_policy(new_vma));
out_free_vma:
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index a71924bd38c0..add4244e5790 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -1,6 +1,7 @@
#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/kernel.h>
+#include <linux/kmsan-checks.h>
#include <linux/mmdebug.h>
#include <linux/mm_types.h>
#include <linux/mm_inline.h>
@@ -265,6 +266,15 @@ void tlb_flush_mmu(struct mmu_gather *tlb)
static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
bool fullmm)
{
+ /*
+ * struct mmu_gather contains 7 1-bit fields packed into a 32-bit
+ * unsigned int value. The remaining 25 bits remain uninitialized
+ * and are never used, but KMSAN updates the origin for them in
+ * zap_pXX_range() in mm/memory.c, thus creating very long origin
+ * chains. This is technically correct, but consumes too much memory.
+ * Unpoisoning the whole structure will prevent creating such chains.
+ */
+ kmsan_unpoison_memory(tlb, sizeof(*tlb));
tlb->mm = mm;
tlb->fullmm = fullmm;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 461dcbd4f21a..668bfaa6ed2a 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -267,6 +267,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
} else {
/* It must be an none page, or what else?.. */
WARN_ON_ONCE(!pte_none(oldpte));
+#ifdef CONFIG_PTE_MARKER_UFFD_WP
if (unlikely(uffd_wp && !vma_is_anonymous(vma))) {
/*
* For file-backed mem, we need to be able to
@@ -278,6 +279,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
make_pte_marker(PTE_MARKER_UFFD_WP));
pages++;
}
+#endif
}
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ac2c9f12a7b2..e20ade858e71 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3446,7 +3446,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
int pindex;
bool free_high;
- __count_vm_event(PGFREE);
+ __count_vm_events(PGFREE, 1 << order);
pindex = order_to_pindex(migratetype, order);
list_add(&page->pcp_list, &pcp->lists[pindex]);
pcp->count += 1 << order;
@@ -3803,7 +3803,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
pcp_spin_unlock_irqrestore(pcp, flags);
pcp_trylock_finish(UP_flags);
if (page) {
- __count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
+ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, 1);
}
return page;
@@ -6823,6 +6823,14 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
cond_resched();
}
+
+ /*
+ * ZONE_DEVICE pages are released directly to the driver page allocator
+ * which will set the page count to 1 when allocating the page.
+ */
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
+ pgmap->type == MEMORY_DEVICE_COHERENT)
+ set_page_count(page, 0);
}
/*
diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c
index 7d722265dcd7..4adaad1b822f 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -1054,6 +1054,55 @@ TEST_F(hmm, migrate_fault)
hmm_buffer_free(buffer);
}
+TEST_F(hmm, migrate_release)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long i;
+ int *ptr;
+ int ret;
+
+ npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+ ASSERT_NE(npages, 0);
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ buffer->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Initialize buffer in system memory. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+
+ /* Migrate memory to device. */
+ ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+
+ /* Check what the device read. */
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ /* Release device memory. */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_RELEASE, buffer, npages);
+ ASSERT_EQ(ret, 0);
+
+ /* Fault pages back to system memory and check them. */
+ for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ hmm_buffer_free(buffer);
+}
+
/*
* Migrate anonymous shared memory to device private memory.
*/
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index 74babdbc02e5..297f250c1d95 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -774,7 +774,27 @@ static void uffd_handle_page_fault(struct uffd_msg *msg,
continue_range(uffd, msg->arg.pagefault.address, page_size);
stats->minor_faults++;
} else {
- /* Missing page faults */
+ /*
+ * Missing page faults.
+ *
+ * Here we force a write check for each of the missing mode
+ * faults. It's guaranteed because the only threads that
+ * will trigger uffd faults are the locking threads, and
+ * their first instruction to touch the missing page will
+ * always be pthread_mutex_lock().
+ *
+ * Note that here we relied on an NPTL glibc impl detail to
+ * always read the lock type at the entry of the lock op
+ * (pthread_mutex_t.__data.__type, offset 0x10) before
+ * doing any locking operations to guarantee that. It's
+ * actually not good to rely on this impl detail because
+ * logically a pthread-compatible lib can implement the
+ * locks without types and we can fail when linking with
+ * them. However since we used to find bugs with this
+ * strict check we still keep it around. Hopefully this
+ * could be a good hint when it fails again. If one day
+ * it'll break on some other impl of glibc we'll revisit.
+ */
if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
err("unexpected write fault");