diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/damon/paddr.c | 12 | ||||
-rw-r--r-- | mm/huge_memory.c | 6 | ||||
-rw-r--r-- | mm/hugetlb.c | 14 | ||||
-rw-r--r-- | mm/kasan/Makefile | 9 | ||||
-rw-r--r-- | mm/kasan/kasan.h | 4 | ||||
-rw-r--r-- | mm/kasan/kasan_test.c | 29 | ||||
-rw-r--r-- | mm/kasan/shadow.c | 16 | ||||
-rw-r--r-- | mm/kfence/Makefile | 2 | ||||
-rw-r--r-- | mm/kfence/core.c | 42 | ||||
-rw-r--r-- | mm/ksm.c | 11 | ||||
-rw-r--r-- | mm/memory-failure.c | 8 | ||||
-rw-r--r-- | mm/memory.c | 16 | ||||
-rw-r--r-- | mm/migrate.c | 195 | ||||
-rw-r--r-- | mm/mincore.c | 2 | ||||
-rw-r--r-- | mm/mmap.c | 12 | ||||
-rw-r--r-- | mm/mprotect.c | 2 | ||||
-rw-r--r-- | mm/page_alloc.c | 3 | ||||
-rw-r--r-- | mm/rmap.c | 2 | ||||
-rw-r--r-- | mm/slab.c | 2 | ||||
-rw-r--r-- | mm/swapfile.c | 3 | ||||
-rw-r--r-- | mm/vmalloc.c | 36 |
21 files changed, 273 insertions, 153 deletions
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 607bb69e526c..dd9c33fbe805 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -130,7 +130,6 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz) accessed = false; else accessed = true; - folio_put(folio); goto out; } @@ -144,10 +143,10 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz) if (need_lock) folio_unlock(folio); - folio_put(folio); out: *folio_sz = folio_size(folio); + folio_put(folio); return accessed; } @@ -250,12 +249,11 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s) folio_put(folio); continue; } - if (folio_test_unevictable(folio)) { + if (folio_test_unevictable(folio)) folio_putback_lru(folio); - } else { + else list_add(&folio->lru, &folio_list); - folio_put(folio); - } + folio_put(folio); } applied = reclaim_pages(&folio_list); cond_resched(); @@ -282,8 +280,8 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate( folio_mark_accessed(folio); else folio_deactivate(folio); - folio_put(folio); applied += folio_nr_pages(folio); + folio_put(folio); } return applied * PAGE_SIZE; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4fc43859e59a..032fb0ef9cd1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2037,7 +2037,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; pgtable_t pgtable; - pmd_t _pmd; + pmd_t _pmd, old_pmd; int i; /* @@ -2048,7 +2048,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, * * See Documentation/mm/mmu_notifier.rst */ - pmdp_huge_clear_flush(vma, haddr, pmd); + old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); @@ -2057,6 +2057,8 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, pte_t *pte, entry; entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); entry = pte_mkspecial(entry); + if (pmd_uffd_wp(old_pmd)) + entry = pte_mkuffd_wp(entry); pte = pte_offset_map(&_pmd, haddr); VM_BUG_ON(!pte_none(*pte)); set_pte_at(mm, haddr, pte, entry); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 07abcb6eb203..245038a9fe4e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5478,7 +5478,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, struct folio *pagecache_folio, spinlock_t *ptl) { const bool unshare = flags & FAULT_FLAG_UNSHARE; - pte_t pte; + pte_t pte = huge_ptep_get(ptep); struct hstate *h = hstate_vma(vma); struct page *old_page; struct folio *new_folio; @@ -5488,6 +5488,17 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, struct mmu_notifier_range range; /* + * Never handle CoW for uffd-wp protected pages. It should be only + * handled when the uffd-wp protection is removed. + * + * Note that only the CoW optimization path (in hugetlb_no_page()) + * can trigger this, because hugetlb_fault() will always resolve + * uffd-wp bit first. + */ + if (!unshare && huge_pte_uffd_wp(pte)) + return 0; + + /* * hugetlb does not support FOLL_FORCE-style write faults that keep the * PTE mapped R/O such as maybe_mkwrite() would do. */ @@ -5500,7 +5511,6 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } - pte = huge_ptep_get(ptep); old_page = pte_page(pte); delayacct_wpcopy_start(); diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index d4837bff3b60..7634dd2a6128 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -35,7 +35,14 @@ CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) -CFLAGS_KASAN_TEST := $(CFLAGS_KASAN) -fno-builtin $(call cc-disable-warning, vla) +CFLAGS_KASAN_TEST := $(CFLAGS_KASAN) $(call cc-disable-warning, vla) +ifndef CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX +# If compiler instruments memintrinsics by prefixing them with __asan/__hwasan, +# we need to treat them normally (as builtins), otherwise the compiler won't +# recognize them as instrumentable. If it doesn't instrument them, we need to +# pass -fno-builtin, so the compiler doesn't inline them. +CFLAGS_KASAN_TEST += -fno-builtin +endif CFLAGS_kasan_test.o := $(CFLAGS_KASAN_TEST) CFLAGS_kasan_test_module.o := $(CFLAGS_KASAN_TEST) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 9377b0789edc..a61eeee3095a 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -666,4 +666,8 @@ void __hwasan_storeN_noabort(unsigned long addr, size_t size); void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size); +void *__hwasan_memset(void *addr, int c, size_t len); +void *__hwasan_memmove(void *dest, const void *src, size_t len); +void *__hwasan_memcpy(void *dest, const void *src, size_t len); + #endif /* __MM_KASAN_KASAN_H */ diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index 74cd80c12b25..627eaf1ee1db 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -165,6 +165,15 @@ static void kasan_test_exit(struct kunit *test) kunit_skip((test), "Test requires " #config "=n"); \ } while (0) +#define KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test) do { \ + if (IS_ENABLED(CONFIG_KASAN_HW_TAGS)) \ + break; /* No compiler instrumentation. */ \ + if (IS_ENABLED(CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX)) \ + break; /* Should always be instrumented! */ \ + if (IS_ENABLED(CONFIG_GENERIC_ENTRY)) \ + kunit_skip((test), "Test requires checked mem*()"); \ +} while (0) + static void kmalloc_oob_right(struct kunit *test) { char *ptr; @@ -454,6 +463,8 @@ static void kmalloc_oob_16(struct kunit *test) u64 words[2]; } *ptr1, *ptr2; + KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test); + /* This test is specifically crafted for the generic mode. */ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); @@ -476,6 +487,8 @@ static void kmalloc_uaf_16(struct kunit *test) u64 words[2]; } *ptr1, *ptr2; + KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test); + ptr1 = kmalloc(sizeof(*ptr1), GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); @@ -498,6 +511,8 @@ static void kmalloc_oob_memset_2(struct kunit *test) char *ptr; size_t size = 128 - KASAN_GRANULE_SIZE; + KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test); + ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); @@ -511,6 +526,8 @@ static void kmalloc_oob_memset_4(struct kunit *test) char *ptr; size_t size = 128 - KASAN_GRANULE_SIZE; + KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test); + ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); @@ -524,6 +541,8 @@ static void kmalloc_oob_memset_8(struct kunit *test) char *ptr; size_t size = 128 - KASAN_GRANULE_SIZE; + KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test); + ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); @@ -537,6 +556,8 @@ static void kmalloc_oob_memset_16(struct kunit *test) char *ptr; size_t size = 128 - KASAN_GRANULE_SIZE; + KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test); + ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); @@ -550,6 +571,8 @@ static void kmalloc_oob_in_memset(struct kunit *test) char *ptr; size_t size = 128 - KASAN_GRANULE_SIZE; + KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test); + ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); @@ -566,6 +589,8 @@ static void kmalloc_memmove_negative_size(struct kunit *test) size_t size = 64; size_t invalid_size = -2; + KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test); + /* * Hardware tag-based mode doesn't check memmove for negative size. * As a result, this test introduces a side-effect memory corruption, @@ -590,6 +615,8 @@ static void kmalloc_memmove_invalid_size(struct kunit *test) size_t size = 64; size_t invalid_size = size; + KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test); + ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); @@ -618,6 +645,8 @@ static void kmalloc_uaf_memset(struct kunit *test) char *ptr; size_t size = 33; + KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test); + /* * Only generic KASAN uses quarantine, which is required to avoid a * kernel memory corruption this test causes. diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 3703983a8e55..c8b86f3273b5 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -38,11 +38,14 @@ bool __kasan_check_write(const volatile void *p, unsigned int size) } EXPORT_SYMBOL(__kasan_check_write); -#ifndef CONFIG_GENERIC_ENTRY +#if !defined(CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX) && !defined(CONFIG_GENERIC_ENTRY) /* * CONFIG_GENERIC_ENTRY relies on compiler emitted mem*() calls to not be * instrumented. KASAN enabled toolchains should emit __asan_mem*() functions * for the sites they want to instrument. + * + * If we have a compiler that can instrument meminstrinsics, never override + * these, so that non-instrumented files can safely consider them as builtins. */ #undef memset void *memset(void *addr, int c, size_t len) @@ -107,6 +110,17 @@ void *__asan_memcpy(void *dest, const void *src, size_t len) } EXPORT_SYMBOL(__asan_memcpy); +#ifdef CONFIG_KASAN_SW_TAGS +void *__hwasan_memset(void *addr, int c, size_t len) __alias(__asan_memset); +EXPORT_SYMBOL(__hwasan_memset); +#ifdef __HAVE_ARCH_MEMMOVE +void *__hwasan_memmove(void *dest, const void *src, size_t len) __alias(__asan_memmove); +EXPORT_SYMBOL(__hwasan_memmove); +#endif +void *__hwasan_memcpy(void *dest, const void *src, size_t len) __alias(__asan_memcpy); +EXPORT_SYMBOL(__hwasan_memcpy); +#endif + void kasan_poison(const void *addr, size_t size, u8 value, bool init) { void *shadow_start, *shadow_end; diff --git a/mm/kfence/Makefile b/mm/kfence/Makefile index 0bb95728a784..2de2a58d11a1 100644 --- a/mm/kfence/Makefile +++ b/mm/kfence/Makefile @@ -2,5 +2,5 @@ obj-y := core.o report.o -CFLAGS_kfence_test.o := -g -fno-omit-frame-pointer -fno-optimize-sibling-calls +CFLAGS_kfence_test.o := -fno-omit-frame-pointer -fno-optimize-sibling-calls obj-$(CONFIG_KFENCE_KUNIT_TEST) += kfence_test.o diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 5349c37a5dac..1065e0568d05 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -556,15 +556,11 @@ static unsigned long kfence_init_pool(void) * enters __slab_free() slow-path. */ for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { - struct slab *slab = page_slab(&pages[i]); + struct slab *slab = page_slab(nth_page(pages, i)); if (!i || (i % 2)) continue; - /* Verify we do not have a compound head page. */ - if (WARN_ON(compound_head(&pages[i]) != &pages[i])) - return addr; - __folio_set_slab(slab_folio(slab)); #ifdef CONFIG_MEMCG slab->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg | @@ -597,12 +593,26 @@ static unsigned long kfence_init_pool(void) /* Protect the right redzone. */ if (unlikely(!kfence_protect(addr + PAGE_SIZE))) - return addr; + goto reset_slab; addr += 2 * PAGE_SIZE; } return 0; + +reset_slab: + for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { + struct slab *slab = page_slab(nth_page(pages, i)); + + if (!i || (i % 2)) + continue; +#ifdef CONFIG_MEMCG + slab->memcg_data = 0; +#endif + __folio_clear_slab(slab_folio(slab)); + } + + return addr; } static bool __init kfence_init_pool_early(void) @@ -632,16 +642,6 @@ static bool __init kfence_init_pool_early(void) * fails for the first page, and therefore expect addr==__kfence_pool in * most failure cases. */ - for (char *p = (char *)addr; p < __kfence_pool + KFENCE_POOL_SIZE; p += PAGE_SIZE) { - struct slab *slab = virt_to_slab(p); - - if (!slab) - continue; -#ifdef CONFIG_MEMCG - slab->memcg_data = 0; -#endif - __folio_clear_slab(slab_folio(slab)); - } memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool)); __kfence_pool = NULL; return false; @@ -726,10 +726,14 @@ static const struct seq_operations objects_sops = { }; DEFINE_SEQ_ATTRIBUTE(objects); -static int __init kfence_debugfs_init(void) +static int kfence_debugfs_init(void) { - struct dentry *kfence_dir = debugfs_create_dir("kfence", NULL); + struct dentry *kfence_dir; + + if (!READ_ONCE(kfence_enabled)) + return 0; + kfence_dir = debugfs_create_dir("kfence", NULL); debugfs_create_file("stats", 0444, kfence_dir, NULL, &stats_fops); debugfs_create_file("objects", 0400, kfence_dir, NULL, &objects_fops); return 0; @@ -883,6 +887,8 @@ static int kfence_init_late(void) } kfence_init_enable(); + kfence_debugfs_init(); + return 0; } @@ -988,9 +988,15 @@ static int unmerge_and_remove_all_rmap_items(void) mm = mm_slot->slot.mm; mmap_read_lock(mm); + + /* + * Exit right away if mm is exiting to avoid lockdep issue in + * the maple tree + */ + if (ksm_test_exit(mm)) + goto mm_exiting; + for_each_vma(vmi, vma) { - if (ksm_test_exit(mm)) - break; if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) continue; err = unmerge_ksm_pages(vma, @@ -999,6 +1005,7 @@ static int unmerge_and_remove_all_rmap_items(void) goto error; } +mm_exiting: remove_trailing_rmap_items(&mm_slot->rmap_list); mmap_read_unlock(mm); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a1ede7bdce95..fae9baf3be16 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1069,7 +1069,7 @@ static int me_pagecache_dirty(struct page_state *ps, struct page *p) * cache and swap cache(ie. page is freshly swapped in). So it could be * referenced concurrently by 2 types of PTEs: * normal PTEs and swap PTEs. We try to handle them consistently by calling - * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs, + * try_to_unmap(!TTU_HWPOISON) to convert the normal PTEs to swap PTEs, * and then * - clear dirty bit to prevent IO * - remove from LRU @@ -1486,7 +1486,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, int flags, struct page *hpage) { struct folio *folio = page_folio(hpage); - enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC; + enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON; struct address_space *mapping; LIST_HEAD(tokill); bool unmap_success; @@ -1516,7 +1516,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, if (PageSwapCache(p)) { pr_err("%#lx: keeping poisoned page in swap cache\n", pfn); - ttu |= TTU_IGNORE_HWPOISON; + ttu &= ~TTU_HWPOISON; } /* @@ -1531,7 +1531,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, if (page_mkclean(hpage)) { SetPageDirty(hpage); } else { - ttu |= TTU_IGNORE_HWPOISON; + ttu &= ~TTU_HWPOISON; pr_info("%#lx: corrupted page was clean: dropped without side effects\n", pfn); } diff --git a/mm/memory.c b/mm/memory.c index f456f3b5049c..01a23ad48a04 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3563,8 +3563,21 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct mmu_notifier_range range; - if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) + /* + * We need a reference to lock the folio because we don't hold + * the PTL so a racing thread can remove the device-exclusive + * entry and unmap it. If the folio is free the entry must + * have been removed already. If it happens to have already + * been re-allocated after being freed all we do is lock and + * unlock it. + */ + if (!folio_try_get(folio)) + return 0; + + if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) { + folio_put(folio); return VM_FAULT_RETRY; + } mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma->vm_mm, vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL); @@ -3577,6 +3590,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) pte_unmap_unlock(vmf->pte, vmf->ptl); folio_unlock(folio); + folio_put(folio); mmu_notifier_invalidate_range_end(&range); return 0; diff --git a/mm/migrate.c b/mm/migrate.c index 37865f85df6d..db3f154446af 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1035,11 +1035,16 @@ out: * destination folio. This is safe because nobody is using them * except us. */ +union migration_ptr { + struct anon_vma *anon_vma; + struct address_space *mapping; +}; static void __migrate_folio_record(struct folio *dst, unsigned long page_was_mapped, struct anon_vma *anon_vma) { - dst->mapping = (void *)anon_vma; + union migration_ptr ptr = { .anon_vma = anon_vma }; + dst->mapping = ptr.mapping; dst->private = (void *)page_was_mapped; } @@ -1047,7 +1052,8 @@ static void __migrate_folio_extract(struct folio *dst, int *page_was_mappedp, struct anon_vma **anon_vmap) { - *anon_vmap = (void *)dst->mapping; + union migration_ptr ptr = { .mapping = dst->mapping }; + *anon_vmap = ptr.anon_vma; *page_was_mappedp = (unsigned long)dst->private; dst->mapping = NULL; dst->private = NULL; @@ -1106,9 +1112,8 @@ static void migrate_folio_done(struct folio *src, /* Obtain the lock on page, remove all ptes. */ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page, unsigned long private, struct folio *src, - struct folio **dstp, int force, bool avoid_force_lock, - enum migrate_mode mode, enum migrate_reason reason, - struct list_head *ret) + struct folio **dstp, enum migrate_mode mode, + enum migrate_reason reason, struct list_head *ret) { struct folio *dst; int rc = -EAGAIN; @@ -1138,7 +1143,7 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page dst->private = NULL; if (!folio_trylock(src)) { - if (!force || mode == MIGRATE_ASYNC) + if (mode == MIGRATE_ASYNC) goto out; /* @@ -1157,17 +1162,6 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page if (current->flags & PF_MEMALLOC) goto out; - /* - * We have locked some folios and are going to wait to lock - * this folio. To avoid a potential deadlock, let's bail - * out and not do that. The locked folios will be moved and - * unlocked, then we can wait to lock this folio. - */ - if (avoid_force_lock) { - rc = -EDEADLOCK; - goto out; - } - folio_lock(src); } locked = true; @@ -1187,8 +1181,6 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page rc = -EBUSY; goto out; } - if (!force) - goto out; folio_wait_writeback(src); } @@ -1247,7 +1239,7 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page /* Establish migration ptes */ VM_BUG_ON_FOLIO(folio_test_anon(src) && !folio_test_ksm(src) && !anon_vma, src); - try_to_migrate(src, TTU_BATCH_FLUSH); + try_to_migrate(src, mode == MIGRATE_ASYNC ? TTU_BATCH_FLUSH : 0); page_was_mapped = 1; } @@ -1261,7 +1253,7 @@ out: * A folio that has not been unmapped will be restored to * right list unless we want to retry. */ - if (rc == -EAGAIN || rc == -EDEADLOCK) + if (rc == -EAGAIN) ret = NULL; migrate_folio_undo_src(src, page_was_mapped, anon_vma, locked, ret); @@ -1502,6 +1494,9 @@ static inline int try_split_folio(struct folio *folio, struct list_head *split_f #define NR_MAX_BATCHED_MIGRATION 512 #endif #define NR_MAX_MIGRATE_PAGES_RETRY 10 +#define NR_MAX_MIGRATE_ASYNC_RETRY 3 +#define NR_MAX_MIGRATE_SYNC_RETRY \ + (NR_MAX_MIGRATE_PAGES_RETRY - NR_MAX_MIGRATE_ASYNC_RETRY) struct migrate_pages_stats { int nr_succeeded; /* Normal and large folios migrated successfully, in @@ -1612,13 +1607,19 @@ static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page, /* * migrate_pages_batch() first unmaps folios in the from list as many as * possible, then move the unmapped folios. + * + * We only batch migration if mode == MIGRATE_ASYNC to avoid to wait a + * lock or bit when we have locked more than one folio. Which may cause + * deadlock (e.g., for loop device). So, if mode != MIGRATE_ASYNC, the + * length of the from list must be <= 1. */ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, free_page_t put_new_page, unsigned long private, enum migrate_mode mode, int reason, struct list_head *ret_folios, - struct migrate_pages_stats *stats) + struct list_head *split_folios, struct migrate_pages_stats *stats, + int nr_pass) { - int retry; + int retry = 1; int large_retry = 1; int thp_retry = 1; int nr_failed = 0; @@ -1628,21 +1629,15 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, bool is_large = false; bool is_thp = false; struct folio *folio, *folio2, *dst = NULL, *dst2; - int rc, rc_saved, nr_pages; - LIST_HEAD(split_folios); + int rc, rc_saved = 0, nr_pages; LIST_HEAD(unmap_folios); LIST_HEAD(dst_folios); bool nosplit = (reason == MR_NUMA_MISPLACED); - bool no_split_folio_counting = false; - bool avoid_force_lock; -retry: - rc_saved = 0; - avoid_force_lock = false; - retry = 1; - for (pass = 0; - pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry); - pass++) { + VM_WARN_ON_ONCE(mode != MIGRATE_ASYNC && + !list_empty(from) && !list_is_singular(from)); + + for (pass = 0; pass < nr_pass && (retry || large_retry); pass++) { retry = 0; large_retry = 0; thp_retry = 0; @@ -1673,7 +1668,7 @@ retry: if (!thp_migration_supported() && is_thp) { nr_large_failed++; stats->nr_thp_failed++; - if (!try_split_folio(folio, &split_folios)) { + if (!try_split_folio(folio, split_folios)) { stats->nr_thp_split++; continue; } @@ -1683,15 +1678,13 @@ retry: } rc = migrate_folio_unmap(get_new_page, put_new_page, private, - folio, &dst, pass > 2, avoid_force_lock, - mode, reason, ret_folios); + folio, &dst, mode, reason, ret_folios); /* * The rules are: * Success: folio will be freed * Unmap: folio will be put on unmap_folios list, * dst folio put on dst_folios list * -EAGAIN: stay on the from list - * -EDEADLOCK: stay on the from list * -ENOMEM: stay on the from list * Other errno: put on ret_folios list */ @@ -1706,7 +1699,7 @@ retry: stats->nr_thp_failed += is_thp; /* Large folio NUMA faulting doesn't split to retry. */ if (!nosplit) { - int ret = try_split_folio(folio, &split_folios); + int ret = try_split_folio(folio, split_folios); if (!ret) { stats->nr_thp_split += is_thp; @@ -1723,18 +1716,11 @@ retry: break; } } - } else if (!no_split_folio_counting) { + } else { nr_failed++; } stats->nr_failed_pages += nr_pages + nr_retry_pages; - /* - * There might be some split folios of fail-to-migrate large - * folios left in split_folios list. Move them to ret_folios - * list so that they could be put back to the right list by - * the caller otherwise the folio refcnt will be leaked. - */ - list_splice_init(&split_folios, ret_folios); /* nr_failed isn't updated for not used */ nr_large_failed += large_retry; stats->nr_thp_failed += thp_retry; @@ -1743,19 +1729,11 @@ retry: goto out; else goto move; - case -EDEADLOCK: - /* - * The folio cannot be locked for potential deadlock. - * Go move (and unlock) all locked folios. Then we can - * try again. - */ - rc_saved = rc; - goto move; case -EAGAIN: if (is_large) { large_retry++; thp_retry += is_thp; - } else if (!no_split_folio_counting) { + } else { retry++; } nr_retry_pages += nr_pages; @@ -1765,11 +1743,6 @@ retry: stats->nr_thp_succeeded += is_thp; break; case MIGRATEPAGE_UNMAP: - /* - * We have locked some folios, don't force lock - * to avoid deadlock. - */ - avoid_force_lock = true; list_move_tail(&folio->lru, &unmap_folios); list_add_tail(&dst->lru, &dst_folios); break; @@ -1783,7 +1756,7 @@ retry: if (is_large) { nr_large_failed++; stats->nr_thp_failed += is_thp; - } else if (!no_split_folio_counting) { + } else { nr_failed++; } @@ -1801,9 +1774,7 @@ move: try_to_unmap_flush(); retry = 1; - for (pass = 0; - pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry); - pass++) { + for (pass = 0; pass < nr_pass && (retry || large_retry); pass++) { retry = 0; large_retry = 0; thp_retry = 0; @@ -1832,7 +1803,7 @@ move: if (is_large) { large_retry++; thp_retry += is_thp; - } else if (!no_split_folio_counting) { + } else { retry++; } nr_retry_pages += nr_pages; @@ -1845,7 +1816,7 @@ move: if (is_large) { nr_large_failed++; stats->nr_thp_failed += is_thp; - } else if (!no_split_folio_counting) { + } else { nr_failed++; } @@ -1882,30 +1853,52 @@ out: dst2 = list_next_entry(dst, lru); } - /* - * Try to migrate split folios of fail-to-migrate large folios, no - * nr_failed counting in this round, since all split folios of a - * large folio is counted as 1 failure in the first round. - */ - if (rc >= 0 && !list_empty(&split_folios)) { - /* - * Move non-migrated folios (after NR_MAX_MIGRATE_PAGES_RETRY - * retries) to ret_folios to avoid migrating them again. - */ - list_splice_init(from, ret_folios); - list_splice_init(&split_folios, from); - no_split_folio_counting = true; - goto retry; - } + return rc; +} +static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page, + free_page_t put_new_page, unsigned long private, + enum migrate_mode mode, int reason, struct list_head *ret_folios, + struct list_head *split_folios, struct migrate_pages_stats *stats) +{ + int rc, nr_failed = 0; + LIST_HEAD(folios); + struct migrate_pages_stats astats; + + memset(&astats, 0, sizeof(astats)); + /* Try to migrate in batch with MIGRATE_ASYNC mode firstly */ + rc = migrate_pages_batch(from, get_new_page, put_new_page, private, MIGRATE_ASYNC, + reason, &folios, split_folios, &astats, + NR_MAX_MIGRATE_ASYNC_RETRY); + stats->nr_succeeded += astats.nr_succeeded; + stats->nr_thp_succeeded += astats.nr_thp_succeeded; + stats->nr_thp_split += astats.nr_thp_split; + if (rc < 0) { + stats->nr_failed_pages += astats.nr_failed_pages; + stats->nr_thp_failed += astats.nr_thp_failed; + list_splice_tail(&folios, ret_folios); + return rc; + } + stats->nr_thp_failed += astats.nr_thp_split; + nr_failed += astats.nr_thp_split; /* - * We have unlocked all locked folios, so we can force lock now, let's - * try again. + * Fall back to migrate all failed folios one by one synchronously. All + * failed folios except split THPs will be retried, so their failure + * isn't counted */ - if (rc == -EDEADLOCK) - goto retry; + list_splice_tail_init(&folios, from); + while (!list_empty(from)) { + list_move(from->next, &folios); + rc = migrate_pages_batch(&folios, get_new_page, put_new_page, + private, mode, reason, ret_folios, + split_folios, stats, NR_MAX_MIGRATE_SYNC_RETRY); + list_splice_tail_init(&folios, ret_folios); + if (rc < 0) + return rc; + nr_failed += rc; + } - return rc; + return nr_failed; } /* @@ -1943,6 +1936,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, struct folio *folio, *folio2; LIST_HEAD(folios); LIST_HEAD(ret_folios); + LIST_HEAD(split_folios); struct migrate_pages_stats stats; trace_mm_migrate_pages_start(mode, reason); @@ -1953,6 +1947,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, mode, reason, &stats, &ret_folios); if (rc_gather < 0) goto out; + again: nr_pages = 0; list_for_each_entry_safe(folio, folio2, from, lru) { @@ -1963,20 +1958,36 @@ again: } nr_pages += folio_nr_pages(folio); - if (nr_pages > NR_MAX_BATCHED_MIGRATION) + if (nr_pages >= NR_MAX_BATCHED_MIGRATION) break; } - if (nr_pages > NR_MAX_BATCHED_MIGRATION) - list_cut_before(&folios, from, &folio->lru); + if (nr_pages >= NR_MAX_BATCHED_MIGRATION) + list_cut_before(&folios, from, &folio2->lru); else list_splice_init(from, &folios); - rc = migrate_pages_batch(&folios, get_new_page, put_new_page, private, - mode, reason, &ret_folios, &stats); + if (mode == MIGRATE_ASYNC) + rc = migrate_pages_batch(&folios, get_new_page, put_new_page, private, + mode, reason, &ret_folios, &split_folios, &stats, + NR_MAX_MIGRATE_PAGES_RETRY); + else + rc = migrate_pages_sync(&folios, get_new_page, put_new_page, private, + mode, reason, &ret_folios, &split_folios, &stats); list_splice_tail_init(&folios, &ret_folios); if (rc < 0) { rc_gather = rc; + list_splice_tail(&split_folios, &ret_folios); goto out; } + if (!list_empty(&split_folios)) { + /* + * Failure isn't counted since all split folios of a large folio + * is counted as 1 failure already. And, we only try to migrate + * with minimal effort, force MIGRATE_ASYNC mode and retry once. + */ + migrate_pages_batch(&split_folios, get_new_page, put_new_page, private, + MIGRATE_ASYNC, reason, &ret_folios, NULL, &stats, 1); + list_splice_tail_init(&split_folios, &ret_folios); + } rc_gather += rc; if (!list_empty(from)) goto again; diff --git a/mm/mincore.c b/mm/mincore.c index cd69b9db0081..d359650b0f75 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -33,7 +33,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, * Hugepages under user process are always in RAM and never * swapped out, but theoretically it needs to be checked. */ - present = pte && !huge_pte_none(huge_ptep_get(pte)); + present = pte && !huge_pte_none_mostly(huge_ptep_get(pte)); for (; addr != end; vec++, addr += PAGE_SIZE) *vec = present; walk->private = vec; diff --git a/mm/mmap.c b/mm/mmap.c index 20f21f0949dd..ff68a67a2a7c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -973,7 +973,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, vma_end = addr; adjust = mid; adj_next = -(vma->vm_end - addr); - err = dup_anon_vma(res, adjust); + err = dup_anon_vma(adjust, prev); } else { vma = next; /* case 3 */ vma_start = addr; @@ -2277,7 +2277,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, int count = 0; int error = -ENOMEM; MA_STATE(mas_detach, &mt_detach, 0, 0); - mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN); + mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); mt_set_external_lock(&mt_detach, &mm->mmap_lock); /* @@ -2621,12 +2621,7 @@ cannot_expand: if (map_deny_write_exec(vma, vma->vm_flags)) { error = -EACCES; - if (file) - goto close_and_free_vma; - else if (vma->vm_file) - goto unmap_and_free_vma; - else - goto free_vma; + goto close_and_free_vma; } /* Allow architectures to sanity-check the vm_flags */ @@ -3042,6 +3037,7 @@ void exit_mmap(struct mm_struct *mm) */ set_bit(MMF_OOM_SKIP, &mm->flags); mmap_write_lock(mm); + mt_clear_in_rcu(&mm->mm_mt); free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb); diff --git a/mm/mprotect.c b/mm/mprotect.c index 231929f119d9..13e84d8c0797 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -805,7 +805,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if (map_deny_write_exec(vma, newflags)) { error = -EACCES; - goto out; + break; } /* Allow architectures to sanity-check the new flags */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ac1fc986af44..7136c36c5d01 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1398,6 +1398,7 @@ static __always_inline bool free_pages_prepare(struct page *page, unsigned int order, bool check_free, fpi_t fpi_flags) { int bad = 0; + bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags); bool init = want_init_on_free(); VM_BUG_ON_PAGE(PageTail(page), page); @@ -1470,7 +1471,7 @@ static __always_inline bool free_pages_prepare(struct page *page, * With hardware tag-based KASAN, memory tags must be set before the * page becomes unavailable via debug_pagealloc or arch_free_page. */ - if (!should_skip_kasan_poison(page, fpi_flags)) { + if (!skip_kasan_poison) { kasan_poison_pages(page, order, init); /* Memory is already initialized if KASAN did it internally. */ diff --git a/mm/rmap.c b/mm/rmap.c index 15ae24585fc4..8632e02661ac 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1602,7 +1602,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, /* Update high watermark before we lower rss */ update_hiwater_rss(mm); - if (PageHWPoison(subpage) && !(flags & TTU_IGNORE_HWPOISON)) { + if (PageHWPoison(subpage) && (flags & TTU_HWPOISON)) { pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (folio_test_hugetlb(folio)) { hugetlb_count_sub(folio_nr_pages(folio), mm); diff --git a/mm/slab.c b/mm/slab.c index dabc2a671fc6..edbe722fb906 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -839,7 +839,7 @@ static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp) return 0; } -#if (defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)) || defined(CONFIG_SMP) +#if defined(CONFIG_NUMA) || defined(CONFIG_SMP) /* * Allocates and initializes node for a node on each slab cache, used for * either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node diff --git a/mm/swapfile.c b/mm/swapfile.c index 62ba2bf577d7..2c718f45745f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -679,6 +679,7 @@ static void __del_from_avail_list(struct swap_info_struct *p) { int nid; + assert_spin_locked(&p->lock); for_each_node(nid) plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]); } @@ -2434,8 +2435,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); goto out_dput; } - del_from_avail_list(p); spin_lock(&p->lock); + del_from_avail_list(p); if (p->prio < 0) { struct swap_info_struct *si = p; int nid; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ef910bf349e1..a50072066221 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2883,6 +2883,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid, unsigned int order, unsigned int nr_pages, struct page **pages) { unsigned int nr_allocated = 0; + gfp_t alloc_gfp = gfp; + bool nofail = false; struct page *page; int i; @@ -2893,6 +2895,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * more permissive. */ if (!order) { + /* bulk allocator doesn't support nofail req. officially */ gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL; while (nr_allocated < nr_pages) { @@ -2931,20 +2934,35 @@ vm_area_alloc_pages(gfp_t gfp, int nid, if (nr != nr_pages_request) break; } + } else if (gfp & __GFP_NOFAIL) { + /* + * Higher order nofail allocations are really expensive and + * potentially dangerous (pre-mature OOM, disruptive reclaim + * and compaction etc. + */ + alloc_gfp &= ~__GFP_NOFAIL; + nofail = true; } /* High-order pages or fallback path if "bulk" fails. */ - while (nr_allocated < nr_pages) { if (fatal_signal_pending(current)) break; if (nid == NUMA_NO_NODE) - page = alloc_pages(gfp, order); + page = alloc_pages(alloc_gfp, order); else - page = alloc_pages_node(nid, gfp, order); - if (unlikely(!page)) - break; + page = alloc_pages_node(nid, alloc_gfp, order); + if (unlikely(!page)) { + if (!nofail) + break; + + /* fall back to the zero order allocations */ + alloc_gfp |= __GFP_NOFAIL; + order = 0; + continue; + } + /* * Higher order allocations must be able to be treated as * indepdenent small pages by callers (as they can with @@ -3024,9 +3042,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, * allocation request, free them via vfree() if any. */ if (area->nr_pages != nr_small_pages) { - warn_alloc(gfp_mask, NULL, - "vmalloc error: size %lu, page order %u, failed to allocate pages", - area->nr_pages * PAGE_SIZE, page_order); + /* vm_area_alloc_pages() can also fail due to a fatal signal */ + if (!fatal_signal_pending(current)) + warn_alloc(gfp_mask, NULL, + "vmalloc error: size %lu, page order %u, failed to allocate pages", + area->nr_pages * PAGE_SIZE, page_order); goto fail; } |