summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/damon/paddr.c12
-rw-r--r--mm/huge_memory.c6
-rw-r--r--mm/hugetlb.c14
-rw-r--r--mm/kasan/Makefile9
-rw-r--r--mm/kasan/kasan.h4
-rw-r--r--mm/kasan/kasan_test.c29
-rw-r--r--mm/kasan/shadow.c16
-rw-r--r--mm/kfence/Makefile2
-rw-r--r--mm/kfence/core.c42
-rw-r--r--mm/ksm.c11
-rw-r--r--mm/memory-failure.c8
-rw-r--r--mm/memory.c16
-rw-r--r--mm/migrate.c195
-rw-r--r--mm/mincore.c2
-rw-r--r--mm/mmap.c12
-rw-r--r--mm/mprotect.c2
-rw-r--r--mm/page_alloc.c3
-rw-r--r--mm/rmap.c2
-rw-r--r--mm/slab.c2
-rw-r--r--mm/swapfile.c3
-rw-r--r--mm/vmalloc.c36
21 files changed, 273 insertions, 153 deletions
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 607bb69e526c..dd9c33fbe805 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -130,7 +130,6 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz)
accessed = false;
else
accessed = true;
- folio_put(folio);
goto out;
}
@@ -144,10 +143,10 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz)
if (need_lock)
folio_unlock(folio);
- folio_put(folio);
out:
*folio_sz = folio_size(folio);
+ folio_put(folio);
return accessed;
}
@@ -250,12 +249,11 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
folio_put(folio);
continue;
}
- if (folio_test_unevictable(folio)) {
+ if (folio_test_unevictable(folio))
folio_putback_lru(folio);
- } else {
+ else
list_add(&folio->lru, &folio_list);
- folio_put(folio);
- }
+ folio_put(folio);
}
applied = reclaim_pages(&folio_list);
cond_resched();
@@ -282,8 +280,8 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate(
folio_mark_accessed(folio);
else
folio_deactivate(folio);
- folio_put(folio);
applied += folio_nr_pages(folio);
+ folio_put(folio);
}
return applied * PAGE_SIZE;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 4fc43859e59a..032fb0ef9cd1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2037,7 +2037,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
pgtable_t pgtable;
- pmd_t _pmd;
+ pmd_t _pmd, old_pmd;
int i;
/*
@@ -2048,7 +2048,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
*
* See Documentation/mm/mmu_notifier.rst
*/
- pmdp_huge_clear_flush(vma, haddr, pmd);
+ old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
@@ -2057,6 +2057,8 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
pte_t *pte, entry;
entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
entry = pte_mkspecial(entry);
+ if (pmd_uffd_wp(old_pmd))
+ entry = pte_mkuffd_wp(entry);
pte = pte_offset_map(&_pmd, haddr);
VM_BUG_ON(!pte_none(*pte));
set_pte_at(mm, haddr, pte, entry);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 07abcb6eb203..245038a9fe4e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5478,7 +5478,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
struct folio *pagecache_folio, spinlock_t *ptl)
{
const bool unshare = flags & FAULT_FLAG_UNSHARE;
- pte_t pte;
+ pte_t pte = huge_ptep_get(ptep);
struct hstate *h = hstate_vma(vma);
struct page *old_page;
struct folio *new_folio;
@@ -5488,6 +5488,17 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
struct mmu_notifier_range range;
/*
+ * Never handle CoW for uffd-wp protected pages. It should be only
+ * handled when the uffd-wp protection is removed.
+ *
+ * Note that only the CoW optimization path (in hugetlb_no_page())
+ * can trigger this, because hugetlb_fault() will always resolve
+ * uffd-wp bit first.
+ */
+ if (!unshare && huge_pte_uffd_wp(pte))
+ return 0;
+
+ /*
* hugetlb does not support FOLL_FORCE-style write faults that keep the
* PTE mapped R/O such as maybe_mkwrite() would do.
*/
@@ -5500,7 +5511,6 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
return 0;
}
- pte = huge_ptep_get(ptep);
old_page = pte_page(pte);
delayacct_wpcopy_start();
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index d4837bff3b60..7634dd2a6128 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -35,7 +35,14 @@ CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
-CFLAGS_KASAN_TEST := $(CFLAGS_KASAN) -fno-builtin $(call cc-disable-warning, vla)
+CFLAGS_KASAN_TEST := $(CFLAGS_KASAN) $(call cc-disable-warning, vla)
+ifndef CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX
+# If compiler instruments memintrinsics by prefixing them with __asan/__hwasan,
+# we need to treat them normally (as builtins), otherwise the compiler won't
+# recognize them as instrumentable. If it doesn't instrument them, we need to
+# pass -fno-builtin, so the compiler doesn't inline them.
+CFLAGS_KASAN_TEST += -fno-builtin
+endif
CFLAGS_kasan_test.o := $(CFLAGS_KASAN_TEST)
CFLAGS_kasan_test_module.o := $(CFLAGS_KASAN_TEST)
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 9377b0789edc..a61eeee3095a 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -666,4 +666,8 @@ void __hwasan_storeN_noabort(unsigned long addr, size_t size);
void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size);
+void *__hwasan_memset(void *addr, int c, size_t len);
+void *__hwasan_memmove(void *dest, const void *src, size_t len);
+void *__hwasan_memcpy(void *dest, const void *src, size_t len);
+
#endif /* __MM_KASAN_KASAN_H */
diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c
index 74cd80c12b25..627eaf1ee1db 100644
--- a/mm/kasan/kasan_test.c
+++ b/mm/kasan/kasan_test.c
@@ -165,6 +165,15 @@ static void kasan_test_exit(struct kunit *test)
kunit_skip((test), "Test requires " #config "=n"); \
} while (0)
+#define KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test) do { \
+ if (IS_ENABLED(CONFIG_KASAN_HW_TAGS)) \
+ break; /* No compiler instrumentation. */ \
+ if (IS_ENABLED(CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX)) \
+ break; /* Should always be instrumented! */ \
+ if (IS_ENABLED(CONFIG_GENERIC_ENTRY)) \
+ kunit_skip((test), "Test requires checked mem*()"); \
+} while (0)
+
static void kmalloc_oob_right(struct kunit *test)
{
char *ptr;
@@ -454,6 +463,8 @@ static void kmalloc_oob_16(struct kunit *test)
u64 words[2];
} *ptr1, *ptr2;
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
/* This test is specifically crafted for the generic mode. */
KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
@@ -476,6 +487,8 @@ static void kmalloc_uaf_16(struct kunit *test)
u64 words[2];
} *ptr1, *ptr2;
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
ptr1 = kmalloc(sizeof(*ptr1), GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
@@ -498,6 +511,8 @@ static void kmalloc_oob_memset_2(struct kunit *test)
char *ptr;
size_t size = 128 - KASAN_GRANULE_SIZE;
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
ptr = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
@@ -511,6 +526,8 @@ static void kmalloc_oob_memset_4(struct kunit *test)
char *ptr;
size_t size = 128 - KASAN_GRANULE_SIZE;
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
ptr = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
@@ -524,6 +541,8 @@ static void kmalloc_oob_memset_8(struct kunit *test)
char *ptr;
size_t size = 128 - KASAN_GRANULE_SIZE;
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
ptr = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
@@ -537,6 +556,8 @@ static void kmalloc_oob_memset_16(struct kunit *test)
char *ptr;
size_t size = 128 - KASAN_GRANULE_SIZE;
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
ptr = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
@@ -550,6 +571,8 @@ static void kmalloc_oob_in_memset(struct kunit *test)
char *ptr;
size_t size = 128 - KASAN_GRANULE_SIZE;
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
ptr = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
@@ -566,6 +589,8 @@ static void kmalloc_memmove_negative_size(struct kunit *test)
size_t size = 64;
size_t invalid_size = -2;
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
/*
* Hardware tag-based mode doesn't check memmove for negative size.
* As a result, this test introduces a side-effect memory corruption,
@@ -590,6 +615,8 @@ static void kmalloc_memmove_invalid_size(struct kunit *test)
size_t size = 64;
size_t invalid_size = size;
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
ptr = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
@@ -618,6 +645,8 @@ static void kmalloc_uaf_memset(struct kunit *test)
char *ptr;
size_t size = 33;
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
/*
* Only generic KASAN uses quarantine, which is required to avoid a
* kernel memory corruption this test causes.
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 3703983a8e55..c8b86f3273b5 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -38,11 +38,14 @@ bool __kasan_check_write(const volatile void *p, unsigned int size)
}
EXPORT_SYMBOL(__kasan_check_write);
-#ifndef CONFIG_GENERIC_ENTRY
+#if !defined(CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX) && !defined(CONFIG_GENERIC_ENTRY)
/*
* CONFIG_GENERIC_ENTRY relies on compiler emitted mem*() calls to not be
* instrumented. KASAN enabled toolchains should emit __asan_mem*() functions
* for the sites they want to instrument.
+ *
+ * If we have a compiler that can instrument meminstrinsics, never override
+ * these, so that non-instrumented files can safely consider them as builtins.
*/
#undef memset
void *memset(void *addr, int c, size_t len)
@@ -107,6 +110,17 @@ void *__asan_memcpy(void *dest, const void *src, size_t len)
}
EXPORT_SYMBOL(__asan_memcpy);
+#ifdef CONFIG_KASAN_SW_TAGS
+void *__hwasan_memset(void *addr, int c, size_t len) __alias(__asan_memset);
+EXPORT_SYMBOL(__hwasan_memset);
+#ifdef __HAVE_ARCH_MEMMOVE
+void *__hwasan_memmove(void *dest, const void *src, size_t len) __alias(__asan_memmove);
+EXPORT_SYMBOL(__hwasan_memmove);
+#endif
+void *__hwasan_memcpy(void *dest, const void *src, size_t len) __alias(__asan_memcpy);
+EXPORT_SYMBOL(__hwasan_memcpy);
+#endif
+
void kasan_poison(const void *addr, size_t size, u8 value, bool init)
{
void *shadow_start, *shadow_end;
diff --git a/mm/kfence/Makefile b/mm/kfence/Makefile
index 0bb95728a784..2de2a58d11a1 100644
--- a/mm/kfence/Makefile
+++ b/mm/kfence/Makefile
@@ -2,5 +2,5 @@
obj-y := core.o report.o
-CFLAGS_kfence_test.o := -g -fno-omit-frame-pointer -fno-optimize-sibling-calls
+CFLAGS_kfence_test.o := -fno-omit-frame-pointer -fno-optimize-sibling-calls
obj-$(CONFIG_KFENCE_KUNIT_TEST) += kfence_test.o
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 5349c37a5dac..1065e0568d05 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -556,15 +556,11 @@ static unsigned long kfence_init_pool(void)
* enters __slab_free() slow-path.
*/
for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
- struct slab *slab = page_slab(&pages[i]);
+ struct slab *slab = page_slab(nth_page(pages, i));
if (!i || (i % 2))
continue;
- /* Verify we do not have a compound head page. */
- if (WARN_ON(compound_head(&pages[i]) != &pages[i]))
- return addr;
-
__folio_set_slab(slab_folio(slab));
#ifdef CONFIG_MEMCG
slab->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg |
@@ -597,12 +593,26 @@ static unsigned long kfence_init_pool(void)
/* Protect the right redzone. */
if (unlikely(!kfence_protect(addr + PAGE_SIZE)))
- return addr;
+ goto reset_slab;
addr += 2 * PAGE_SIZE;
}
return 0;
+
+reset_slab:
+ for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
+ struct slab *slab = page_slab(nth_page(pages, i));
+
+ if (!i || (i % 2))
+ continue;
+#ifdef CONFIG_MEMCG
+ slab->memcg_data = 0;
+#endif
+ __folio_clear_slab(slab_folio(slab));
+ }
+
+ return addr;
}
static bool __init kfence_init_pool_early(void)
@@ -632,16 +642,6 @@ static bool __init kfence_init_pool_early(void)
* fails for the first page, and therefore expect addr==__kfence_pool in
* most failure cases.
*/
- for (char *p = (char *)addr; p < __kfence_pool + KFENCE_POOL_SIZE; p += PAGE_SIZE) {
- struct slab *slab = virt_to_slab(p);
-
- if (!slab)
- continue;
-#ifdef CONFIG_MEMCG
- slab->memcg_data = 0;
-#endif
- __folio_clear_slab(slab_folio(slab));
- }
memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool));
__kfence_pool = NULL;
return false;
@@ -726,10 +726,14 @@ static const struct seq_operations objects_sops = {
};
DEFINE_SEQ_ATTRIBUTE(objects);
-static int __init kfence_debugfs_init(void)
+static int kfence_debugfs_init(void)
{
- struct dentry *kfence_dir = debugfs_create_dir("kfence", NULL);
+ struct dentry *kfence_dir;
+
+ if (!READ_ONCE(kfence_enabled))
+ return 0;
+ kfence_dir = debugfs_create_dir("kfence", NULL);
debugfs_create_file("stats", 0444, kfence_dir, NULL, &stats_fops);
debugfs_create_file("objects", 0400, kfence_dir, NULL, &objects_fops);
return 0;
@@ -883,6 +887,8 @@ static int kfence_init_late(void)
}
kfence_init_enable();
+ kfence_debugfs_init();
+
return 0;
}
diff --git a/mm/ksm.c b/mm/ksm.c
index ad591b779d53..2b8d30068cbb 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -988,9 +988,15 @@ static int unmerge_and_remove_all_rmap_items(void)
mm = mm_slot->slot.mm;
mmap_read_lock(mm);
+
+ /*
+ * Exit right away if mm is exiting to avoid lockdep issue in
+ * the maple tree
+ */
+ if (ksm_test_exit(mm))
+ goto mm_exiting;
+
for_each_vma(vmi, vma) {
- if (ksm_test_exit(mm))
- break;
if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
continue;
err = unmerge_ksm_pages(vma,
@@ -999,6 +1005,7 @@ static int unmerge_and_remove_all_rmap_items(void)
goto error;
}
+mm_exiting:
remove_trailing_rmap_items(&mm_slot->rmap_list);
mmap_read_unlock(mm);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index a1ede7bdce95..fae9baf3be16 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1069,7 +1069,7 @@ static int me_pagecache_dirty(struct page_state *ps, struct page *p)
* cache and swap cache(ie. page is freshly swapped in). So it could be
* referenced concurrently by 2 types of PTEs:
* normal PTEs and swap PTEs. We try to handle them consistently by calling
- * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
+ * try_to_unmap(!TTU_HWPOISON) to convert the normal PTEs to swap PTEs,
* and then
* - clear dirty bit to prevent IO
* - remove from LRU
@@ -1486,7 +1486,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
int flags, struct page *hpage)
{
struct folio *folio = page_folio(hpage);
- enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC;
+ enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
struct address_space *mapping;
LIST_HEAD(tokill);
bool unmap_success;
@@ -1516,7 +1516,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (PageSwapCache(p)) {
pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
- ttu |= TTU_IGNORE_HWPOISON;
+ ttu &= ~TTU_HWPOISON;
}
/*
@@ -1531,7 +1531,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (page_mkclean(hpage)) {
SetPageDirty(hpage);
} else {
- ttu |= TTU_IGNORE_HWPOISON;
+ ttu &= ~TTU_HWPOISON;
pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
pfn);
}
diff --git a/mm/memory.c b/mm/memory.c
index f456f3b5049c..01a23ad48a04 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3563,8 +3563,21 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
struct mmu_notifier_range range;
- if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags))
+ /*
+ * We need a reference to lock the folio because we don't hold
+ * the PTL so a racing thread can remove the device-exclusive
+ * entry and unmap it. If the folio is free the entry must
+ * have been removed already. If it happens to have already
+ * been re-allocated after being freed all we do is lock and
+ * unlock it.
+ */
+ if (!folio_try_get(folio))
+ return 0;
+
+ if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) {
+ folio_put(folio);
return VM_FAULT_RETRY;
+ }
mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
vma->vm_mm, vmf->address & PAGE_MASK,
(vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
@@ -3577,6 +3590,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
folio_unlock(folio);
+ folio_put(folio);
mmu_notifier_invalidate_range_end(&range);
return 0;
diff --git a/mm/migrate.c b/mm/migrate.c
index 37865f85df6d..db3f154446af 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1035,11 +1035,16 @@ out:
* destination folio. This is safe because nobody is using them
* except us.
*/
+union migration_ptr {
+ struct anon_vma *anon_vma;
+ struct address_space *mapping;
+};
static void __migrate_folio_record(struct folio *dst,
unsigned long page_was_mapped,
struct anon_vma *anon_vma)
{
- dst->mapping = (void *)anon_vma;
+ union migration_ptr ptr = { .anon_vma = anon_vma };
+ dst->mapping = ptr.mapping;
dst->private = (void *)page_was_mapped;
}
@@ -1047,7 +1052,8 @@ static void __migrate_folio_extract(struct folio *dst,
int *page_was_mappedp,
struct anon_vma **anon_vmap)
{
- *anon_vmap = (void *)dst->mapping;
+ union migration_ptr ptr = { .mapping = dst->mapping };
+ *anon_vmap = ptr.anon_vma;
*page_was_mappedp = (unsigned long)dst->private;
dst->mapping = NULL;
dst->private = NULL;
@@ -1106,9 +1112,8 @@ static void migrate_folio_done(struct folio *src,
/* Obtain the lock on page, remove all ptes. */
static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page,
unsigned long private, struct folio *src,
- struct folio **dstp, int force, bool avoid_force_lock,
- enum migrate_mode mode, enum migrate_reason reason,
- struct list_head *ret)
+ struct folio **dstp, enum migrate_mode mode,
+ enum migrate_reason reason, struct list_head *ret)
{
struct folio *dst;
int rc = -EAGAIN;
@@ -1138,7 +1143,7 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page
dst->private = NULL;
if (!folio_trylock(src)) {
- if (!force || mode == MIGRATE_ASYNC)
+ if (mode == MIGRATE_ASYNC)
goto out;
/*
@@ -1157,17 +1162,6 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page
if (current->flags & PF_MEMALLOC)
goto out;
- /*
- * We have locked some folios and are going to wait to lock
- * this folio. To avoid a potential deadlock, let's bail
- * out and not do that. The locked folios will be moved and
- * unlocked, then we can wait to lock this folio.
- */
- if (avoid_force_lock) {
- rc = -EDEADLOCK;
- goto out;
- }
-
folio_lock(src);
}
locked = true;
@@ -1187,8 +1181,6 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page
rc = -EBUSY;
goto out;
}
- if (!force)
- goto out;
folio_wait_writeback(src);
}
@@ -1247,7 +1239,7 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page
/* Establish migration ptes */
VM_BUG_ON_FOLIO(folio_test_anon(src) &&
!folio_test_ksm(src) && !anon_vma, src);
- try_to_migrate(src, TTU_BATCH_FLUSH);
+ try_to_migrate(src, mode == MIGRATE_ASYNC ? TTU_BATCH_FLUSH : 0);
page_was_mapped = 1;
}
@@ -1261,7 +1253,7 @@ out:
* A folio that has not been unmapped will be restored to
* right list unless we want to retry.
*/
- if (rc == -EAGAIN || rc == -EDEADLOCK)
+ if (rc == -EAGAIN)
ret = NULL;
migrate_folio_undo_src(src, page_was_mapped, anon_vma, locked, ret);
@@ -1502,6 +1494,9 @@ static inline int try_split_folio(struct folio *folio, struct list_head *split_f
#define NR_MAX_BATCHED_MIGRATION 512
#endif
#define NR_MAX_MIGRATE_PAGES_RETRY 10
+#define NR_MAX_MIGRATE_ASYNC_RETRY 3
+#define NR_MAX_MIGRATE_SYNC_RETRY \
+ (NR_MAX_MIGRATE_PAGES_RETRY - NR_MAX_MIGRATE_ASYNC_RETRY)
struct migrate_pages_stats {
int nr_succeeded; /* Normal and large folios migrated successfully, in
@@ -1612,13 +1607,19 @@ static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page,
/*
* migrate_pages_batch() first unmaps folios in the from list as many as
* possible, then move the unmapped folios.
+ *
+ * We only batch migration if mode == MIGRATE_ASYNC to avoid to wait a
+ * lock or bit when we have locked more than one folio. Which may cause
+ * deadlock (e.g., for loop device). So, if mode != MIGRATE_ASYNC, the
+ * length of the from list must be <= 1.
*/
static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
free_page_t put_new_page, unsigned long private,
enum migrate_mode mode, int reason, struct list_head *ret_folios,
- struct migrate_pages_stats *stats)
+ struct list_head *split_folios, struct migrate_pages_stats *stats,
+ int nr_pass)
{
- int retry;
+ int retry = 1;
int large_retry = 1;
int thp_retry = 1;
int nr_failed = 0;
@@ -1628,21 +1629,15 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
bool is_large = false;
bool is_thp = false;
struct folio *folio, *folio2, *dst = NULL, *dst2;
- int rc, rc_saved, nr_pages;
- LIST_HEAD(split_folios);
+ int rc, rc_saved = 0, nr_pages;
LIST_HEAD(unmap_folios);
LIST_HEAD(dst_folios);
bool nosplit = (reason == MR_NUMA_MISPLACED);
- bool no_split_folio_counting = false;
- bool avoid_force_lock;
-retry:
- rc_saved = 0;
- avoid_force_lock = false;
- retry = 1;
- for (pass = 0;
- pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry);
- pass++) {
+ VM_WARN_ON_ONCE(mode != MIGRATE_ASYNC &&
+ !list_empty(from) && !list_is_singular(from));
+
+ for (pass = 0; pass < nr_pass && (retry || large_retry); pass++) {
retry = 0;
large_retry = 0;
thp_retry = 0;
@@ -1673,7 +1668,7 @@ retry:
if (!thp_migration_supported() && is_thp) {
nr_large_failed++;
stats->nr_thp_failed++;
- if (!try_split_folio(folio, &split_folios)) {
+ if (!try_split_folio(folio, split_folios)) {
stats->nr_thp_split++;
continue;
}
@@ -1683,15 +1678,13 @@ retry:
}
rc = migrate_folio_unmap(get_new_page, put_new_page, private,
- folio, &dst, pass > 2, avoid_force_lock,
- mode, reason, ret_folios);
+ folio, &dst, mode, reason, ret_folios);
/*
* The rules are:
* Success: folio will be freed
* Unmap: folio will be put on unmap_folios list,
* dst folio put on dst_folios list
* -EAGAIN: stay on the from list
- * -EDEADLOCK: stay on the from list
* -ENOMEM: stay on the from list
* Other errno: put on ret_folios list
*/
@@ -1706,7 +1699,7 @@ retry:
stats->nr_thp_failed += is_thp;
/* Large folio NUMA faulting doesn't split to retry. */
if (!nosplit) {
- int ret = try_split_folio(folio, &split_folios);
+ int ret = try_split_folio(folio, split_folios);
if (!ret) {
stats->nr_thp_split += is_thp;
@@ -1723,18 +1716,11 @@ retry:
break;
}
}
- } else if (!no_split_folio_counting) {
+ } else {
nr_failed++;
}
stats->nr_failed_pages += nr_pages + nr_retry_pages;
- /*
- * There might be some split folios of fail-to-migrate large
- * folios left in split_folios list. Move them to ret_folios
- * list so that they could be put back to the right list by
- * the caller otherwise the folio refcnt will be leaked.
- */
- list_splice_init(&split_folios, ret_folios);
/* nr_failed isn't updated for not used */
nr_large_failed += large_retry;
stats->nr_thp_failed += thp_retry;
@@ -1743,19 +1729,11 @@ retry:
goto out;
else
goto move;
- case -EDEADLOCK:
- /*
- * The folio cannot be locked for potential deadlock.
- * Go move (and unlock) all locked folios. Then we can
- * try again.
- */
- rc_saved = rc;
- goto move;
case -EAGAIN:
if (is_large) {
large_retry++;
thp_retry += is_thp;
- } else if (!no_split_folio_counting) {
+ } else {
retry++;
}
nr_retry_pages += nr_pages;
@@ -1765,11 +1743,6 @@ retry:
stats->nr_thp_succeeded += is_thp;
break;
case MIGRATEPAGE_UNMAP:
- /*
- * We have locked some folios, don't force lock
- * to avoid deadlock.
- */
- avoid_force_lock = true;
list_move_tail(&folio->lru, &unmap_folios);
list_add_tail(&dst->lru, &dst_folios);
break;
@@ -1783,7 +1756,7 @@ retry:
if (is_large) {
nr_large_failed++;
stats->nr_thp_failed += is_thp;
- } else if (!no_split_folio_counting) {
+ } else {
nr_failed++;
}
@@ -1801,9 +1774,7 @@ move:
try_to_unmap_flush();
retry = 1;
- for (pass = 0;
- pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry);
- pass++) {
+ for (pass = 0; pass < nr_pass && (retry || large_retry); pass++) {
retry = 0;
large_retry = 0;
thp_retry = 0;
@@ -1832,7 +1803,7 @@ move:
if (is_large) {
large_retry++;
thp_retry += is_thp;
- } else if (!no_split_folio_counting) {
+ } else {
retry++;
}
nr_retry_pages += nr_pages;
@@ -1845,7 +1816,7 @@ move:
if (is_large) {
nr_large_failed++;
stats->nr_thp_failed += is_thp;
- } else if (!no_split_folio_counting) {
+ } else {
nr_failed++;
}
@@ -1882,30 +1853,52 @@ out:
dst2 = list_next_entry(dst, lru);
}
- /*
- * Try to migrate split folios of fail-to-migrate large folios, no
- * nr_failed counting in this round, since all split folios of a
- * large folio is counted as 1 failure in the first round.
- */
- if (rc >= 0 && !list_empty(&split_folios)) {
- /*
- * Move non-migrated folios (after NR_MAX_MIGRATE_PAGES_RETRY
- * retries) to ret_folios to avoid migrating them again.
- */
- list_splice_init(from, ret_folios);
- list_splice_init(&split_folios, from);
- no_split_folio_counting = true;
- goto retry;
- }
+ return rc;
+}
+static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page,
+ free_page_t put_new_page, unsigned long private,
+ enum migrate_mode mode, int reason, struct list_head *ret_folios,
+ struct list_head *split_folios, struct migrate_pages_stats *stats)
+{
+ int rc, nr_failed = 0;
+ LIST_HEAD(folios);
+ struct migrate_pages_stats astats;
+
+ memset(&astats, 0, sizeof(astats));
+ /* Try to migrate in batch with MIGRATE_ASYNC mode firstly */
+ rc = migrate_pages_batch(from, get_new_page, put_new_page, private, MIGRATE_ASYNC,
+ reason, &folios, split_folios, &astats,
+ NR_MAX_MIGRATE_ASYNC_RETRY);
+ stats->nr_succeeded += astats.nr_succeeded;
+ stats->nr_thp_succeeded += astats.nr_thp_succeeded;
+ stats->nr_thp_split += astats.nr_thp_split;
+ if (rc < 0) {
+ stats->nr_failed_pages += astats.nr_failed_pages;
+ stats->nr_thp_failed += astats.nr_thp_failed;
+ list_splice_tail(&folios, ret_folios);
+ return rc;
+ }
+ stats->nr_thp_failed += astats.nr_thp_split;
+ nr_failed += astats.nr_thp_split;
/*
- * We have unlocked all locked folios, so we can force lock now, let's
- * try again.
+ * Fall back to migrate all failed folios one by one synchronously. All
+ * failed folios except split THPs will be retried, so their failure
+ * isn't counted
*/
- if (rc == -EDEADLOCK)
- goto retry;
+ list_splice_tail_init(&folios, from);
+ while (!list_empty(from)) {
+ list_move(from->next, &folios);
+ rc = migrate_pages_batch(&folios, get_new_page, put_new_page,
+ private, mode, reason, ret_folios,
+ split_folios, stats, NR_MAX_MIGRATE_SYNC_RETRY);
+ list_splice_tail_init(&folios, ret_folios);
+ if (rc < 0)
+ return rc;
+ nr_failed += rc;
+ }
- return rc;
+ return nr_failed;
}
/*
@@ -1943,6 +1936,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
struct folio *folio, *folio2;
LIST_HEAD(folios);
LIST_HEAD(ret_folios);
+ LIST_HEAD(split_folios);
struct migrate_pages_stats stats;
trace_mm_migrate_pages_start(mode, reason);
@@ -1953,6 +1947,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
mode, reason, &stats, &ret_folios);
if (rc_gather < 0)
goto out;
+
again:
nr_pages = 0;
list_for_each_entry_safe(folio, folio2, from, lru) {
@@ -1963,20 +1958,36 @@ again:
}
nr_pages += folio_nr_pages(folio);
- if (nr_pages > NR_MAX_BATCHED_MIGRATION)
+ if (nr_pages >= NR_MAX_BATCHED_MIGRATION)
break;
}
- if (nr_pages > NR_MAX_BATCHED_MIGRATION)
- list_cut_before(&folios, from, &folio->lru);
+ if (nr_pages >= NR_MAX_BATCHED_MIGRATION)
+ list_cut_before(&folios, from, &folio2->lru);
else
list_splice_init(from, &folios);
- rc = migrate_pages_batch(&folios, get_new_page, put_new_page, private,
- mode, reason, &ret_folios, &stats);
+ if (mode == MIGRATE_ASYNC)
+ rc = migrate_pages_batch(&folios, get_new_page, put_new_page, private,
+ mode, reason, &ret_folios, &split_folios, &stats,
+ NR_MAX_MIGRATE_PAGES_RETRY);
+ else
+ rc = migrate_pages_sync(&folios, get_new_page, put_new_page, private,
+ mode, reason, &ret_folios, &split_folios, &stats);
list_splice_tail_init(&folios, &ret_folios);
if (rc < 0) {
rc_gather = rc;
+ list_splice_tail(&split_folios, &ret_folios);
goto out;
}
+ if (!list_empty(&split_folios)) {
+ /*
+ * Failure isn't counted since all split folios of a large folio
+ * is counted as 1 failure already. And, we only try to migrate
+ * with minimal effort, force MIGRATE_ASYNC mode and retry once.
+ */
+ migrate_pages_batch(&split_folios, get_new_page, put_new_page, private,
+ MIGRATE_ASYNC, reason, &ret_folios, NULL, &stats, 1);
+ list_splice_tail_init(&split_folios, &ret_folios);
+ }
rc_gather += rc;
if (!list_empty(from))
goto again;
diff --git a/mm/mincore.c b/mm/mincore.c
index cd69b9db0081..d359650b0f75 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -33,7 +33,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
* Hugepages under user process are always in RAM and never
* swapped out, but theoretically it needs to be checked.
*/
- present = pte && !huge_pte_none(huge_ptep_get(pte));
+ present = pte && !huge_pte_none_mostly(huge_ptep_get(pte));
for (; addr != end; vec++, addr += PAGE_SIZE)
*vec = present;
walk->private = vec;
diff --git a/mm/mmap.c b/mm/mmap.c
index 20f21f0949dd..ff68a67a2a7c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -973,7 +973,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
vma_end = addr;
adjust = mid;
adj_next = -(vma->vm_end - addr);
- err = dup_anon_vma(res, adjust);
+ err = dup_anon_vma(adjust, prev);
} else {
vma = next; /* case 3 */
vma_start = addr;
@@ -2277,7 +2277,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
int count = 0;
int error = -ENOMEM;
MA_STATE(mas_detach, &mt_detach, 0, 0);
- mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN);
+ mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
mt_set_external_lock(&mt_detach, &mm->mmap_lock);
/*
@@ -2621,12 +2621,7 @@ cannot_expand:
if (map_deny_write_exec(vma, vma->vm_flags)) {
error = -EACCES;
- if (file)
- goto close_and_free_vma;
- else if (vma->vm_file)
- goto unmap_and_free_vma;
- else
- goto free_vma;
+ goto close_and_free_vma;
}
/* Allow architectures to sanity-check the vm_flags */
@@ -3042,6 +3037,7 @@ void exit_mmap(struct mm_struct *mm)
*/
set_bit(MMF_OOM_SKIP, &mm->flags);
mmap_write_lock(mm);
+ mt_clear_in_rcu(&mm->mm_mt);
free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS,
USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 231929f119d9..13e84d8c0797 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -805,7 +805,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
if (map_deny_write_exec(vma, newflags)) {
error = -EACCES;
- goto out;
+ break;
}
/* Allow architectures to sanity-check the new flags */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ac1fc986af44..7136c36c5d01 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1398,6 +1398,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
unsigned int order, bool check_free, fpi_t fpi_flags)
{
int bad = 0;
+ bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
bool init = want_init_on_free();
VM_BUG_ON_PAGE(PageTail(page), page);
@@ -1470,7 +1471,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
* With hardware tag-based KASAN, memory tags must be set before the
* page becomes unavailable via debug_pagealloc or arch_free_page.
*/
- if (!should_skip_kasan_poison(page, fpi_flags)) {
+ if (!skip_kasan_poison) {
kasan_poison_pages(page, order, init);
/* Memory is already initialized if KASAN did it internally. */
diff --git a/mm/rmap.c b/mm/rmap.c
index 15ae24585fc4..8632e02661ac 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1602,7 +1602,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
- if (PageHWPoison(subpage) && !(flags & TTU_IGNORE_HWPOISON)) {
+ if (PageHWPoison(subpage) && (flags & TTU_HWPOISON)) {
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
if (folio_test_hugetlb(folio)) {
hugetlb_count_sub(folio_nr_pages(folio), mm);
diff --git a/mm/slab.c b/mm/slab.c
index dabc2a671fc6..edbe722fb906 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -839,7 +839,7 @@ static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp)
return 0;
}
-#if (defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)) || defined(CONFIG_SMP)
+#if defined(CONFIG_NUMA) || defined(CONFIG_SMP)
/*
* Allocates and initializes node for a node on each slab cache, used for
* either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 62ba2bf577d7..2c718f45745f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -679,6 +679,7 @@ static void __del_from_avail_list(struct swap_info_struct *p)
{
int nid;
+ assert_spin_locked(&p->lock);
for_each_node(nid)
plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
}
@@ -2434,8 +2435,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
goto out_dput;
}
- del_from_avail_list(p);
spin_lock(&p->lock);
+ del_from_avail_list(p);
if (p->prio < 0) {
struct swap_info_struct *si = p;
int nid;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ef910bf349e1..a50072066221 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2883,6 +2883,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
unsigned int order, unsigned int nr_pages, struct page **pages)
{
unsigned int nr_allocated = 0;
+ gfp_t alloc_gfp = gfp;
+ bool nofail = false;
struct page *page;
int i;
@@ -2893,6 +2895,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
* more permissive.
*/
if (!order) {
+ /* bulk allocator doesn't support nofail req. officially */
gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL;
while (nr_allocated < nr_pages) {
@@ -2931,20 +2934,35 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
if (nr != nr_pages_request)
break;
}
+ } else if (gfp & __GFP_NOFAIL) {
+ /*
+ * Higher order nofail allocations are really expensive and
+ * potentially dangerous (pre-mature OOM, disruptive reclaim
+ * and compaction etc.
+ */
+ alloc_gfp &= ~__GFP_NOFAIL;
+ nofail = true;
}
/* High-order pages or fallback path if "bulk" fails. */
-
while (nr_allocated < nr_pages) {
if (fatal_signal_pending(current))
break;
if (nid == NUMA_NO_NODE)
- page = alloc_pages(gfp, order);
+ page = alloc_pages(alloc_gfp, order);
else
- page = alloc_pages_node(nid, gfp, order);
- if (unlikely(!page))
- break;
+ page = alloc_pages_node(nid, alloc_gfp, order);
+ if (unlikely(!page)) {
+ if (!nofail)
+ break;
+
+ /* fall back to the zero order allocations */
+ alloc_gfp |= __GFP_NOFAIL;
+ order = 0;
+ continue;
+ }
+
/*
* Higher order allocations must be able to be treated as
* indepdenent small pages by callers (as they can with
@@ -3024,9 +3042,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
* allocation request, free them via vfree() if any.
*/
if (area->nr_pages != nr_small_pages) {
- warn_alloc(gfp_mask, NULL,
- "vmalloc error: size %lu, page order %u, failed to allocate pages",
- area->nr_pages * PAGE_SIZE, page_order);
+ /* vm_area_alloc_pages() can also fail due to a fatal signal */
+ if (!fatal_signal_pending(current))
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc error: size %lu, page order %u, failed to allocate pages",
+ area->nr_pages * PAGE_SIZE, page_order);
goto fail;
}