diff options
-rw-r--r-- | fs/proc/task_mmu.c | 1 | ||||
-rw-r--r-- | include/linux/mm.h | 7 | ||||
-rw-r--r-- | include/linux/userfaultfd_k.h | 3 | ||||
-rw-r--r-- | include/trace/events/mmflags.h | 7 | ||||
-rw-r--r-- | include/uapi/linux/mman.h | 1 | ||||
-rw-r--r-- | mm/ksm.c | 2 | ||||
-rw-r--r-- | mm/madvise.c | 5 | ||||
-rw-r--r-- | mm/memory.c | 13 | ||||
-rw-r--r-- | mm/mempolicy.c | 3 | ||||
-rw-r--r-- | mm/mlock.c | 2 | ||||
-rw-r--r-- | mm/mmap.c | 30 | ||||
-rw-r--r-- | mm/rmap.c | 22 | ||||
-rw-r--r-- | mm/vmscan.c | 9 | ||||
-rw-r--r-- | tools/include/uapi/linux/mman.h | 1 | ||||
-rw-r--r-- | tools/testing/selftests/mm/.gitignore | 1 | ||||
-rw-r--r-- | tools/testing/selftests/mm/Makefile | 1 | ||||
-rw-r--r-- | tools/testing/selftests/mm/droppable.c | 53 |
17 files changed, 146 insertions, 15 deletions
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 71e5039d940d..46f0b0fe9ee3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -708,6 +708,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_SHADOW_STACK)] = "ss", #endif #ifdef CONFIG_64BIT + [ilog2(VM_DROPPABLE)] = "dp", [ilog2(VM_SEALED)] = "sl", #endif }; diff --git a/include/linux/mm.h b/include/linux/mm.h index eb7c96d24ac0..e078c2890bf8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -407,6 +407,13 @@ extern unsigned int kobjsize(const void *objp); #endif #ifdef CONFIG_64BIT +#define VM_DROPPABLE_BIT 40 +#define VM_DROPPABLE BIT(VM_DROPPABLE_BIT) +#else +#define VM_DROPPABLE VM_NONE +#endif + +#ifdef CONFIG_64BIT /* VM is sealed, in vm_flags */ #define VM_SEALED _BITUL(63) #endif diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 05d59f74fc88..a12bcf042551 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -218,6 +218,9 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, { vm_flags &= __VM_UFFD_FLAGS; + if (vm_flags & VM_DROPPABLE) + return false; + if ((vm_flags & VM_UFFD_MINOR) && (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))) return false; diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index e46d6e82765e..b63d211bd141 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -165,6 +165,12 @@ IF_HAVE_PG_ARCH_X(arch_3) # define IF_HAVE_UFFD_MINOR(flag, name) #endif +#ifdef CONFIG_64BIT +# define IF_HAVE_VM_DROPPABLE(flag, name) {flag, name}, +#else +# define IF_HAVE_VM_DROPPABLE(flag, name) +#endif + #define __def_vmaflag_names \ {VM_READ, "read" }, \ {VM_WRITE, "write" }, \ @@ -197,6 +203,7 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ {VM_MIXEDMAP, "mixedmap" }, \ {VM_HUGEPAGE, "hugepage" }, \ {VM_NOHUGEPAGE, "nohugepage" }, \ +IF_HAVE_VM_DROPPABLE(VM_DROPPABLE, "droppable" ) \ {VM_MERGEABLE, "mergeable" } \ #define show_vma_flags(flags) \ diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h index a246e11988d5..e89d00528f2f 100644 --- a/include/uapi/linux/mman.h +++ b/include/uapi/linux/mman.h @@ -17,6 +17,7 @@ #define MAP_SHARED 0x01 /* Share changes */ #define MAP_PRIVATE 0x02 /* Changes are private */ #define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ +#define MAP_DROPPABLE 0x08 /* Zero memory under memory pressure. */ /* * Huge page size encoding when MAP_HUGETLB is specified, and a huge page @@ -717,7 +717,7 @@ static bool vma_ksm_compatible(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | VM_HUGETLB | - VM_MIXEDMAP)) + VM_MIXEDMAP| VM_DROPPABLE)) return false; /* just ignore the advice */ if (vma_is_dax(vma)) diff --git a/mm/madvise.c b/mm/madvise.c index a77893462b92..cba5bc652fc4 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1068,13 +1068,16 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, new_flags |= VM_WIPEONFORK; break; case MADV_KEEPONFORK: + if (vma->vm_flags & VM_DROPPABLE) + return -EINVAL; new_flags &= ~VM_WIPEONFORK; break; case MADV_DONTDUMP: new_flags |= VM_DONTDUMP; break; case MADV_DODUMP: - if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) + if ((!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) || + (vma->vm_flags & VM_DROPPABLE)) return -EINVAL; new_flags &= ~VM_DONTDUMP; break; diff --git a/mm/memory.c b/mm/memory.c index d10e616d7389..98d9a4485d24 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5660,6 +5660,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, /* If the fault handler drops the mmap_lock, vma may be freed */ struct mm_struct *mm = vma->vm_mm; vm_fault_t ret; + bool is_droppable; __set_current_state(TASK_RUNNING); @@ -5674,6 +5675,8 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, goto out; } + is_droppable = !!(vma->vm_flags & VM_DROPPABLE); + /* * Enable the memcg OOM handling for faults triggered in user * space. Kernel faults are handled more gracefully. @@ -5688,8 +5691,18 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, else ret = __handle_mm_fault(vma, address, flags); + /* + * Warning: It is no longer safe to dereference vma-> after this point, + * because mmap_lock might have been dropped by __handle_mm_fault(), so + * vma might be destroyed from underneath us. + */ + lru_gen_exit_fault(); + /* If the mapping is droppable, then errors due to OOM aren't fatal. */ + if (is_droppable) + ret &= ~VM_FAULT_OOM; + if (flags & FAULT_FLAG_USER) { mem_cgroup_exit_user_fault(); /* diff --git a/mm/mempolicy.c b/mm/mempolicy.c index aec756ae5637..32291ab25960 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2300,6 +2300,9 @@ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct pgoff_t ilx; struct page *page; + if (vma->vm_flags & VM_DROPPABLE) + gfp |= __GFP_NOWARN; + pol = get_vma_policy(vma, addr, order, &ilx); page = alloc_pages_mpol_noprof(gfp | __GFP_COMP, order, pol, ilx, numa_node_id()); diff --git a/mm/mlock.c b/mm/mlock.c index 30b51cdea89d..b87b3d8cc9cc 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -485,7 +485,7 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, if (newflags == oldflags || (oldflags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) || - vma_is_dax(vma) || vma_is_secretmem(vma)) + vma_is_dax(vma) || vma_is_secretmem(vma) || (oldflags & VM_DROPPABLE)) /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ goto out; diff --git a/mm/mmap.c b/mm/mmap.c index 83b4682ec85c..8aeedeb784c2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1369,6 +1369,36 @@ unsigned long do_mmap(struct file *file, unsigned long addr, pgoff = 0; vm_flags |= VM_SHARED | VM_MAYSHARE; break; + case MAP_DROPPABLE: + if (VM_DROPPABLE == VM_NONE) + return -ENOTSUPP; + /* + * A locked or stack area makes no sense to be droppable. + * + * Also, since droppable pages can just go away at any time + * it makes no sense to copy them on fork or dump them. + * + * And don't attempt to combine with hugetlb for now. + */ + if (flags & (MAP_LOCKED | MAP_HUGETLB)) + return -EINVAL; + if (vm_flags & (VM_GROWSDOWN | VM_GROWSUP)) + return -EINVAL; + + vm_flags |= VM_DROPPABLE; + + /* + * If the pages can be dropped, then it doesn't make + * sense to reserve them. + */ + vm_flags |= VM_NORESERVE; + + /* + * Likewise, they're volatile enough that they + * shouldn't survive forks or coredumps. + */ + vm_flags |= VM_WIPEONFORK | VM_DONTDUMP; + fallthrough; case MAP_PRIVATE: /* * Set pgoff according to addr for anon_vma. diff --git a/mm/rmap.c b/mm/rmap.c index e8fc5ecb59b2..1f9b5a9cb121 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1397,7 +1397,12 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); VM_BUG_ON_VMA(address < vma->vm_start || address + (nr << PAGE_SHIFT) > vma->vm_end, vma); - __folio_set_swapbacked(folio); + /* + * VM_DROPPABLE mappings don't swap; instead they're just dropped when + * under memory pressure. + */ + if (!(vma->vm_flags & VM_DROPPABLE)) + __folio_set_swapbacked(folio); __folio_set_anon(folio, vma, address, true); if (likely(!folio_test_large(folio))) { @@ -1841,7 +1846,13 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * plus the rmap(s) (dropped by discard:). */ if (ref_count == 1 + map_count && - !folio_test_dirty(folio)) { + (!folio_test_dirty(folio) || + /* + * Unlike MADV_FREE mappings, VM_DROPPABLE + * ones can be dropped even if they've + * been dirtied. + */ + (vma->vm_flags & VM_DROPPABLE))) { dec_mm_counter(mm, MM_ANONPAGES); goto discard; } @@ -1851,7 +1862,12 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * discarded. Remap the page to page table. */ set_pte_at(mm, address, pvmw.pte, pteval); - folio_set_swapbacked(folio); + /* + * Unlike MADV_FREE mappings, VM_DROPPABLE ones + * never get swap backed on failure to drop. + */ + if (!(vma->vm_flags & VM_DROPPABLE)) + folio_set_swapbacked(folio); ret = false; page_vma_mapped_walk_done(&pvmw); break; diff --git a/mm/vmscan.c b/mm/vmscan.c index 2e34de9cd0d4..5ef0ee4610d6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4265,15 +4265,6 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c return true; } - /* dirty lazyfree */ - if (type == LRU_GEN_FILE && folio_test_anon(folio) && folio_test_dirty(folio)) { - success = lru_gen_del_folio(lruvec, folio, true); - VM_WARN_ON_ONCE_FOLIO(!success, folio); - folio_set_swapbacked(folio); - lruvec_add_folio_tail(lruvec, folio); - return true; - } - /* promoted */ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { list_move(&folio->lru, &lrugen->folios[gen][type][zone]); diff --git a/tools/include/uapi/linux/mman.h b/tools/include/uapi/linux/mman.h index a246e11988d5..e89d00528f2f 100644 --- a/tools/include/uapi/linux/mman.h +++ b/tools/include/uapi/linux/mman.h @@ -17,6 +17,7 @@ #define MAP_SHARED 0x01 /* Share changes */ #define MAP_PRIVATE 0x02 /* Changes are private */ #define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ +#define MAP_DROPPABLE 0x08 /* Zero memory under memory pressure. */ /* * Huge page size encoding when MAP_HUGETLB is specified, and a huge page diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index 0b9ab987601c..a8beeb43c2b5 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -49,3 +49,4 @@ hugetlb_fault_after_madv hugetlb_madv_vs_map mseal_test seal_elf +droppable diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 3b49bc3d0a3b..e3e5740e13e1 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -73,6 +73,7 @@ TEST_GEN_FILES += ksm_functional_tests TEST_GEN_FILES += mdwe_test TEST_GEN_FILES += hugetlb_fault_after_madv TEST_GEN_FILES += hugetlb_madv_vs_map +TEST_GEN_FILES += droppable ifneq ($(ARCH),arm64) TEST_GEN_FILES += soft-dirty diff --git a/tools/testing/selftests/mm/droppable.c b/tools/testing/selftests/mm/droppable.c new file mode 100644 index 000000000000..f3d9ecf96890 --- /dev/null +++ b/tools/testing/selftests/mm/droppable.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ + +#include <assert.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <signal.h> +#include <sys/mman.h> +#include <linux/mman.h> + +#include "../kselftest.h" + +int main(int argc, char *argv[]) +{ + size_t alloc_size = 134217728; + size_t page_size = getpagesize(); + void *alloc; + pid_t child; + + ksft_print_header(); + ksft_set_plan(1); + + alloc = mmap(0, alloc_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_DROPPABLE, -1, 0); + assert(alloc != MAP_FAILED); + memset(alloc, 'A', alloc_size); + for (size_t i = 0; i < alloc_size; i += page_size) + assert(*(uint8_t *)(alloc + i)); + + child = fork(); + assert(child >= 0); + if (!child) { + for (;;) + *(char *)malloc(page_size) = 'B'; + } + + for (bool done = false; !done;) { + for (size_t i = 0; i < alloc_size; i += page_size) { + if (!*(uint8_t *)(alloc + i)) { + done = true; + break; + } + } + } + kill(child, SIGTERM); + + ksft_test_result_pass("MAP_DROPPABLE: PASS\n"); + exit(KSFT_PASS); +} |