summaryrefslogtreecommitdiffstats
path: root/mm/huge_memory.c (follow)
Commit message (Collapse)AuthorAgeFilesLines
* mm/numa: no task_numa_fault() call if PMD is changedZi Yan2024-08-161-16/+13
| | | | | | | | | | | | | | | | | | | | | | | | | | | | When handling a numa page fault, task_numa_fault() should be called by a process that restores the page table of the faulted folio to avoid duplicated stats counting. Commit c5b5a3dd2c1f ("mm: thp: refactor NUMA fault handling") restructured do_huge_pmd_numa_page() and did not avoid the task_numa_fault() call in the second page table check after a numa migration failure. Fix it by making all !pmd_same() return immediately. This issue can cause task_numa_fault() to be called more than necessary and lead to unexpected numa balancing results (It is hard to tell whether the issue will cause positive or negative performance impact due to duplicated numa fault counting). Link: https://lkml.kernel.org/r/20240809145906.1513458-3-ziy@nvidia.com Fixes: c5b5a3dd2c1f ("mm: thp: refactor NUMA fault handling") Reported-by: "Huang, Ying" <ying.huang@intel.com> Closes: https://lore.kernel.org/linux-mm/87zfqfw0yw.fsf@yhuang6-desk2.ccr.corp.intel.com/ Signed-off-by: Zi Yan <ziy@nvidia.com> Acked-by: David Hildenbrand <david@redhat.com> Cc: Baolin Wang <baolin.wang@linux.alibaba.com> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Yang Shi <shy828301@gmail.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm/huge_memory: avoid PMD-size page cache if neededGavin Shan2024-07-261-2/+10
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | xarray can't support arbitrary page cache size. the largest and supported page cache size is defined as MAX_PAGECACHE_ORDER by commit 099d90642a71 ("mm/filemap: make MAX_PAGECACHE_ORDER acceptable to xarray"). However, it's possible to have 512MB page cache in the huge memory's collapsing path on ARM64 system whose base page size is 64KB. 512MB page cache is breaking the limitation and a warning is raised when the xarray entry is split as shown in the following example. [root@dhcp-10-26-1-207 ~]# cat /proc/1/smaps | grep KernelPageSize KernelPageSize: 64 kB [root@dhcp-10-26-1-207 ~]# cat /tmp/test.c : int main(int argc, char **argv) { const char *filename = TEST_XFS_FILENAME; int fd = 0; void *buf = (void *)-1, *p; int pgsize = getpagesize(); int ret = 0; if (pgsize != 0x10000) { fprintf(stdout, "System with 64KB base page size is required!\n"); return -EPERM; } system("echo 0 > /sys/devices/virtual/bdi/253:0/read_ahead_kb"); system("echo 1 > /proc/sys/vm/drop_caches"); /* Open the xfs file */ fd = open(filename, O_RDONLY); assert(fd > 0); /* Create VMA */ buf = mmap(NULL, TEST_MEM_SIZE, PROT_READ, MAP_SHARED, fd, 0); assert(buf != (void *)-1); fprintf(stdout, "mapped buffer at 0x%p\n", buf); /* Populate VMA */ ret = madvise(buf, TEST_MEM_SIZE, MADV_NOHUGEPAGE); assert(ret == 0); ret = madvise(buf, TEST_MEM_SIZE, MADV_POPULATE_READ); assert(ret == 0); /* Collapse VMA */ ret = madvise(buf, TEST_MEM_SIZE, MADV_HUGEPAGE); assert(ret == 0); ret = madvise(buf, TEST_MEM_SIZE, MADV_COLLAPSE); if (ret) { fprintf(stdout, "Error %d to madvise(MADV_COLLAPSE)\n", errno); goto out; } /* Split xarray entry. 
Write permission is needed */ munmap(buf, TEST_MEM_SIZE); buf = (void *)-1; close(fd); fd = open(filename, O_RDWR); assert(fd > 0); fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, TEST_MEM_SIZE - pgsize, pgsize); out: if (buf != (void *)-1) munmap(buf, TEST_MEM_SIZE); if (fd > 0) close(fd); return ret; } [root@dhcp-10-26-1-207 ~]# gcc /tmp/test.c -o /tmp/test [root@dhcp-10-26-1-207 ~]# /tmp/test ------------[ cut here ]------------ WARNING: CPU: 25 PID: 7560 at lib/xarray.c:1025 xas_split_alloc+0xf8/0x128 Modules linked in: nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib \ nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct \ nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 \ ip_set rfkill nf_tables nfnetlink vfat fat virtio_balloon drm fuse \ xfs libcrc32c crct10dif_ce ghash_ce sha2_ce sha256_arm64 virtio_net \ sha1_ce net_failover virtio_blk virtio_console failover dimlib virtio_mmio CPU: 25 PID: 7560 Comm: test Kdump: loaded Not tainted 6.10.0-rc7-gavin+ #9 Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20240524-1.el9 05/24/2024 pstate: 83400005 (Nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--) pc : xas_split_alloc+0xf8/0x128 lr : split_huge_page_to_list_to_order+0x1c4/0x780 sp : ffff8000ac32f660 x29: ffff8000ac32f660 x28: ffff0000e0969eb0 x27: ffff8000ac32f6c0 x26: 0000000000000c40 x25: ffff0000e0969eb0 x24: 000000000000000d x23: ffff8000ac32f6c0 x22: ffffffdfc0700000 x21: 0000000000000000 x20: 0000000000000000 x19: ffffffdfc0700000 x18: 0000000000000000 x17: 0000000000000000 x16: ffffd5f3708ffc70 x15: 0000000000000000 x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000 x11: ffffffffffffffc0 x10: 0000000000000040 x9 : ffffd5f3708e692c x8 : 0000000000000003 x7 : 0000000000000000 x6 : ffff0000e0969eb8 x5 : ffffd5f37289e378 x4 : 0000000000000000 x3 : 0000000000000c40 x2 : 000000000000000d x1 : 000000000000000c x0 : 0000000000000000 Call trace: xas_split_alloc+0xf8/0x128 split_huge_page_to_list_to_order+0x1c4/0x780 
truncate_inode_partial_folio+0xdc/0x160 truncate_inode_pages_range+0x1b4/0x4a8 truncate_pagecache_range+0x84/0xa0 xfs_flush_unmap_range+0x70/0x90 [xfs] xfs_file_fallocate+0xfc/0x4d8 [xfs] vfs_fallocate+0x124/0x2f0 ksys_fallocate+0x4c/0xa0 __arm64_sys_fallocate+0x24/0x38 invoke_syscall.constprop.0+0x7c/0xd8 do_el0_svc+0xb4/0xd0 el0_svc+0x44/0x1d8 el0t_64_sync_handler+0x134/0x150 el0t_64_sync+0x17c/0x180 Fix it by correcting the supported page cache orders, different sets for DAX and other files. With it corrected, 512MB page cache becomes disallowed on all non-DAX files on ARM64 system where the base page size is 64KB. After this patch is applied, the test program fails with error -EINVAL returned from __thp_vma_allowable_orders() and the madvise() system call to collapse the page caches. Link: https://lkml.kernel.org/r/20240715000423.316491-1-gshan@redhat.com Fixes: 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache") Signed-off-by: Gavin Shan <gshan@redhat.com> Acked-by: David Hildenbrand <david@redhat.com> Reviewed-by: Ryan Roberts <ryan.roberts@arm.com> Acked-by: Zi Yan <ziy@nvidia.com> Cc: Baolin Wang <baolin.wang@linux.alibaba.com> Cc: Barry Song <baohua@kernel.org> Cc: Don Dutile <ddutile@redhat.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Peter Xu <peterx@redhat.com> Cc: Ryan Roberts <ryan.roberts@arm.com> Cc: William Kucharski <william.kucharski@oracle.com> Cc: <stable@vger.kernel.org> [5.17+] Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm: huge_memory: use !CONFIG_64BIT to relax huge page alignment on 32 bit ↵Yang Shi2024-07-261-1/+1
| | | | | | | | | | | | | | | | | | | | | | | | | | | | machines Yves-Alexis Perez reported commit 4ef9ad19e176 ("mm: huge_memory: don't force huge page alignment on 32 bit") didn't work for x86_32 [1]. It is because x86_32 uses CONFIG_X86_32 instead of CONFIG_32BIT. !CONFIG_64BIT should cover all 32 bit machines. [1] https://lore.kernel.org/linux-mm/CAHbLzkr1LwH3pcTgM+aGQ31ip2bKqiqEQ8=FQB+t2c3dhNKNHA@mail.gmail.com/ Link: https://lkml.kernel.org/r/20240712155855.1130330-1-yang@os.amperecomputing.com Fixes: 4ef9ad19e176 ("mm: huge_memory: don't force huge page alignment on 32 bit") Signed-off-by: Yang Shi <yang@os.amperecomputing.com> Reported-by: Yves-Alexis Perez <corsac@debian.org> Tested-by: Yves-Alexis Perez <corsac@debian.org> Acked-by: David Hildenbrand <david@redhat.com> Cc: Ben Hutchings <ben@decadent.org.uk> Cc: Christoph Lameter <cl@linux.com> Cc: Jiri Slaby <jirislaby@kernel.org> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Rik van Riel <riel@surriel.com> Cc: Salvatore Bonaccorso <carnil@debian.org> Cc: Suren Baghdasaryan <surenb@google.com> Cc: <stable@vger.kernel.org> [6.8+] Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm: shmem: rename mTHP shmem countersRyan Roberts2024-07-131-6/+6
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The legacy PMD-sized THP counters at /proc/vmstat include thp_file_alloc, thp_file_fallback and thp_file_fallback_charge, which rather confusingly refer to shmem THP and do not include any other types of file pages. This is inconsistent since in most other places in the kernel, THP counters are explicitly separated for anon, shmem and file flavours. However, we are stuck with it since it constitutes a user ABI. Recently, commit 66f44583f9b6 ("mm: shmem: add mTHP counters for anonymous shmem") added equivalent mTHP stats for shmem, keeping the same "file_" prefix in the names. But in future, we may want to add extra stats to cover actual file pages, at which point, it would all become very confusing. So let's take the opportunity to rename these new counters "shmem_" before the change makes it upstream and the ABI becomes immutable. While we are at it, let's improve the documentation for the legacy counters to make it clear that they count shmem pages only. Link: https://lkml.kernel.org/r/20240710095503.3193901-1-ryan.roberts@arm.com Signed-off-by: Ryan Roberts <ryan.roberts@arm.com> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com> Reviewed-by: Lance Yang <ioworker0@gmail.com> Reviewed-by: Zi Yan <ziy@nvidia.com> Reviewed-by: Barry Song <baohua@kernel.org> Acked-by: David Hildenbrand <david@redhat.com> Cc: Daniel Gomez <da.gomez@samsung.com> Cc: Hugh Dickins <hughd@google.com> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm: thp: support "THPeligible" semantics for mTHP with anonymous shmemBang Li2024-07-131-4/+9
| | | | | | | | | | | | | | | | | | | | | After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for anonymous shmem"), we can configure different policies through the multi-size THP sysfs interface for anonymous shmem. But for anonymous shmem, "THPeligible" currently indicates only whether the mapping is eligible for allocating THP pages and whether the THP is PMD-mappable. We need to support semantics for mTHP with anonymous shmem similar to those for mTHP with anonymous memory. Link: https://lkml.kernel.org/r/20240705032309.24933-1-libang.li@antgroup.com Signed-off-by: Bang Li <libang.li@antgroup.com> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com> Cc: David Hildenbrand <david@redhat.com> Cc: Hugh Dickins <hughd@google.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Lance Yang <ioworker0@gmail.com> Cc: Ryan Roberts <ryan.roberts@arm.com> Cc: Zi Yan <ziy@nvidia.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm: fix khugepaged activation policyRyan Roberts2024-07-131-0/+7
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Since the introduction of mTHP, the documentation has stated that khugepaged would be enabled when any mTHP size is enabled, and disabled when all mTHP sizes are disabled. There are 2 problems with this; 1. this is not what was implemented by the code and 2. this is not the desirable behavior. Desirable behavior is for khugepaged to be enabled when any PMD-sized THP is enabled, anon or file. (Note that file THP is still controlled by the top-level control so we must always consider that, as well as the PMD-size mTHP control for anon). khugepaged only supports collapsing to PMD-sized THP so there is no value in enabling it when PMD-sized THP is disabled. So let's change the code and documentation to reflect this policy. Further, per-size enabled control modification events were not previously forwarded to khugepaged to give it an opportunity to start or stop. Consequently the following was resulting in khugepaged erroneously not being activated: echo never > /sys/kernel/mm/transparent_hugepage/enabled echo always > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled [ryan.roberts@arm.com: v3] Link: https://lkml.kernel.org/r/20240705102849.2479686-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20240705102849.2479686-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20240704091051.2411934-1-ryan.roberts@arm.com Signed-off-by: Ryan Roberts <ryan.roberts@arm.com> Fixes: 3485b88390b0 ("mm: thp: introduce multi-size THP sysfs interface") Closes: https://lore.kernel.org/linux-mm/7a0bbe69-1e3d-4263-b206-da007791a5c4@redhat.com/ Acked-by: David Hildenbrand <david@redhat.com> Cc: Baolin Wang <baolin.wang@linux.alibaba.com> Cc: Barry Song <baohua@kernel.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Lance Yang <ioworker0@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm: add per-order mTHP split countersLance Yang2024-07-131-2/+10
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Patch series "mm: introduce per-order mTHP split counters", v3. At present, the split counters in THP statistics no longer include PTE-mapped mTHP. Therefore, we want to introduce per-order mTHP split counters to monitor the frequency of mTHP splits. This will assist developers in better analyzing and optimizing system performance. /sys/kernel/mm/transparent_hugepage/hugepages-<size>/stats split split_failed split_deferred This patch (of 2): Currently, the split counters in THP statistics no longer include PTE-mapped mTHP. Therefore, we propose introducing per-order mTHP split counters to monitor the frequency of mTHP splits. This will help developers better analyze and optimize system performance. /sys/kernel/mm/transparent_hugepage/hugepages-<size>/stats split split_failed split_deferred [ioworker0@gmail.com: make things more readable, per Barry and Baolin] Link: https://lkml.kernel.org/r/20240704012905.42971-2-ioworker0@gmail.com [ioworker0@gmail.com: use == for `order' test, per David] Link: https://lkml.kernel.org/r/20240705113119.82210-1-ioworker0@gmail.com Link: https://lkml.kernel.org/r/20240704012905.42971-1-ioworker0@gmail.com Link: https://lkml.kernel.org/r/20240704012905.42971-2-ioworker0@gmail.com Link: https://lkml.kernel.org/r/20240628130750.73097-1-ioworker0@gmail.com Link: https://lkml.kernel.org/r/20240628130750.73097-2-ioworker0@gmail.com Signed-off-by: Mingzhe Yang <mingzhe.yang@ly.com> Signed-off-by: Lance Yang <ioworker0@gmail.com> Reviewed-by: Ryan Roberts <ryan.roberts@arm.com> Acked-by: Barry Song <baohua@kernel.org> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com> Acked-by: David Hildenbrand <david@redhat.com> Cc: Bang Li <libang.li@antgroup.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Zi Yan <ziy@nvidia.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* Merge branch 'mm-hotfixes-stable' into mm-stable to pick up "mm: fixAndrew Morton2024-07-061-1/+1
|\ | | | | | | | | crashes from deferred split racing folio migration", needed by "mm: migrate: split folio_migrate_mapping()".
| * mm: gup: stop abusing try_grab_folioYang Shi2024-07-061-1/+1
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | A kernel warning was reported when pinning folio in CMA memory when launching SEV virtual machine. The splat looks like: [ 464.325306] WARNING: CPU: 13 PID: 6734 at mm/gup.c:1313 __get_user_pages+0x423/0x520 [ 464.325464] CPU: 13 PID: 6734 Comm: qemu-kvm Kdump: loaded Not tainted 6.6.33+ #6 [ 464.325477] RIP: 0010:__get_user_pages+0x423/0x520 [ 464.325515] Call Trace: [ 464.325520] <TASK> [ 464.325523] ? __get_user_pages+0x423/0x520 [ 464.325528] ? __warn+0x81/0x130 [ 464.325536] ? __get_user_pages+0x423/0x520 [ 464.325541] ? report_bug+0x171/0x1a0 [ 464.325549] ? handle_bug+0x3c/0x70 [ 464.325554] ? exc_invalid_op+0x17/0x70 [ 464.325558] ? asm_exc_invalid_op+0x1a/0x20 [ 464.325567] ? __get_user_pages+0x423/0x520 [ 464.325575] __gup_longterm_locked+0x212/0x7a0 [ 464.325583] internal_get_user_pages_fast+0xfb/0x190 [ 464.325590] pin_user_pages_fast+0x47/0x60 [ 464.325598] sev_pin_memory+0xca/0x170 [kvm_amd] [ 464.325616] sev_mem_enc_register_region+0x81/0x130 [kvm_amd] Per the analysis done by yangge, when starting the SEV virtual machine, it will call pin_user_pages_fast(..., FOLL_LONGTERM, ...) to pin the memory. But the page is in CMA area, so fast GUP will fail then fallback to the slow path due to the longterm pinnalbe check in try_grab_folio(). The slow path will try to pin the pages then migrate them out of CMA area. But the slow path also uses try_grab_folio() to pin the page, it will also fail due to the same check then the above warning is triggered. In addition, the try_grab_folio() is supposed to be used in fast path and it elevates folio refcount by using add ref unless zero. 
We are guaranteed to have at least one stable reference in slow path, so the simple atomic add could be used. The performance difference should be trivial, but the misuse may be confusing and misleading. Redefined try_grab_folio() to try_grab_folio_fast(), and try_grab_page() to try_grab_folio(), and use them in the proper paths. This solves both the abuse and the kernel warning. The proper naming makes their use case more clear and should prevent abuse in the future. peterx said: : The user will see the pin fails, for gup-slow it further triggers the WARN : right below that failure (as in the original report): : : folio = try_grab_folio(page, page_increm - 1, : foll_flags); : if (WARN_ON_ONCE(!folio)) { <------------------------ here : /* : * Release the 1st page ref if the : * folio is problematic, fail hard. : */ : gup_put_folio(page_folio(page), 1, : foll_flags); : ret = -EFAULT; : goto out; : } [1] https://lore.kernel.org/linux-mm/1719478388-31917-1-git-send-email-yangge1116@126.com/ [shy828301@gmail.com: fix implicit declaration of function try_grab_folio_fast] Link: https://lkml.kernel.org/r/CAHbLzkowMSso-4Nufc9hcMehQsK9PNz3OSu-+eniU-2Mm-xjhA@mail.gmail.com Link: https://lkml.kernel.org/r/20240628191458.2605553-1-yang@os.amperecomputing.com Fixes: 57edfcfd3419 ("mm/gup: accelerate thp gup even for "pages != NULL"") Signed-off-by: Yang Shi <yang@os.amperecomputing.com> Reported-by: yangge <yangge1116@126.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: David Hildenbrand <david@redhat.com> Cc: Peter Xu <peterx@redhat.com> Cc: <stable@vger.kernel.org> [6.6+] Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* | mm: refactor folio_undo_large_rmappable()Kefeng Wang2024-07-051-12/+1
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Folios of order <= 1 are not in deferred list, the check of order is added into folio_undo_large_rmappable() from commit 8897277acfef ("mm: support order-1 folios in the page cache"), but there is a repeated check for small folio (order 0) during each call of the folio_undo_large_rmappable(), so only keep folio_order() check inside the function. In addition, move all the checks into header file to save a function call for non-large-rmappable or empty deferred_list folio. Link: https://lkml.kernel.org/r/20240521130315.46072-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com> Reviewed-by: David Hildenbrand <david@redhat.com> Reviewed-by: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Lance Yang <ioworker0@gmail.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Michal Hocko <mhocko@kernel.org> Cc: Muchun Song <muchun.song@linux.dev> Cc: Roman Gushchin <roman.gushchin@linux.dev> Cc: Shakeel Butt <shakeel.butt@linux.dev> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* | mm/migrate: move NUMA hinting fault folio isolation + checks under PTLDavid Hildenbrand2024-07-041-3/+5
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Currently we always take a folio reference even if migration will not even be tried or isolation failed, requiring us to grab+drop an additional reference. Further, we end up calling folio_likely_mapped_shared() while the folio might have already been unmapped, because after we dropped the PTL, that can easily happen. We want to stop touching mapcounts and friends from such context, and only call folio_likely_mapped_shared() while the folio is still mapped: mapcount information is pretty much stale and unreliable otherwise. So let's move checks into numamigrate_isolate_folio(), rename that function to migrate_misplaced_folio_prepare(), and call that function from callsites where we call migrate_misplaced_folio(), but still with the PTL held. We can now stop taking temporary folio references, and really only take a reference if folio isolation succeeded. Doing the folio_likely_mapped_shared() + folio isolation under PT lock is now similar to how we handle MADV_PAGEOUT. While at it, combine the folio_is_file_lru() checks. [david@redhat.com: fix list_del() corruption] Link: https://lkml.kernel.org/r/8f85c31a-e603-4578-bf49-136dae0d4b69@redhat.com Link: https://lkml.kernel.org/r/20240626191129.658CFC32782@smtp.kernel.org Link: https://lkml.kernel.org/r/20240620212935.656243-3-david@redhat.com Signed-off-by: David Hildenbrand <david@redhat.com> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com> Reviewed-by: Zi Yan <ziy@nvidia.com> Tested-by: Donet Tom <donettom@linux.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* | mm/migrate: make migrate_misplaced_folio() return 0 on successDavid Hildenbrand2024-07-041-3/+2
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Patch series "mm/migrate: move NUMA hinting fault folio isolation + checks under PTL". Let's just return 0 on success, which is less confusing. ... especially because we got it wrong in the migrate.h stub where we have "return -EAGAIN; /* can't migrate now */" instead of "return 0;". Likely this wrong return value doesn't currently matter, but it certainly adds confusion. We'll add migrate_misplaced_folio_prepare() next, where we want to use the same "return 0 on success" approach, so let's just clean this up. Link: https://lkml.kernel.org/r/20240620212935.656243-1-david@redhat.com Link: https://lkml.kernel.org/r/20240620212935.656243-2-david@redhat.com Signed-off-by: David Hildenbrand <david@redhat.com> Reviewed-by: Zi Yan <ziy@nvidia.com> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com> Cc: Donet Tom <donettom@linux.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* | mm: memory: convert clear_huge_page() to folio_zero_user()Kefeng Wang2024-07-041-2/+2
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Patch series "mm: improve clear and copy user folio", v2. Some folio conversions. An improvement is to move address alignment into the caller as it is only needed if we don't know which address will be accessed when clearing/copying user folios. This patch (of 4): Replace clear_huge_page() with folio_zero_user(), and take a folio instead of a page. Directly get number of pages by folio_nr_pages() to remove pages_per_huge_page argument, furthermore, move the address alignment from folio_zero_user() to the callers since the alignment is only needed when we don't know which address will be accessed. Link: https://lkml.kernel.org/r/20240618091242.2140164-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20240618091242.2140164-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: David Hildenbrand <david@redhat.com> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Muchun Song <muchun.song@linux.dev> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* | mm: extend rmap flags arguments for folio_add_new_anon_rmapBarry Song2024-07-041-1/+1
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Patch series "mm: clarify folio_add_new_anon_rmap() and __folio_add_anon_rmap()", v2. This patchset is preparatory work for mTHP swapin. folio_add_new_anon_rmap() assumes that new anon rmaps are always exclusive. However, this assumption doesn’t hold true for cases like do_swap_page(), where a new anon might be added to the swapcache and is not necessarily exclusive. The patchset extends the rmap flags to allow folio_add_new_anon_rmap() to handle both exclusive and non-exclusive new anon folios. The do_swap_page() function is updated to use this extended API with rmap flags. Consequently, all new anon folios now consistently use folio_add_new_anon_rmap(). The special case for !folio_test_anon() in __folio_add_anon_rmap() can be safely removed. In conclusion, new anon folios always use folio_add_new_anon_rmap(), regardless of exclusivity. Old anon folios continue to use __folio_add_anon_rmap() via folio_add_anon_rmap_pmd() and folio_add_anon_rmap_ptes(). This patch (of 3): In the case of a swap-in, a new anonymous folio is not necessarily exclusive. This patch updates the rmap flags to allow a new anonymous folio to be treated as either exclusive or non-exclusive. To maintain the existing behavior, we always use EXCLUSIVE as the default setting. 
[akpm@linux-foundation.org: cleanup and constifications per David and akpm] [v-songbaohua@oppo.com: fix missing doc for flags of folio_add_new_anon_rmap()] Link: https://lkml.kernel.org/r/20240619210641.62542-1-21cnbao@gmail.com [v-songbaohua@oppo.com: enhance doc for extend rmap flags arguments for folio_add_new_anon_rmap] Link: https://lkml.kernel.org/r/20240622030256.43775-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240617231137.80726-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240617231137.80726-2-21cnbao@gmail.com Signed-off-by: Barry Song <v-songbaohua@oppo.com> Suggested-by: David Hildenbrand <david@redhat.com> Tested-by: Shuai Yuan <yuanshuai@oppo.com> Acked-by: David Hildenbrand <david@redhat.com> Cc: Baolin Wang <baolin.wang@linux.alibaba.com> Cc: Chris Li <chrisl@kernel.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Michal Hocko <mhocko@suse.com> Cc: Ryan Roberts <ryan.roberts@arm.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Yu Zhao <yuzhao@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* | mm/huge_memory.c: fix used-uninitializedAndrew Morton2024-07-041-2/+1
| | | | | | | | | | | | | | | | | | | | Fix used-uninitialized of `page'. Fixes: dce7d10be4bb ("mm/madvise: optimize lazyfreeing with mTHP in madvise_free") Reported-by: kernel test robot <lkp@intel.com> Closes: https://lore.kernel.org/oe-kbuild-all/202406260514.SLhNM9kQ-lkp@intel.com Cc: Lance Yang <ioworker0@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* | mm/vmscan: avoid split lazyfree THP during shrink_folio_list()Lance Yang2024-07-041-0/+66
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | When the user no longer requires the pages, they would use madvise(MADV_FREE) to mark the pages as lazy free. Subsequently, they typically would not re-write to that memory again. During memory reclaim, if we detect that the large folio and its PMD are both still marked as clean and there are no unexpected references (such as GUP), so we can just discard the memory lazily, improving the efficiency of memory reclamation in this case. On an Intel i5 CPU, reclaiming 1GiB of lazyfree THPs using mem_cgroup_force_empty() results in the following runtimes in seconds (shorter is better): -------------------------------------------- | Old | New | Change | -------------------------------------------- | 0.683426 | 0.049197 | -92.80% | -------------------------------------------- [ioworker0@gmail.com: minor changes per David] Link: https://lkml.kernel.org/r/20240622100057.3352-1-ioworker0@gmail.com Link: https://lkml.kernel.org/r/20240614015138.31461-4-ioworker0@gmail.com Signed-off-by: Lance Yang <ioworker0@gmail.com> Suggested-by: Zi Yan <ziy@nvidia.com> Suggested-by: David Hildenbrand <david@redhat.com> Cc: Bang Li <libang.li@antgroup.com> Cc: Baolin Wang <baolin.wang@linux.alibaba.com> Cc: Barry Song <baohua@kernel.org> Cc: Fangrui Song <maskray@google.com> Cc: Jeff Xie <xiehuan09@gmail.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Michal Hocko <mhocko@suse.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Muchun Song <songmuchun@bytedance.com> Cc: Peter Xu <peterx@redhat.com> Cc: Ryan Roberts <ryan.roberts@arm.com> Cc: SeongJae Park <sj@kernel.org> Cc: Yang Shi <shy828301@gmail.com> Cc: Yin Fengwei <fengwei.yin@intel.com> Cc: Zach O'Keefe <zokeefe@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* | mm/rmap: integrate PMD-mapped folio splitting into pagewalk loopLance Yang2024-07-041-20/+22
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | In preparation for supporting try_to_unmap_one() to unmap PMD-mapped folios, start the pagewalk first, then call split_huge_pmd_address() to split the folio. Link: https://lkml.kernel.org/r/20240614015138.31461-3-ioworker0@gmail.com Signed-off-by: Lance Yang <ioworker0@gmail.com> Suggested-by: David Hildenbrand <david@redhat.com> Acked-by: David Hildenbrand <david@redhat.com> Suggested-by: Baolin Wang <baolin.wang@linux.alibaba.com> Acked-by: Zi Yan <ziy@nvidia.com> Cc: Bang Li <libang.li@antgroup.com> Cc: Barry Song <baohua@kernel.org> Cc: Fangrui Song <maskray@google.com> Cc: Jeff Xie <xiehuan09@gmail.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Michal Hocko <mhocko@suse.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Muchun Song <songmuchun@bytedance.com> Cc: Peter Xu <peterx@redhat.com> Cc: Ryan Roberts <ryan.roberts@arm.com> Cc: SeongJae Park <sj@kernel.org> Cc: Yang Shi <shy828301@gmail.com> Cc: Yin Fengwei <fengwei.yin@intel.com> Cc: Zach O'Keefe <zokeefe@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* | mm: introduce pmd|pte_needs_soft_dirty_wp helpers for softdirty write-protectBarry Song2024-07-041-1/+1
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Patch series "mm: introduce pmd|pte_needs_soft_dirty_wp helpers and utilize them", v2. This patchset introduces the pte_need_soft_dirty_wp and pmd_need_soft_dirty_wp helpers to determine if write protection is required for softdirty tracking. These helpers enhance code readability and improve the overall appearance. They are then utilized in gup, mprotect, swap, and other related functions. This patch (of 2): This patch introduces the pte_needs_soft_dirty_wp and pmd_needs_soft_dirty_wp helpers to determine if write protection is required for softdirty tracking. This can enhance code readability and improve its overall appearance. These new helpers are then utilized in gup, huge_memory, and mprotect. Link: https://lkml.kernel.org/r/20240607211358.4660-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240607211358.4660-2-21cnbao@gmail.com Signed-off-by: Barry Song <v-songbaohua@oppo.com> Suggested-by: David Hildenbrand <david@redhat.com> Acked-by: David Hildenbrand <david@redhat.com> Cc: Chris Li <chrisl@kernel.org> Cc: Kairui Song <kasong@tencent.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Minchan Kim <minchan@kernel.org> Cc: Ryan Roberts <ryan.roberts@arm.com> Cc: Suren Baghdasaryan <surenb@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* | mm: shmem: add mTHP counters for anonymous shmemBaolin Wang2024-07-041-0/+6
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Add mTHP counters for anonymous shmem. [baolin.wang@linux.alibaba.com: update Documentation/admin-guide/mm/transhuge.rst] Link: https://lkml.kernel.org/r/d86e2e7f-4141-432b-b2ba-c6691f36ef0b@linux.alibaba.com Link: https://lkml.kernel.org/r/4fd9e467d49ae4a747e428bcd821c7d13125ae67.1718090413.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com> Reviewed-by: Lance Yang <ioworker0@gmail.com> Cc: Barry Song <v-songbaohua@oppo.com> Cc: Daniel Gomez <da.gomez@samsung.com> Cc: David Hildenbrand <david@redhat.com> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Hugh Dickins <hughd@google.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Pankaj Raghav <p.raghav@samsung.com> Cc: Ryan Roberts <ryan.roberts@arm.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Zi Yan <ziy@nvidia.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* | mm: shmem: add multi-size THP sysfs interface for anonymous shmemBaolin Wang2024-07-041-8/+4
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | To support the use of mTHP with anonymous shmem, add a new sysfs interface 'shmem_enabled' in the '/sys/kernel/mm/transparent_hugepage/hugepages-kB/' directory for each mTHP to control whether shmem is enabled for that mTHP, with a value similar to the top level 'shmem_enabled', which can be set to: "always", "inherit (to inherit the top level setting)", "within_size", "advise", "never". An 'inherit' option is added to ensure compatibility with these global settings, and the options 'force' and 'deny' are dropped, which are rather testing artifacts from the old ages. By default, PMD-sized hugepages have enabled="inherit" and all other hugepage sizes have enabled="never" for '/sys/kernel/mm/transparent_hugepage/hugepages-xxkB/shmem_enabled'. In addition, if top level value is 'force', then only PMD-sized hugepages have enabled="inherit", otherwise configuration will be failed and vice versa. That means now we will avoid using non-PMD sized THP to override the global huge allocation. 
[baolin.wang@linux.alibaba.com: fix transhuge.rst indentation] Link: https://lkml.kernel.org/r/b189d815-998b-4dfd-ba89-218ff51313f8@linux.alibaba.com [akpm@linux-foundation.org: reflow transhuge.rst addition to 80 cols] [baolin.wang@linux.alibaba.com: move huge_shmem_orders_lock under CONFIG_SYSFS] Link: https://lkml.kernel.org/r/eb34da66-7f12-44f3-a39e-2bcc90c33354@linux.alibaba.com [akpm@linux-foundation.org: huge_memory.c needs mm_types.h] Link: https://lkml.kernel.org/r/ffddfa8b3cb4266ff963099ab78cfd7184c57ac7.1718090413.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com> Cc: Barry Song <v-songbaohua@oppo.com> Cc: Daniel Gomez <da.gomez@samsung.com> Cc: David Hildenbrand <david@redhat.com> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Hugh Dickins <hughd@google.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Lance Yang <ioworker0@gmail.com> Cc: Pankaj Raghav <p.raghav@samsung.com> Cc: Ryan Roberts <ryan.roberts@arm.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Zi Yan <ziy@nvidia.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* | mm/swap: reduce swap cache search spaceKairui Song2024-07-041-1/+1
|/ | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Currently we use one swap_address_space for every 64M chunk to reduce lock contention, this is like having a set of smaller swap files inside one swap device. But when doing swap cache look up or insert, we are still using the offset of the whole large swap device. This is OK for correctness, as the offset (key) is unique. But Xarray is specially optimized for small indexes, it creates the radix tree levels lazily to be just enough to fit the largest key stored in one Xarray. So we are wasting tree nodes unnecessarily. For 64M chunk it should only take at most 3 levels to contain everything. But if we are using the offset from the whole swap device, the offset (key) value will be way beyond 64M, and so will the tree level. Optimize this by using a new helper swap_cache_index to get a swap entry's unique offset in its own 64M swap_address_space. I see a ~1% performance gain in benchmark and actual workload with high memory pressure. Test with `time memhog 128G` inside a 8G memcg using 128G swap (ramdisk with SWP_SYNCHRONOUS_IO dropped, tested 3 times, results are stable. The test result is similar but the improvement is smaller if SWP_SYNCHRONOUS_IO is enabled, as swap out path can never skip swap cache): Before: 6.07user 250.74system 4:17.26elapsed 99%CPU (0avgtext+0avgdata 8373376maxresident)k 0inputs+0outputs (55major+33555018minor)pagefaults 0swaps After (1.8% faster): 6.08user 246.09system 4:12.58elapsed 99%CPU (0avgtext+0avgdata 8373248maxresident)k 0inputs+0outputs (54major+33555027minor)pagefaults 0swaps Similar result with MySQL and sysbench using swap: Before: 94055.61 qps After (0.8% faster): 94834.91 qps Radix tree slab usage is also very slightly lower. 
Link: https://lkml.kernel.org/r/20240521175854.96038-12-ryncsn@gmail.com Signed-off-by: Kairui Song <kasong@tencent.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Anna Schumaker <anna@kernel.org> Cc: Barry Song <v-songbaohua@oppo.com> Cc: Chao Yu <chao@kernel.org> Cc: Chris Li <chrisl@kernel.org> Cc: David Hildenbrand <david@redhat.com> Cc: David Howells <dhowells@redhat.com> Cc: Hugh Dickins <hughd@google.com> Cc: Ilya Dryomov <idryomov@gmail.com> Cc: Jaegeuk Kim <jaegeuk@kernel.org> Cc: Jeff Layton <jlayton@kernel.org> Cc: Marc Dionne <marc.dionne@auristor.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Minchan Kim <minchan@kernel.org> Cc: NeilBrown <neilb@suse.de> Cc: Ryan Roberts <ryan.roberts@arm.com> Cc: Ryusuke Konishi <konishi.ryusuke@gmail.com> Cc: Trond Myklebust <trond.myklebust@hammerspace.com> Cc: Xiubo Li <xiubli@redhat.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm: huge_memory: fix misused mapping_large_folio_support() for anon foliosRan Xiaokai2024-06-151-11/+17
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | When I did a large folio split test, a WARNING "[ 5059.122759][ T166] Cannot split file folio to non-0 order" was triggered. But the test cases are only for anonymous folios, while mapping_large_folio_support() is only reasonable for page cache folios. In split_huge_page_to_list_to_order(), the folio passed to mapping_large_folio_support() may be an anonymous folio. The folio_test_anon() check is missing, so the split of the anonymous THP fails. The same applies to shmem_mapping(). We'd better add a check for both. But the shmem_mapping() in __split_huge_page() is not involved, as for anonymous folios, the end parameter is set to -1, so (head[i].index >= end) is always false and shmem_mapping() is not called. Also add a VM_WARN_ON_ONCE() in mapping_large_folio_support() for anon mappings, so we can detect the wrong use more easily. THP folios may exist in the pagecache even if the file system doesn't support large folios; this is because when CONFIG_TRANSPARENT_HUGEPAGE is enabled, khugepaged will try to collapse read-only file-backed pages to THP. But the mapping does not actually support multi-order large folios properly. Using /sys/kernel/debug/split_huge_pages to verify this: with this patch, large anon THP is successfully split and the warning no longer appears. Link: https://lkml.kernel.org/r/202406071740485174hcFl7jRxncsHDtI-Pz-o@zte.com.cn Fixes: c010d47f107f ("mm: thp: split huge page to any lower order pages") Reviewed-by: Barry Song <baohua@kernel.org> Reviewed-by: Zi Yan <ziy@nvidia.com> Acked-by: David Hildenbrand <david@redhat.com> Signed-off-by: Ran Xiaokai <ran.xiaokai@zte.com.cn> Cc: Michal Hocko <mhocko@kernel.org> Cc: xu xin <xu.xin16@zte.com.cn> Cc: Yang Yang <yang.yang29@zte.com.cn> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm: drop the 'anon_' prefix for swap-out mTHP countersBaolin Wang2024-06-061-4/+4
| | | | | | | | | | | | | | | | | | | | | | | The mTHP swap related counters: 'anon_swpout' and 'anon_swpout_fallback' are confusing with an 'anon_' prefix, since the shmem can swap out non-anonymous pages. So drop the 'anon_' prefix to keep consistent with the old swap counter names. This is needed in 6.10-rcX to avoid having an inconsistent ABI out in the field. Link: https://lkml.kernel.org/r/7a8989c13299920d7589007a30065c3e2c19f0e0.1716431702.git.baolin.wang@linux.alibaba.com Fixes: d0f048ac39f6 ("mm: add per-order mTHP anon_swpout and anon_swpout_fallback counters") Fixes: 42248b9d34ea ("mm: add docs for per-order mTHP counters and transhuge_page ABI") Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com> Suggested-by: "Huang, Ying" <ying.huang@intel.com> Acked-by: Barry Song <baohua@kernel.org> Cc: David Hildenbrand <david@redhat.com> Cc: Lance Yang <ioworker0@gmail.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Ryan Roberts <ryan.roberts@arm.com> Cc: Zi Yan <ziy@nvidia.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* thp: remove HPAGE_PMD_ORDER minimum assertionMatthew Wilcox (Oracle)2024-05-071-5/+0
| | | | | | | | | We now handle order-1 folios correctly, so we don't need this assertion any more. Link: https://lkml.kernel.org/r/20240429190114.3126789-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm: fix race between __split_huge_pmd_locked() and GUP-fastRyan Roberts2024-05-071-23/+26
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | __split_huge_pmd_locked() can be called for a present THP, devmap or (non-present) migration entry. It calls pmdp_invalidate() unconditionally on the pmdp and only determines if it is present or not based on the returned old pmd. This is a problem for the migration entry case because pmd_mkinvalid(), called by pmdp_invalidate() must only be called for a present pmd. On arm64 at least, pmd_mkinvalid() will mark the pmd such that any future call to pmd_present() will return true. And therefore any lockless pgtable walker could see the migration entry pmd in this state and start interpreting the fields as if it were present, leading to BadThings (TM). GUP-fast appears to be one such lockless pgtable walker. x86 does not suffer the above problem, but instead pmd_mkinvalid() will corrupt the offset field of the swap entry within the swap pte. See link below for discussion of that problem. Fix all of this by only calling pmdp_invalidate() for a present pmd. And for good measure let's add a warning to all implementations of pmdp_invalidate[_ad](). I've manually reviewed all other pmdp_invalidate[_ad]() call sites and believe all others to be conformant. This is a theoretical bug found during code review. I don't have any test case to trigger it in practice. 
Link: https://lkml.kernel.org/r/20240501143310.1381675-1-ryan.roberts@arm.com Link: https://lore.kernel.org/all/0dd7827a-6334-439a-8fd0-43c98e6af22b@arm.com/ Fixes: 84c3fc4e9c56 ("mm: thp: check pmd migration entry in common path") Signed-off-by: Ryan Roberts <ryan.roberts@arm.com> Reviewed-by: Zi Yan <ziy@nvidia.com> Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com> Acked-by: David Hildenbrand <david@redhat.com> Cc: Andreas Larsson <andreas@gaisler.com> Cc: Andy Lutomirski <luto@kernel.org> Cc: Aneesh Kumar K.V <aneesh.kumar@kernel.org> Cc: Borislav Petkov (AMD) <bp@alien8.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Christian Borntraeger <borntraeger@linux.ibm.com> Cc: Christophe Leroy <christophe.leroy@csgroup.eu> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: "David S. Miller" <davem@davemloft.net> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Naveen N. Rao <naveen.n.rao@linux.ibm.com> Cc: Nicholas Piggin <npiggin@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Sven Schnelle <svens@linux.ibm.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Will Deacon <will@kernel.org> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm: delay the check for a NULL anon_vmaMatthew Wilcox (Oracle)2024-05-061-2/+4
| | | | | | | | | | | | | | | | | | | | | | | Instead of checking the anon_vma early in the fault path where all page faults pay the cost, delay it until we know we're going to need the anon_vma to be filled in. This will have a slight negative effect on the first fault in an anonymous VMA, but it shortens every other page fault. It also makes the code slightly cleaner as the anon and file backed fault handling look more similar. The Intel kernel test bot reports a 3x improvement in vm-scalability throughput with the small-allocs-mt test. This is clearly an extreme situation that won't be replicated in any real-world workload, but it's a nice win. https://lore.kernel.org/all/202404261055.c5e24608-oliver.sang@intel.com/ Link: https://lkml.kernel.org/r/20240426144506.1290619-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: Suren Baghdasaryan <surenb@google.com> Cc: David Hildenbrand <david@redhat.com> Cc: Jann Horn <jannh@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm: simplify thp_vma_allowable_orderMatthew Wilcox2024-05-061-2/+5
| | | | | | | | | | | Combine the three boolean arguments into one flags argument for readability. Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: David Hildenbrand <david@redhat.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Ryan Roberts <ryan.roberts@arm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm/huge_memory: improve split_huge_page_to_list_to_order() return value ↵David Hildenbrand2024-05-061-3/+11
| | | | | | | | | | | | | | | | | | | | | | documentation The documentation is wrong and relying on it almost resulted in BUGs in new callers: ever since fd4a7ac32918 ("mm: migrate: try again if THP split is failed due to page refcnt") we return -EAGAIN on unexpected folio references, not -EBUSY. Let's fix that and also document which other return values we can currently see and why they could happen. [david@redhat.com: v2] Link: https://lkml.kernel.org/r/20240422194217.442933-1-david@redhat.com Link: https://lkml.kernel.org/r/20240418151834.216557-1-david@redhat.com Signed-off-by: David Hildenbrand <david@redhat.com> Reviewed-by: Zi Yan <ziy@nvidia.com> Reviewed-by: John Hubbard <jhubbard@nvidia.com> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com> Cc: Matthew Wilcox <willy@infradead.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* userfaultfd: remove WRITE_ONCE when setting folio->index during UFFDIO_MOVESuren Baghdasaryan2024-05-061-1/+1
| | | | | | | | | | | | | | When folio is moved with UFFDIO_MOVE it gets locked before the rmap and index are modified. Due to the folio lock being already held, WRITE_ONCE() is not needed when setting the folio index. Remove it. Link: https://lkml.kernel.org/r/20240415020821.1152951-1-surenb@google.com Reported-by: Matthew Wilcox <willy@infradead.org> Signed-off-by: Suren Baghdasaryan <surenb@google.com> Reviewed-by: David Hildenbrand <david@redhat.com> Reviewed-by: Peter Xu <peterx@redhat.com> Cc: Lokesh Gidra <lokeshgidra@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm: add per-order mTHP anon_swpout and anon_swpout_fallback countersBarry Song2024-05-061-0/+4
| | | | | | | | | | | | | | | | | | | | | | | | This helps to display the fragmentation situation of the swapfile, knowing the proportion of how much we haven't split large folios. So far, we only support non-split swapout for anon memory, with the possibility of expanding to shmem in the future. So, we add the "anon" prefix to the counter names. Link: https://lkml.kernel.org/r/20240412114858.407208-3-21cnbao@gmail.com Signed-off-by: Barry Song <v-songbaohua@oppo.com> Reviewed-by: Ryan Roberts <ryan.roberts@arm.com> Acked-by: David Hildenbrand <david@redhat.com> Cc: Chris Li <chrisl@kernel.org> Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com> Cc: Kairui Song <kasong@tencent.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Peter Xu <peterx@redhat.com> Cc: Ryan Roberts <ryan.roberts@arm.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Yu Zhao <yuzhao@google.com> Cc: Jonathan Corbet <corbet@lwn.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm: add per-order mTHP anon_fault_alloc and anon_fault_fallback countersBarry Song2024-05-061-0/+52
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Patch series "mm: add per-order mTHP alloc and swpout counters", v6. The patchset introduces a framework to facilitate mTHP counters, starting with the allocation and swap-out counters. Currently, only four new nodes are appended to the stats directory for each mTHP size. /sys/kernel/mm/transparent_hugepage/hugepages-<size>/stats anon_fault_alloc anon_fault_fallback anon_fault_fallback_charge anon_swpout anon_swpout_fallback These nodes are crucial for us to monitor the fragmentation levels of both the buddy system and the swap partitions. In the future, we may consider adding additional nodes for further insights. This patch (of 4): Profiling a system blindly with mTHP has become challenging due to the lack of visibility into its operations. Presenting the success rate of mTHP allocations appears to be pressing need. Recently, I've been experiencing significant difficulty debugging performance improvements and regressions without these figures. It's crucial for us to understand the true effectiveness of mTHP in real-world scenarios, especially in systems with fragmented memory. This patch establishes the framework for per-order mTHP counters. It begins by introducing the anon_fault_alloc and anon_fault_fallback counters. Additionally, to maintain consistency with thp_fault_fallback_charge in /proc/vmstat, this patch also tracks anon_fault_fallback_charge when mem_cgroup_charge fails for mTHP. Incorporating additional counters should now be straightforward as well. 
Link: https://lkml.kernel.org/r/20240412114858.407208-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240412114858.407208-2-21cnbao@gmail.com Signed-off-by: Barry Song <v-songbaohua@oppo.com> Acked-by: David Hildenbrand <david@redhat.com> Cc: Chris Li <chrisl@kernel.org> Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com> Cc: Kairui Song <kasong@tencent.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Peter Xu <peterx@redhat.com> Cc: Ryan Roberts <ryan.roberts@arm.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Yu Zhao <yuzhao@google.com> Cc: Jonathan Corbet <corbet@lwn.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm/huge_memory: use folio_mapcount() in zap_huge_pmd() sanity checkDavid Hildenbrand2024-05-061-1/+1
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | We want to limit the use of page_mapcount() to the places where it is absolutely necessary. Let's similarly check for folio_mapcount() underflows instead of page_mapcount() underflows like we do in zap_present_folio_ptes() now. Instead of the VM_BUG_ON(), we should actually be doing something like print_bad_pte(). For now, let's keep it simple and use WARN_ON_ONCE(), performing that check independently of DEBUG_VM. Link: https://lkml.kernel.org/r/20240409192301.907377-9-david@redhat.com Signed-off-by: David Hildenbrand <david@redhat.com> Cc: Chris Zankel <chris@zankel.net> Cc: Hugh Dickins <hughd@google.com> Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Max Filippov <jcmvbkbc@gmail.com> Cc: Miaohe Lin <linmiaohe@huawei.com> Cc: Muchun Song <muchun.song@linux.dev> Cc: Naoya Horiguchi <nao.horiguchi@gmail.com> Cc: Peter Xu <peterx@redhat.com> Cc: Richard Chang <richardycc@google.com> Cc: Rich Felker <dalias@libc.org> Cc: Ryan Roberts <ryan.roberts@arm.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yin Fengwei <fengwei.yin@intel.com> Cc: Yoshinori Sato <ysato@users.sourceforge.jp> Cc: Zi Yan <ziy@nvidia.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm: swap: remove CLUSTER_FLAG_HUGE from swap_cluster_info:flagsRyan Roberts2024-04-261-3/+0
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Patch series "Swap-out mTHP without splitting", v7. This series adds support for swapping out multi-size THP (mTHP) without needing to first split the large folio via split_huge_page_to_list_to_order(). It closely follows the approach already used to swap-out PMD-sized THP. There are a couple of reasons for swapping out mTHP without splitting: - Performance: It is expensive to split a large folio and under extreme memory pressure some workloads regressed performance when using 64K mTHP vs 4K small folios because of this extra cost in the swap-out path. This series not only eliminates the regression but makes it faster to swap out 64K mTHP vs 4K small folios. - Memory fragmentation avoidance: If we can avoid splitting a large folio memory is less likely to become fragmented, making it easier to re-allocate a large folio in future. - Performance: Enables a separate series [7] to swap-in whole mTHPs, which means we won't lose the TLB-efficiency benefits of mTHP once the memory has been through a swap cycle. I've done what I thought was the smallest change possible, and as a result, this approach is only employed when the swap is backed by a non-rotating block device (just as PMD-sized THP is supported today). Discussion against the RFC concluded that this is sufficient. Performance Testing =================== I've run some swap performance tests on Ampere Altra VM (arm64) with 8 CPUs. The VM is set up with a 35G block ram device as the swap device and the test is run from inside a memcg limited to 40G memory. I've then run `usemem` from vm-scalability with 70 processes, each allocating and writing 1G of memory. 
I've repeated everything 6 times and taken the mean performance improvement relative to 4K page baseline: | alloc size | baseline | + this series | | | mm-unstable (~v6.9-rc1) | | |:-----------|------------------------:|------------------------:| | 4K Page | 0.0% | 1.3% | | 64K THP | -13.6% | 46.3% | | 2M THP | 91.4% | 89.6% | So with this change, the 64K swap performance goes from a 14% regression to a 46% improvement. While 2M shows a small regression I'm confident that this is just noise. [1] https://lore.kernel.org/linux-mm/20231010142111.3997780-1-ryan.roberts@arm.com/ [2] https://lore.kernel.org/linux-mm/20231017161302.2518826-1-ryan.roberts@arm.com/ [3] https://lore.kernel.org/linux-mm/20231025144546.577640-1-ryan.roberts@arm.com/ [4] https://lore.kernel.org/linux-mm/20240311150058.1122862-1-ryan.roberts@arm.com/ [5] https://lore.kernel.org/linux-mm/20240327144537.4165578-1-ryan.roberts@arm.com/ [6] https://lore.kernel.org/linux-mm/20240403114032.1162100-1-ryan.roberts@arm.com/ [7] https://lore.kernel.org/linux-mm/20240304081348.197341-1-21cnbao@gmail.com/ [8] https://lore.kernel.org/linux-mm/CAGsJ_4yMOow27WDvN2q=E4HAtDd2PJ=OQ5Pj9DG+6FLWwNuXUw@mail.gmail.com/ [9] https://lore.kernel.org/linux-mm/579d5127-c763-4001-9625-4563a9316ac3@redhat.com/ This patch (of 7): As preparation for supporting small-sized THP in the swap-out path, without first needing to split to order-0, Remove the CLUSTER_FLAG_HUGE, which, when present, always implies PMD-sized THP, which is the same as the cluster size. The only use of the flag was to determine whether a swap entry refers to a single page or a PMD-sized THP in swap_page_trans_huge_swapped(). Instead of relying on the flag, we now pass in order, which originates from the folio's order. This allows the logic to work for folios of any order. The one snag is that one of the swap_page_trans_huge_swapped() call sites does not have the folio. 
But it was only being called there to shortcut a call to __try_to_reclaim_swap() in some cases. __try_to_reclaim_swap() gets the folio and (via some other functions) calls swap_page_trans_huge_swapped(). So I've removed the problematic call site and believe the new logic should be functionally equivalent. That said, removing the fast path means that we will take a reference and trylock a large folio much more often, which we would like to avoid. The next patch will solve this. Removing CLUSTER_FLAG_HUGE also means we can remove split_swap_cluster() which used to be called during folio splitting, since split_swap_cluster()'s only job was to remove the flag. Link: https://lkml.kernel.org/r/20240408183946.2991168-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20240408183946.2991168-2-ryan.roberts@arm.com Signed-off-by: Ryan Roberts <ryan.roberts@arm.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Acked-by: Chris Li <chrisl@kernel.org> Acked-by: David Hildenbrand <david@redhat.com> Cc: Barry Song <21cnbao@gmail.com> Cc: Gao Xiang <xiang@kernel.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Lance Yang <ioworker0@gmail.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Michal Hocko <mhocko@suse.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yu Zhao <yuzhao@google.com> Cc: Barry Song <v-songbaohua@oppo.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm: huge_memory: add the missing folio_test_pmd_mappable() for THP split ↵Baolin Wang2024-04-261-2/+5
| | | | | | | | | | | | | | | | | statistics Now the mTHP can also be split or added into the deferred list, so add folio_test_pmd_mappable() validation for PMD mapped THP, to avoid confusion with PMD mapped THP related statistics. [baolin.wang@linux.alibaba.com: check THP earlier in case folio is split, per Lance] Link: https://lkml.kernel.org/r/b99f8cb14bc85fdb6ab43721d1331cb5ebed2581.1713771041.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/a5341defeef27c9ac7b85c97f030f93e4368bbc1.1711694852.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com> Acked-by: David Hildenbrand <david@redhat.com> Reviewed-by: Lance Yang <ioworker0@gmail.com> Cc: Muchun Song <muchun.song@linux.dev> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* thp: add thp_get_unmapped_area_vmflags()Rick Edgecombe2024-04-261-7/+16
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | When memory is being placed, mmap() will take care to respect the guard gaps of certain types of memory (VM_SHADOWSTACK, VM_GROWSUP and VM_GROWSDOWN). In order to ensure guard gaps between mappings, mmap() needs to consider two things: 1. That the new mapping isn't placed in any existing mapping's guard gaps. 2. That the new mapping isn't placed such that any existing mappings are not in *its* guard gaps. The longstanding behavior of mmap() is to ensure 1, but not take any care around 2. So for example, if there is a PAGE_SIZE free area, and a mmap() with a PAGE_SIZE size, and a type that has a guard gap is being placed, mmap() may place the shadow stack in the PAGE_SIZE free area. Then the mapping that is supposed to have a guard gap will not have a gap to the adjacent VMA. Add a THP implementation of the vm_flags variant of get_unmapped_area(). Future changes will call this from mmap.c in the do_mmap() path to allow shadow stacks to be placed with consideration taken for the start guard gap. Shadow stack memory is always private and anonymous and so special guard gap logic is not needed in a lot of cases, but it can be mapped by THP, so needs to be handled. Link: https://lkml.kernel.org/r/20240326021656.202649-7-rick.p.edgecombe@intel.com Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu> Cc: Alexei Starovoitov <ast@kernel.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Aneesh Kumar K.V <aneesh.kumar@kernel.org> Cc: Borislav Petkov (AMD) <bp@alien8.de> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Deepak Gupta <debug@rivosinc.com> Cc: Guo Ren <guoren@kernel.org> Cc: Helge Deller <deller@gmx.de> Cc: H. Peter Anvin (Intel) <hpa@zytor.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: "James E.J. 
Bottomley" <James.Bottomley@HansenPartnership.com> Cc: Kees Cook <keescook@chromium.org> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Liam R. Howlett <Liam.Howlett@oracle.com> Cc: Mark Brown <broonie@kernel.org> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Naveen N. Rao <naveen.n.rao@linux.ibm.com> Cc: Nicholas Piggin <npiggin@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm: switch mm->get_unmapped_area() to a flagRick Edgecombe2024-04-261-5/+4
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The mm_struct contains a function pointer *get_unmapped_area(), which is set to either arch_get_unmapped_area() or arch_get_unmapped_area_topdown() during the initialization of the mm. Since the function pointer only ever points to two functions that are named the same across all arch's, a function pointer is not really required. In addition future changes will want to add versions of the functions that take additional arguments. So to save a pointers worth of bytes in mm_struct, and prevent adding additional function pointers to mm_struct in future changes, remove it and keep the information about which get_unmapped_area() to use in a flag. Add the new flag to MMF_INIT_MASK so it doesn't get clobbered on fork by mmf_init_flags(). Most MM flags get clobbered on fork. In the pre-existing behavior mm->get_unmapped_area() would get copied to the new mm in dup_mm(), so not clobbering the flag preserves the existing behavior around inheriting the topdown-ness. Introduce a helper, mm_get_unmapped_area(), to easily convert code that refers to the old function pointer to instead select and call either arch_get_unmapped_area() or arch_get_unmapped_area_topdown() based on the flag. Then drop the mm->get_unmapped_area() function pointer. Leave the get_unmapped_area() pointer in struct file_operations alone. The main purpose of this change is to reorganize in preparation for future changes, but it also converts the calls of mm->get_unmapped_area() from indirect branches into a direct ones. The stress-ng bigheap benchmark calls realloc a lot, which calls through get_unmapped_area() in the kernel. On x86, the change yielded a ~1% improvement there on a retpoline config. In testing a few x86 configs, removing the pointer unfortunately didn't result in any actual size reductions in the compiled layout of mm_struct. 
But depending on compiler or arch alignment requirements, the change could shrink the size of mm_struct. Link: https://lkml.kernel.org/r/20240326021656.202649-3-rick.p.edgecombe@intel.com Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Acked-by: Dave Hansen <dave.hansen@linux.intel.com> Acked-by: Liam R. Howlett <Liam.Howlett@oracle.com> Reviewed-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Alexei Starovoitov <ast@kernel.org> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Andy Lutomirski <luto@kernel.org> Cc: Aneesh Kumar K.V <aneesh.kumar@kernel.org> Cc: Borislav Petkov (AMD) <bp@alien8.de> Cc: Christophe Leroy <christophe.leroy@csgroup.eu> Cc: Deepak Gupta <debug@rivosinc.com> Cc: Guo Ren <guoren@kernel.org> Cc: Helge Deller <deller@gmx.de> Cc: H. Peter Anvin (Intel) <hpa@zytor.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com> Cc: Kees Cook <keescook@chromium.org> Cc: Mark Brown <broonie@kernel.org> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Naveen N. Rao <naveen.n.rao@linux.ibm.com> Cc: Nicholas Piggin <npiggin@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm/gup: handle huge pmd for follow_pmd_mask()Peter Xu2024-04-261-84/+2
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Replace pmd_trans_huge() with pmd_leaf() to also cover pmd_huge() as long as enabled. FOLL_TOUCH and FOLL_SPLIT_PMD only apply to THP, not yet huge. Since now follow_trans_huge_pmd() can process hugetlb pages, rename it to follow_huge_pmd() to match what it does. Move it into gup.c so that it does not depend on CONFIG_THP. While at it, move the ctx->page_mask setup into follow_huge_pmd(), only set it when the page is valid. It was not a bug to set it before even if GUP failed (page==NULL), because follow_page_mask() callers always ignore page_mask if so. But doing so makes the code cleaner. [peterx@redhat.com: allow follow_pmd_mask() to take hugetlb tail pages] Link: https://lkml.kernel.org/r/20240403013249.1418299-3-peterx@redhat.com Link: https://lkml.kernel.org/r/20240327152332.950956-12-peterx@redhat.com Signed-off-by: Peter Xu <peterx@redhat.com> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> Tested-by: Ryan Roberts <ryan.roberts@arm.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Andrew Jones <andrew.jones@linux.dev> Cc: Aneesh Kumar K.V (IBM) <aneesh.kumar@kernel.org> Cc: Axel Rasmussen <axelrasmussen@google.com> Cc: Christophe Leroy <christophe.leroy@csgroup.eu> Cc: Christoph Hellwig <hch@infradead.org> Cc: David Hildenbrand <david@redhat.com> Cc: James Houghton <jthoughton@google.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Kirill A. Shutemov <kirill@shutemov.name> Cc: Lorenzo Stoakes <lstoakes@gmail.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: "Mike Rapoport (IBM)" <rppt@kernel.org> Cc: Muchun Song <muchun.song@linux.dev> Cc: Rik van Riel <riel@surriel.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Yang Shi <shy828301@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm/gup: handle huge pud for follow_pud_mask()Peter Xu2024-04-261-45/+2
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Teach follow_pud_mask() to be able to handle normal PUD pages like hugetlb. Rename follow_devmap_pud() to follow_huge_pud() so that it can process either huge devmap or hugetlb. Move it out of TRANSPARENT_HUGEPAGE_PUD and huge_memory.c (which relies on CONFIG_THP). Switch to pud_leaf() to detect both cases in the slow gup. In the new follow_huge_pud(), take care of possible CoR for hugetlb if necessary. touch_pud() needs to be moved out of huge_memory.c to be accessible from gup.c even if !THP. While at it, optimize the non-present check by adding a pud_present() early check before taking the pgtable lock, failing the follow_page() early if PUD is not present: that is required by both devmap or hugetlb. Use pud_huge() to also cover the pud_devmap() case. One more trivial thing to mention is, introduce "pud_t pud" in the code paths along the way, so the code doesn't dereference *pudp multiple times. Not only because that looks less straightforward, but also because if the dereference really happened, it's not clear whether there can be race to see different *pudp values when it's being modified at the same time. Set ctx->page_mask properly for a PUD entry. As a side effect, this patch should also be able to optimize devmap GUP on PUD to be able to jump over the whole PUD range, but not yet verified. Hugetlb already can do so prior to this patch. 
Link: https://lkml.kernel.org/r/20240327152332.950956-11-peterx@redhat.com Signed-off-by: Peter Xu <peterx@redhat.com> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> Tested-by: Ryan Roberts <ryan.roberts@arm.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Andrew Jones <andrew.jones@linux.dev> Cc: Aneesh Kumar K.V (IBM) <aneesh.kumar@kernel.org> Cc: Axel Rasmussen <axelrasmussen@google.com> Cc: Christophe Leroy <christophe.leroy@csgroup.eu> Cc: Christoph Hellwig <hch@infradead.org> Cc: David Hildenbrand <david@redhat.com> Cc: James Houghton <jthoughton@google.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Kirill A. Shutemov <kirill@shutemov.name> Cc: Lorenzo Stoakes <lstoakes@gmail.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: "Mike Rapoport (IBM)" <rppt@kernel.org> Cc: Muchun Song <muchun.song@linux.dev> Cc: Rik van Riel <riel@surriel.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Yang Shi <shy828301@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm: rename mm_put_huge_zero_page to mm_put_huge_zero_folioMatthew Wilcox (Oracle)2024-04-261-1/+1
| | | | | | | | | Also remove mm_get_huge_zero_page() now it has no users. Link: https://lkml.kernel.org/r/20240326202833.523759-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: David Hildenbrand <david@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm: convert do_huge_pmd_anonymous_page to huge_zero_folioMatthew Wilcox (Oracle)2024-04-261-11/+12
| | | | | | | | | Use folios more widely. Link: https://lkml.kernel.org/r/20240326202833.523759-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: David Hildenbrand <david@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm: convert huge_zero_page to huge_zero_folioMatthew Wilcox (Oracle)2024-04-261-14/+14
| | | | | | | | | | With all callers of is_huge_zero_page() converted, we can now switch the huge_zero_page itself from being a compound page to a folio. Link: https://lkml.kernel.org/r/20240326202833.523759-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: David Hildenbrand <david@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm: add pmd_folio()Matthew Wilcox (Oracle)2024-04-261-3/+3
| | | | | | | | | | | Convert directly from a pmd to a folio without going through another representation first. For now this is just a slightly shorter way to write it, but it might end up being more efficient later. Link: https://lkml.kernel.org/r/20240326202833.523759-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: David Hildenbrand <david@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm: add is_huge_zero_folio()Matthew Wilcox (Oracle)2024-04-261-3/+3
| | | | | | | | | | | This is the folio equivalent of is_huge_zero_page(). It doesn't add any efficiency, but it does prevent the caller from passing a tail page and getting confused when the predicate returns false. Link: https://lkml.kernel.org/r/20240326202833.523759-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: David Hildenbrand <david@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* huge_memory.c: document huge page splitting rules more thoroughlyJohn Hubbard2024-04-261-15/+27
| | | | | | | | | | | | | | | 1. Add information about the behavior of huge page splitting, with respect to page/folio refcounts, and gup/pup pins. 2. Update and clarify the existing documentation, to compensate for the ravages of time and code change. Link: https://lkml.kernel.org/r/20240325044452.217463-1-jhubbard@nvidia.com Signed-off-by: John Hubbard <jhubbard@nvidia.com> Reviewed-by: Zi Yan <ziy@nvidia.com> Reviewed-by: David Hildenbrand <david@redhat.com> Cc: Matthew Wilcox <willy@infradead.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm: convert folio_estimated_sharers() to folio_likely_mapped_shared()David Hildenbrand2024-04-261-1/+1
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Callers of folio_estimated_sharers() only care about "mapped shared vs. mapped exclusively", not the exact estimate of sharers. Let's consolidate and unify the condition users are checking. While at it clarify the semantics and extend the discussion on the fuzziness. Use the "likely mapped shared" terminology to better express what the (adjusted) function actually checks. Whether a partially-mappable folio is more likely to not be partially mapped than partially mapped is debatable. In the future, we might be able to improve our estimate for partially-mappable folios, though. Note that we will now consistently detect "mapped shared" only if the first subpage is actually mapped multiple times. When the first subpage is not mapped, we will consistently detect it as "mapped exclusively". This change should currently only affect the usage in madvise_free_pte_range() and queue_folios_pte_range() for large folios: if the first page was already unmapped, we would have skipped the folio. [david@redhat.com: folio_likely_mapped_shared() kerneldoc fixup] Link: https://lkml.kernel.org/r/dd0ad9f2-2d7a-45f3-9ba3-979488c7dd27@redhat.com Link: https://lkml.kernel.org/r/20240227201548.857831-1-david@redhat.com Signed-off-by: David Hildenbrand <david@redhat.com> Reviewed-by: Khalid Aziz <khalid.aziz@oracle.com> Acked-by: Barry Song <v-songbaohua@oppo.com> Reviewed-by: Vishal Moola (Oracle) <vishal.moola@gmail.com> Reviewed-by: Ryan Roberts <ryan.roberts@arm.com> Reviewed-by: Zi Yan <ziy@nvidia.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm: remove folio_prep_large_rmappable()Matthew Wilcox (Oracle)2024-04-261-8/+1
| | | | | | | | | | | | | | | Now that prep_compound_page() initialises folio->_deferred_list, folio_prep_large_rmappable()'s only purpose is to set the large_rmappable flag, so inline it into the two callers. Take the opportunity to convert the large_rmappable definition from PAGEFLAG to FOLIO_FLAG and remove the existence of PageTestLargeRmappable and friends. Link: https://lkml.kernel.org/r/20240321142448.1645400-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: David Hildenbrand <david@redhat.com> Acked-by: Vlastimil Babka <vbabka@suse.cz> Cc: Miaohe Lin <linmiaohe@huawei.com> Cc: Muchun Song <muchun.song@linux.dev> Cc: Oscar Salvador <osalvador@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm: always initialise folio->_deferred_listMatthew Wilcox (Oracle)2024-04-261-2/+0
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Patch series "Various significant MM patches". These patches all interact in annoying ways which make it tricky to send them out in any way other than a big batch, even though there's not really an overarching theme to connect them. The big effects of this patch series are: - folio_test_hugetlb() becomes reliable, even when called without a page reference - We free up PG_slab, and we could always use more page flags - We no longer need to check PageSlab before calling page_mapcount() This patch (of 9): For compound pages which are at least order-2 (and hence have a deferred_list), initialise it and then we can check at free that the page is not part of a deferred list. We recently found this useful to rule out a source of corruption. [peterx@redhat.com: always initialise folio->_deferred_list] Link: https://lkml.kernel.org/r/20240417211836.2742593-2-peterx@redhat.com Link: https://lkml.kernel.org/r/20240321142448.1645400-1-willy@infradead.org Link: https://lkml.kernel.org/r/20240321142448.1645400-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> Signed-off-by: Peter Xu <peterx@redhat.com> Reviewed-by: David Hildenbrand <david@redhat.com> Acked-by: Vlastimil Babka <vbabka@suse.cz> Cc: Miaohe Lin <linmiaohe@huawei.com> Cc: Muchun Song <muchun.song@linux.dev> Cc: Oscar Salvador <osalvador@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm: create new codetag references during page splittingSuren Baghdasaryan2024-04-261-0/+2
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | When a high-order page is split into smaller ones, each newly split page should get its codetag. After the split each split page will be referencing the original codetag. The codetag's "bytes" counter remains the same because the amount of allocated memory has not changed, however the "calls" counter gets increased to keep the counter correct when these individual pages get freed. Link: https://lkml.kernel.org/r/20240321163705.3067592-20-surenb@google.com Signed-off-by: Suren Baghdasaryan <surenb@google.com> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Tested-by: Kees Cook <keescook@chromium.org> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Alex Gaynor <alex.gaynor@gmail.com> Cc: Alice Ryhl <aliceryhl@google.com> Cc: Andreas Hindborg <a.hindborg@samsung.com> Cc: Benno Lossin <benno.lossin@proton.me> Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Christoph Lameter <cl@linux.com> Cc: Dennis Zhou <dennis@kernel.org> Cc: Gary Guo <gary@garyguo.net> Cc: Kent Overstreet <kent.overstreet@linux.dev> Cc: Miguel Ojeda <ojeda@kernel.org> Cc: Pasha Tatashin <pasha.tatashin@soleen.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Tejun Heo <tj@kernel.org> Cc: Wedson Almeida Filho <wedsonaf@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* mm/mempolicy: use numa_node_id() instead of cpu_to_node()Donet Tom2024-04-261-1/+1
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Patch series "Allow migrate on protnone reference with MPOL_PREFERRED_MANY policy:, v4. This patchset is to optimize the cross-socket memory access with MPOL_PREFERRED_MANY policy. To test this patch we ran the following test on a 3 node system. Node 0 - 2GB - Tier 1 Node 1 - 11GB - Tier 1 Node 6 - 10GB - Tier 2 Below changes are made to memcached to set the memory policy, It select Node0 and Node1 as preferred nodes. #include <numaif.h> #include <numa.h> unsigned long nodemask; int ret; nodemask = 0x03; ret = set_mempolicy(MPOL_PREFERRED_MANY | MPOL_F_NUMA_BALANCING, &nodemask, 10); /* If MPOL_F_NUMA_BALANCING isn't supported, * fall back to MPOL_PREFERRED_MANY */ if (ret < 0 && errno == EINVAL){ printf("set mem policy normal\n"); ret = set_mempolicy(MPOL_PREFERRED_MANY, &nodemask, 10); } if (ret < 0) { perror("Failed to call set_mempolicy"); exit(-1); } Test Procedure: =============== 1. Make sure memory tiring and demotion are enabled. 2. Start memcached. # ./memcached -b 100000 -m 204800 -u root -c 1000000 -t 7 -d -s "/tmp/memcached.sock" 3. Run memtier_benchmark to store 3200000 keys. #./memtier_benchmark -S "/tmp/memcached.sock" --protocol=memcache_binary --threads=1 --pipeline=1 --ratio=1:0 --key-pattern=S:S --key-minimum=1 --key-maximum=3200000 -n allkeys -c 1 -R -x 1 -d 1024 4. Start a memory eater on node 0 and 1. This will demote all memcached pages to node 6. 5. Make sure all the memcached pages got demoted to lower tier by reading /proc/<memcaced PID>/numa_maps. 
# cat /proc/2771/numa_maps --- default anon=1009 dirty=1009 active=0 N6=1009 kernelpagesize_kB=64 default anon=1009 dirty=1009 active=0 N6=1009 kernelpagesize_kB=64 --- 6. Kill memory eater. 7. Read the pgpromote_success counter. 8. Start reading the keys by running memtier_benchmark. #./memtier_benchmark -S "/tmp/memcached.sock" --protocol=memcache_binary --pipeline=1 --distinct-client-seed --ratio=0:3 --key-pattern=R:R --key-minimum=1 --key-maximum=3200000 -n allkeys --threads=64 -c 1 -R -x 6 9. Read the pgpromote_success counter. Test Results: ============= Without Patch ------------------ 1. pgpromote_success before test Node 0: pgpromote_success 11 Node 1: pgpromote_success 140974 pgpromote_success after test Node 0: pgpromote_success 11 Node 1: pgpromote_success 140974 2. Memtier-benchmark result. AGGREGATED AVERAGE RESULTS (6 runs) ================================================================== Type Ops/sec Hits/sec Misses/sec Avg. Latency p50 Latency ------------------------------------------------------------------ Sets 0.00 --- --- --- --- Gets 305792.03 305791.93 0.10 0.18949 0.16700 Waits 0.00 --- --- --- --- Totals 305792.03 305791.93 0.10 0.18949 0.16700 ====================================== p99 Latency p99.9 Latency KB/sec ------------------------------------- --- --- 0.00 0.44700 1.71100 11542.69 --- --- --- 0.44700 1.71100 11542.69 With Patch --------------- 1. pgpromote_success before test Node 0: pgpromote_success 5 Node 1: pgpromote_success 89386 pgpromote_success after test Node 0: pgpromote_success 57895 Node 1: pgpromote_success 141463 2. Memtier-benchmark result. AGGREGATED AVERAGE RESULTS (6 runs) ==================================================================== Type Ops/sec Hits/sec Misses/sec Avg. 
Latency p50 Latency -------------------------------------------------------------------- Sets 0.00 --- --- --- --- Gets 521942.24 521942.07 0.17 0.11459 0.10300 Waits 0.00 --- --- --- --- Totals 521942.24 521942.07 0.17 0.11459 0.10300 ======================================= p99 Latency p99.9 Latency KB/sec --------------------------------------- --- --- 0.00 0.23100 0.31900 19701.68 --- --- --- 0.23100 0.31900 19701.68 Test Result Analysis: ===================== 1. With patch we could observe pages are getting promoted. 2. Memtier-benchmark results shows that, with the patch, performance has increased more than 50%. Ops/sec without fix - 305792.03 Ops/sec with fix - 521942.24 This patch (of 2): Instead of using 'cpu_to_node()', we use 'numa_node_id()', which is quicker. smp_processor_id is guaranteed to be stable in the 'mpol_misplaced()' function because it is called with ptl held. lockdep_assert_held was added to ensure that. No functional change in this patch. [donettom@linux.ibm.com: add "* @vmf: structure describing the fault" comment] Link: https://lkml.kernel.org/r/d8b993ea9dccfac0bc3ed61d3a81f4ac5f376e46.1711002865.git.donettom@linux.ibm.com Link: https://lkml.kernel.org/r/cover.1711373653.git.donettom@linux.ibm.com Link: https://lkml.kernel.org/r/6059f034f436734b472d066db69676fb3a459864.1711373653.git.donettom@linux.ibm.com Link: https://lkml.kernel.org/r/cover.1709909210.git.donettom@linux.ibm.com Link: https://lkml.kernel.org/r/744646531af02cc687cde8ae788fb1779e99d02c.1709909210.git.donettom@linux.ibm.com Signed-off-by: Aneesh Kumar K.V (IBM) <aneesh.kumar@kernel.org> Signed-off-by: Donet Tom <donettom@linux.ibm.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Feng Tang <feng.tang@intel.com> Cc: Huang, Ying <ying.huang@intel.com> Cc: Hugh Dickins <hughd@google.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Kefeng Wang 
<wangkefeng.wang@huawei.com> Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org> Cc: Mel Gorman <mgorman@suse.de> Cc: Michal Hocko <mhocko@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@surriel.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
* userfaultfd: change src_folio after ensuring it's unpinned in UFFDIO_MOVELokesh Gidra2024-04-171-3/+3
| | | | | | | | | | | | | | | | | | | | | | | Commit d7a08838ab74 ("mm: userfaultfd: fix unexpected change to src_folio when UFFDIO_MOVE fails") moved the src_folio->{mapping, index} changing to after clearing the page-table and ensuring that it's not pinned. This avoids failure of swapout+migration and possibly memory corruption. However, the commit missed fixing it in the huge-page case. Link: https://lkml.kernel.org/r/20240404171726.2302435-1-lokeshgidra@google.com Fixes: adef440691ba ("userfaultfd: UFFDIO_MOVE uABI") Signed-off-by: Lokesh Gidra <lokeshgidra@google.com> Acked-by: David Hildenbrand <david@redhat.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Kalesh Singh <kaleshsingh@google.com> Cc: Lokesh Gidra <lokeshgidra@google.com> Cc: Nicolas Geoffray <ngeoffray@google.com> Cc: Peter Xu <peterx@redhat.com> Cc: Qi Zheng <zhengqi.arch@bytedance.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>