summaryrefslogtreecommitdiffstats
path: root/lib
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-04-28 04:42:02 +0200
committerLinus Torvalds <torvalds@linux-foundation.org>2023-04-28 04:42:02 +0200
commit7fa8a8ee9400fe8ec188426e40e481717bc5e924 (patch)
treecc8fd6b4f936ec01e73238643757451e20478c07 /lib
parentMerge tag 'mips_6.4' of git://git.kernel.org/pub/scm/linux/kernel/git/mips/linux (diff)
parentmm,unmap: avoid flushing TLB in batch if PTE is inaccessible (diff)
downloadlinux-7fa8a8ee9400fe8ec188426e40e481717bc5e924.tar.xz
linux-7fa8a8ee9400fe8ec188426e40e481717bc5e924.zip
Merge tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton: - Nick Piggin's "shoot lazy tlbs" series, to improve the peformance of switching from a user process to a kernel thread. - More folio conversions from Kefeng Wang, Zhang Peng and Pankaj Raghav. - zsmalloc performance improvements from Sergey Senozhatsky. - Yue Zhao has found and fixed some data race issues around the alteration of memcg userspace tunables. - VFS rationalizations from Christoph Hellwig: - removal of most of the callers of write_one_page() - make __filemap_get_folio()'s return value more useful - Luis Chamberlain has changed tmpfs so it no longer requires swap backing. Use `mount -o noswap'. - Qi Zheng has made the slab shrinkers operate locklessly, providing some scalability benefits. - Keith Busch has improved dmapool's performance, making part of its operations O(1) rather than O(n). - Peter Xu adds the UFFD_FEATURE_WP_UNPOPULATED feature to userfaultd, permitting userspace to wr-protect anon memory unpopulated ptes. - Kirill Shutemov has changed MAX_ORDER's meaning to be inclusive rather than exclusive, and has fixed a bunch of errors which were caused by its unintuitive meaning. - Axel Rasmussen give userfaultfd the UFFDIO_CONTINUE_MODE_WP feature, which causes minor faults to install a write-protected pte. - Vlastimil Babka has done some maintenance work on vma_merge(): cleanups to the kernel code and improvements to our userspace test harness. - Cleanups to do_fault_around() by Lorenzo Stoakes. - Mike Rapoport has moved a lot of initialization code out of various mm/ files and into mm/mm_init.c. - Lorenzo Stoakes removd vmf_insert_mixed_prot(), which was added for DRM, but DRM doesn't use it any more. - Lorenzo has also coverted read_kcore() and vread() to use iterators and has thereby removed the use of bounce buffers in some cases. - Lorenzo has also contributed further cleanups of vma_merge(). - Chaitanya Prakash provides some fixes to the mmap selftesting code. - Matthew Wilcox changes xfs and afs so they no longer take sleeping locks in ->map_page(), a step towards RCUification of pagefaults. - Suren Baghdasaryan has improved mmap_lock scalability by switching to per-VMA locking. - Frederic Weisbecker has reworked the percpu cache draining so that it no longer causes latency glitches on cpu isolated workloads. - Mike Rapoport cleans up and corrects the ARCH_FORCE_MAX_ORDER Kconfig logic. - Liu Shixin has changed zswap's initialization so we no longer waste a chunk of memory if zswap is not being used. - Yosry Ahmed has improved the performance of memcg statistics flushing. - David Stevens has fixed several issues involving khugepaged, userfaultfd and shmem. - Christoph Hellwig has provided some cleanup work to zram's IO-related code paths. - David Hildenbrand has fixed up some issues in the selftest code's testing of our pte state changing. - Pankaj Raghav has made page_endio() unneeded and has removed it. - Peter Xu contributed some rationalizations of the userfaultfd selftests. - Yosry Ahmed has fixed an issue around memcg's page recalim accounting. - Chaitanya Prakash has fixed some arm-related issues in the selftests/mm code. - Longlong Xia has improved the way in which KSM handles hwpoisoned pages. - Peter Xu fixes a few issues with uffd-wp at fork() time. - Stefan Roesch has changed KSM so that it may now be used on a per-process and per-cgroup basis. * tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (369 commits) mm,unmap: avoid flushing TLB in batch if PTE is inaccessible shmem: restrict noswap option to initial user namespace mm/khugepaged: fix conflicting mods to collapse_file() sparse: remove unnecessary 0 values from rc mm: move 'mmap_min_addr' logic from callers into vm_unmapped_area() hugetlb: pte_alloc_huge() to replace huge pte_alloc_map() maple_tree: fix allocation in mas_sparse_area() mm: do not increment pgfault stats when page fault handler retries zsmalloc: allow only one active pool compaction context selftests/mm: add new selftests for KSM mm: add new KSM process and sysfs knobs mm: add new api to enable ksm per process mm: shrinkers: fix debugfs file permissions mm: don't check VMA write permissions if the PTE/PMD indicates write permissions migrate_pages_batch: fix statistics for longterm pin retry userfaultfd: use helper function range_in_vma() lib/show_mem.c: use for_each_populated_zone() simplify code mm: correct arg in reclaim_pages()/reclaim_clean_pages_from_list() fs/buffer: convert create_page_buffers to folio_create_buffers fs/buffer: add folio_create_empty_buffers helper ...
Diffstat (limited to 'lib')
-rw-r--r--lib/Kconfig.debug10
-rw-r--r--lib/iov_iter.c48
-rw-r--r--lib/maple_tree.c78
-rw-r--r--lib/show_mem.c19
-rw-r--r--lib/stackdepot.c12
-rw-r--r--lib/test_printf.c26
-rw-r--r--lib/test_vmalloc.c37
-rw-r--r--lib/vsprintf.c21
8 files changed, 187 insertions, 64 deletions
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 786ba9fd8555..be272aa2fc0a 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -791,6 +791,16 @@ config DEBUG_VM
If unsure, say N.
+config DEBUG_VM_SHOOT_LAZIES
+ bool "Debug MMU_LAZY_TLB_SHOOTDOWN implementation"
+ depends on DEBUG_VM
+ depends on MMU_LAZY_TLB_SHOOTDOWN
+ help
+ Enable additional IPIs that ensure lazy tlb mm references are removed
+ before the mm is freed.
+
+ If unsure, say N.
+
config DEBUG_VM_MAPLE_TREE
bool "Debug VM maple trees"
depends on DEBUG_VM
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 967fba189c5f..c3dbe994112c 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -172,6 +172,18 @@ static int copyout(void __user *to, const void *from, size_t n)
return n;
}
+static int copyout_nofault(void __user *to, const void *from, size_t n)
+{
+ long res;
+
+ if (should_fail_usercopy())
+ return n;
+
+ res = copy_to_user_nofault(to, from, n);
+
+ return res < 0 ? n : res;
+}
+
static int copyin(void *to, const void __user *from, size_t n)
{
size_t res = n;
@@ -734,6 +746,42 @@ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
}
EXPORT_SYMBOL(copy_page_to_iter);
+size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes,
+ struct iov_iter *i)
+{
+ size_t res = 0;
+
+ if (!page_copy_sane(page, offset, bytes))
+ return 0;
+ if (WARN_ON_ONCE(i->data_source))
+ return 0;
+ if (unlikely(iov_iter_is_pipe(i)))
+ return copy_page_to_iter_pipe(page, offset, bytes, i);
+ page += offset / PAGE_SIZE; // first subpage
+ offset %= PAGE_SIZE;
+ while (1) {
+ void *kaddr = kmap_local_page(page);
+ size_t n = min(bytes, (size_t)PAGE_SIZE - offset);
+
+ iterate_and_advance(i, n, base, len, off,
+ copyout_nofault(base, kaddr + offset + off, len),
+ memcpy(base, kaddr + offset + off, len)
+ )
+ kunmap_local(kaddr);
+ res += n;
+ bytes -= n;
+ if (!bytes || !n)
+ break;
+ offset += n;
+ if (offset == PAGE_SIZE) {
+ page++;
+ offset = 0;
+ }
+ }
+ return res;
+}
+EXPORT_SYMBOL(copy_page_to_iter_nofault);
+
size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
struct iov_iter *i)
{
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 1281a40d5735..110a36479dce 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -2312,9 +2312,7 @@ static inline struct maple_enode *mte_node_or_none(struct maple_enode *enode)
static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas)
{
struct ma_state *mas = wr_mas->mas;
- unsigned char count;
- unsigned char offset;
- unsigned long index, min, max;
+ unsigned char count, offset;
if (unlikely(ma_is_dense(wr_mas->type))) {
wr_mas->r_max = wr_mas->r_min = mas->index;
@@ -2327,34 +2325,12 @@ static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas)
count = wr_mas->node_end = ma_data_end(wr_mas->node, wr_mas->type,
wr_mas->pivots, mas->max);
offset = mas->offset;
- min = mas_safe_min(mas, wr_mas->pivots, offset);
- if (unlikely(offset == count))
- goto max;
-
- max = wr_mas->pivots[offset];
- index = mas->index;
- if (unlikely(index <= max))
- goto done;
-
- if (unlikely(!max && offset))
- goto max;
-
- min = max + 1;
- while (++offset < count) {
- max = wr_mas->pivots[offset];
- if (index <= max)
- goto done;
- else if (unlikely(!max))
- break;
- min = max + 1;
- }
+ while (offset < count && mas->index > wr_mas->pivots[offset])
+ offset++;
-max:
- max = mas->max;
-done:
- wr_mas->r_max = max;
- wr_mas->r_min = min;
+ wr_mas->r_max = offset < count ? wr_mas->pivots[offset] : mas->max;
+ wr_mas->r_min = mas_safe_min(mas, wr_mas->pivots, offset);
wr_mas->offset_end = mas->offset = offset;
}
@@ -3282,7 +3258,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end
if (tmp < max_p)
memset(pivs + tmp, 0,
- sizeof(unsigned long *) * (max_p - tmp));
+ sizeof(unsigned long) * (max_p - tmp));
if (tmp < mt_slots[mt])
memset(slots + tmp, 0, sizeof(void *) * (max_s - tmp));
@@ -5274,25 +5250,28 @@ static inline void mas_fill_gap(struct ma_state *mas, void *entry,
* @size: The size of the gap
* @fwd: Searching forward or back
*/
-static inline void mas_sparse_area(struct ma_state *mas, unsigned long min,
+static inline int mas_sparse_area(struct ma_state *mas, unsigned long min,
unsigned long max, unsigned long size, bool fwd)
{
- unsigned long start = 0;
-
- if (!unlikely(mas_is_none(mas)))
- start++;
+ if (!unlikely(mas_is_none(mas)) && min == 0) {
+ min++;
+ /*
+ * At this time, min is increased, we need to recheck whether
+ * the size is satisfied.
+ */
+ if (min > max || max - min + 1 < size)
+ return -EBUSY;
+ }
/* mas_is_ptr */
- if (start < min)
- start = min;
-
if (fwd) {
- mas->index = start;
- mas->last = start + size - 1;
- return;
+ mas->index = min;
+ mas->last = min + size - 1;
+ } else {
+ mas->last = max;
+ mas->index = max - size + 1;
}
-
- mas->index = max;
+ return 0;
}
/*
@@ -5321,10 +5300,8 @@ int mas_empty_area(struct ma_state *mas, unsigned long min,
return -EBUSY;
/* Empty set */
- if (mas_is_none(mas) || mas_is_ptr(mas)) {
- mas_sparse_area(mas, min, max, size, true);
- return 0;
- }
+ if (mas_is_none(mas) || mas_is_ptr(mas))
+ return mas_sparse_area(mas, min, max, size, true);
/* The start of the window can only be within these values */
mas->index = min;
@@ -5380,10 +5357,8 @@ int mas_empty_area_rev(struct ma_state *mas, unsigned long min,
}
/* Empty set. */
- if (mas_is_none(mas) || mas_is_ptr(mas)) {
- mas_sparse_area(mas, min, max, size, false);
- return 0;
- }
+ if (mas_is_none(mas) || mas_is_ptr(mas))
+ return mas_sparse_area(mas, min, max, size, false);
/* The start of the window can only be within these values. */
mas->index = min;
@@ -5815,6 +5790,7 @@ int mas_preallocate(struct ma_state *mas, gfp_t gfp)
mas_reset(mas);
return ret;
}
+EXPORT_SYMBOL_GPL(mas_preallocate);
/*
* mas_destroy() - destroy a maple state.
diff --git a/lib/show_mem.c b/lib/show_mem.c
index 0d7585cde2a6..1485c87be935 100644
--- a/lib/show_mem.c
+++ b/lib/show_mem.c
@@ -10,26 +10,19 @@
void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
{
- pg_data_t *pgdat;
unsigned long total = 0, reserved = 0, highmem = 0;
+ struct zone *zone;
printk("Mem-Info:\n");
__show_free_areas(filter, nodemask, max_zone_idx);
- for_each_online_pgdat(pgdat) {
- int zoneid;
+ for_each_populated_zone(zone) {
- for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
- struct zone *zone = &pgdat->node_zones[zoneid];
- if (!populated_zone(zone))
- continue;
+ total += zone->present_pages;
+ reserved += zone->present_pages - zone_managed_pages(zone);
- total += zone->present_pages;
- reserved += zone->present_pages - zone_managed_pages(zone);
-
- if (is_highmem_idx(zoneid))
- highmem += zone->present_pages;
- }
+ if (is_highmem(zone))
+ highmem += zone->present_pages;
}
printk("%lu pages RAM\n", total);
diff --git a/lib/stackdepot.c b/lib/stackdepot.c
index 036da8e295d1..2f5aa851834e 100644
--- a/lib/stackdepot.c
+++ b/lib/stackdepot.c
@@ -17,6 +17,7 @@
#include <linux/gfp.h>
#include <linux/jhash.h>
#include <linux/kernel.h>
+#include <linux/kmsan.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
@@ -306,6 +307,11 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc)
stack->handle.extra = 0;
memcpy(stack->entries, entries, flex_array_size(stack, entries, size));
pool_offset += required_size;
+ /*
+ * Let KMSAN know the stored stack record is initialized. This shall
+ * prevent false positive reports if instrumented code accesses it.
+ */
+ kmsan_unpoison_memory(stack, required_size);
return stack;
}
@@ -465,6 +471,12 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle,
struct stack_record *stack;
*entries = NULL;
+ /*
+ * Let KMSAN know *entries is initialized. This shall prevent false
+ * positive reports if instrumented code accesses it.
+ */
+ kmsan_unpoison_memory(entries, sizeof(*entries));
+
if (!handle)
return 0;
diff --git a/lib/test_printf.c b/lib/test_printf.c
index 46b4e6c414a3..7677ebccf3c3 100644
--- a/lib/test_printf.c
+++ b/lib/test_printf.c
@@ -642,12 +642,26 @@ page_flags_test(int section, int node, int zone, int last_cpupid,
test(cmp_buf, "%pGp", &flags);
}
+static void __init page_type_test(unsigned int page_type, const char *name,
+ char *cmp_buf)
+{
+ unsigned long size;
+
+ size = scnprintf(cmp_buf, BUF_SIZE, "%#x(", page_type);
+ if (page_type_has_type(page_type))
+ size += scnprintf(cmp_buf + size, BUF_SIZE - size, "%s", name);
+
+ snprintf(cmp_buf + size, BUF_SIZE - size, ")");
+ test(cmp_buf, "%pGt", &page_type);
+}
+
static void __init
flags(void)
{
unsigned long flags;
char *cmp_buffer;
gfp_t gfp;
+ unsigned int page_type;
cmp_buffer = kmalloc(BUF_SIZE, GFP_KERNEL);
if (!cmp_buffer)
@@ -687,6 +701,18 @@ flags(void)
gfp |= __GFP_HIGH;
test(cmp_buffer, "%pGg", &gfp);
+ page_type = ~0;
+ page_type_test(page_type, "", cmp_buffer);
+
+ page_type = 10;
+ page_type_test(page_type, "", cmp_buffer);
+
+ page_type = ~PG_buddy;
+ page_type_test(page_type, "buddy", cmp_buffer);
+
+ page_type = ~(PG_table | PG_buddy);
+ page_type_test(page_type, "table|buddy", cmp_buffer);
+
kfree(cmp_buffer);
}
diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c
index cd2bdba6d3ed..9dd9745d365f 100644
--- a/lib/test_vmalloc.c
+++ b/lib/test_vmalloc.c
@@ -53,6 +53,7 @@ __param(int, run_test_mask, INT_MAX,
"\t\tid: 128, name: pcpu_alloc_test\n"
"\t\tid: 256, name: kvfree_rcu_1_arg_vmalloc_test\n"
"\t\tid: 512, name: kvfree_rcu_2_arg_vmalloc_test\n"
+ "\t\tid: 1024, name: vm_map_ram_test\n"
/* Add a new test case description here. */
);
@@ -358,6 +359,41 @@ kvfree_rcu_2_arg_vmalloc_test(void)
return 0;
}
+static int
+vm_map_ram_test(void)
+{
+ unsigned long nr_allocated;
+ unsigned int map_nr_pages;
+ unsigned char *v_ptr;
+ struct page **pages;
+ int i;
+
+ map_nr_pages = nr_pages > 0 ? nr_pages:1;
+ pages = kmalloc(map_nr_pages * sizeof(struct page), GFP_KERNEL);
+ if (!pages)
+ return -1;
+
+ nr_allocated = alloc_pages_bulk_array(GFP_KERNEL, map_nr_pages, pages);
+ if (nr_allocated != map_nr_pages)
+ goto cleanup;
+
+ /* Run the test loop. */
+ for (i = 0; i < test_loop_count; i++) {
+ v_ptr = vm_map_ram(pages, map_nr_pages, NUMA_NO_NODE);
+ *v_ptr = 'a';
+ vm_unmap_ram(v_ptr, map_nr_pages);
+ }
+
+cleanup:
+ for (i = 0; i < nr_allocated; i++)
+ __free_page(pages[i]);
+
+ kfree(pages);
+
+ /* 0 indicates success. */
+ return nr_allocated != map_nr_pages;
+}
+
struct test_case_desc {
const char *test_name;
int (*test_func)(void);
@@ -374,6 +410,7 @@ static struct test_case_desc test_case_array[] = {
{ "pcpu_alloc_test", pcpu_alloc_test },
{ "kvfree_rcu_1_arg_vmalloc_test", kvfree_rcu_1_arg_vmalloc_test },
{ "kvfree_rcu_2_arg_vmalloc_test", kvfree_rcu_2_arg_vmalloc_test },
+ { "vm_map_ram_test", vm_map_ram_test },
/* Add a new test case here. */
};
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 426418253fd4..40f560959b16 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -2052,6 +2052,25 @@ char *format_page_flags(char *buf, char *end, unsigned long flags)
return buf;
}
+static
+char *format_page_type(char *buf, char *end, unsigned int page_type)
+{
+ buf = number(buf, end, page_type, default_flag_spec);
+
+ if (buf < end)
+ *buf = '(';
+ buf++;
+
+ if (page_type_has_type(page_type))
+ buf = format_flags(buf, end, ~page_type, pagetype_names);
+
+ if (buf < end)
+ *buf = ')';
+ buf++;
+
+ return buf;
+}
+
static noinline_for_stack
char *flags_string(char *buf, char *end, void *flags_ptr,
struct printf_spec spec, const char *fmt)
@@ -2065,6 +2084,8 @@ char *flags_string(char *buf, char *end, void *flags_ptr,
switch (fmt[1]) {
case 'p':
return format_page_flags(buf, end, *(unsigned long *)flags_ptr);
+ case 't':
+ return format_page_type(buf, end, *(unsigned int *)flags_ptr);
case 'v':
flags = *(unsigned long *)flags_ptr;
names = vmaflag_names;