summaryrefslogtreecommitdiffstats
path: root/mm/memory-failure.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-04-28 04:42:02 +0200
committerLinus Torvalds <torvalds@linux-foundation.org>2023-04-28 04:42:02 +0200
commit7fa8a8ee9400fe8ec188426e40e481717bc5e924 (patch)
treecc8fd6b4f936ec01e73238643757451e20478c07 /mm/memory-failure.c
parentMerge tag 'mips_6.4' of git://git.kernel.org/pub/scm/linux/kernel/git/mips/linux (diff)
parentmm,unmap: avoid flushing TLB in batch if PTE is inaccessible (diff)
downloadlinux-7fa8a8ee9400fe8ec188426e40e481717bc5e924.tar.xz
linux-7fa8a8ee9400fe8ec188426e40e481717bc5e924.zip
Merge tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton: - Nick Piggin's "shoot lazy tlbs" series, to improve the peformance of switching from a user process to a kernel thread. - More folio conversions from Kefeng Wang, Zhang Peng and Pankaj Raghav. - zsmalloc performance improvements from Sergey Senozhatsky. - Yue Zhao has found and fixed some data race issues around the alteration of memcg userspace tunables. - VFS rationalizations from Christoph Hellwig: - removal of most of the callers of write_one_page() - make __filemap_get_folio()'s return value more useful - Luis Chamberlain has changed tmpfs so it no longer requires swap backing. Use `mount -o noswap'. - Qi Zheng has made the slab shrinkers operate locklessly, providing some scalability benefits. - Keith Busch has improved dmapool's performance, making part of its operations O(1) rather than O(n). - Peter Xu adds the UFFD_FEATURE_WP_UNPOPULATED feature to userfaultd, permitting userspace to wr-protect anon memory unpopulated ptes. - Kirill Shutemov has changed MAX_ORDER's meaning to be inclusive rather than exclusive, and has fixed a bunch of errors which were caused by its unintuitive meaning. - Axel Rasmussen give userfaultfd the UFFDIO_CONTINUE_MODE_WP feature, which causes minor faults to install a write-protected pte. - Vlastimil Babka has done some maintenance work on vma_merge(): cleanups to the kernel code and improvements to our userspace test harness. - Cleanups to do_fault_around() by Lorenzo Stoakes. - Mike Rapoport has moved a lot of initialization code out of various mm/ files and into mm/mm_init.c. - Lorenzo Stoakes removd vmf_insert_mixed_prot(), which was added for DRM, but DRM doesn't use it any more. - Lorenzo has also coverted read_kcore() and vread() to use iterators and has thereby removed the use of bounce buffers in some cases. - Lorenzo has also contributed further cleanups of vma_merge(). - Chaitanya Prakash provides some fixes to the mmap selftesting code. - Matthew Wilcox changes xfs and afs so they no longer take sleeping locks in ->map_page(), a step towards RCUification of pagefaults. - Suren Baghdasaryan has improved mmap_lock scalability by switching to per-VMA locking. - Frederic Weisbecker has reworked the percpu cache draining so that it no longer causes latency glitches on cpu isolated workloads. - Mike Rapoport cleans up and corrects the ARCH_FORCE_MAX_ORDER Kconfig logic. - Liu Shixin has changed zswap's initialization so we no longer waste a chunk of memory if zswap is not being used. - Yosry Ahmed has improved the performance of memcg statistics flushing. - David Stevens has fixed several issues involving khugepaged, userfaultfd and shmem. - Christoph Hellwig has provided some cleanup work to zram's IO-related code paths. - David Hildenbrand has fixed up some issues in the selftest code's testing of our pte state changing. - Pankaj Raghav has made page_endio() unneeded and has removed it. - Peter Xu contributed some rationalizations of the userfaultfd selftests. - Yosry Ahmed has fixed an issue around memcg's page recalim accounting. - Chaitanya Prakash has fixed some arm-related issues in the selftests/mm code. - Longlong Xia has improved the way in which KSM handles hwpoisoned pages. - Peter Xu fixes a few issues with uffd-wp at fork() time. - Stefan Roesch has changed KSM so that it may now be used on a per-process and per-cgroup basis. * tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (369 commits) mm,unmap: avoid flushing TLB in batch if PTE is inaccessible shmem: restrict noswap option to initial user namespace mm/khugepaged: fix conflicting mods to collapse_file() sparse: remove unnecessary 0 values from rc mm: move 'mmap_min_addr' logic from callers into vm_unmapped_area() hugetlb: pte_alloc_huge() to replace huge pte_alloc_map() maple_tree: fix allocation in mas_sparse_area() mm: do not increment pgfault stats when page fault handler retries zsmalloc: allow only one active pool compaction context selftests/mm: add new selftests for KSM mm: add new KSM process and sysfs knobs mm: add new api to enable ksm per process mm: shrinkers: fix debugfs file permissions mm: don't check VMA write permissions if the PTE/PMD indicates write permissions migrate_pages_batch: fix statistics for longterm pin retry userfaultfd: use helper function range_in_vma() lib/show_mem.c: use for_each_populated_zone() simplify code mm: correct arg in reclaim_pages()/reclaim_clean_pages_from_list() fs/buffer: convert create_page_buffers to folio_create_buffers fs/buffer: add folio_create_empty_buffers helper ...
Diffstat (limited to 'mm/memory-failure.c')
-rw-r--r--mm/memory-failure.c65
1 files changed, 47 insertions, 18 deletions
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 10e60b6b2447..5b663eca1f29 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -200,7 +200,7 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo
return true;
}
-#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
+#if IS_ENABLED(CONFIG_HWPOISON_INJECT)
u32 hwpoison_filter_enable = 0;
u32 hwpoison_filter_dev_major = ~0U;
@@ -437,9 +437,9 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
* page->mapping are sufficient for mapping the page back to its
* corresponding user virtual address.
*/
-static void add_to_kill(struct task_struct *tsk, struct page *p,
- pgoff_t fsdax_pgoff, struct vm_area_struct *vma,
- struct list_head *to_kill)
+static void __add_to_kill(struct task_struct *tsk, struct page *p,
+ struct vm_area_struct *vma, struct list_head *to_kill,
+ unsigned long ksm_addr, pgoff_t fsdax_pgoff)
{
struct to_kill *tk;
@@ -449,7 +449,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
return;
}
- tk->addr = page_address_in_vma(p, vma);
+ tk->addr = ksm_addr ? ksm_addr : page_address_in_vma(p, vma);
if (is_zone_device_page(p)) {
if (fsdax_pgoff != FSDAX_INVALID_PGOFF)
tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
@@ -480,6 +480,34 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
list_add_tail(&tk->nd, to_kill);
}
+static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p,
+ struct vm_area_struct *vma,
+ struct list_head *to_kill)
+{
+ __add_to_kill(tsk, p, vma, to_kill, 0, FSDAX_INVALID_PGOFF);
+}
+
+#ifdef CONFIG_KSM
+static bool task_in_to_kill_list(struct list_head *to_kill,
+ struct task_struct *tsk)
+{
+ struct to_kill *tk, *next;
+
+ list_for_each_entry_safe(tk, next, to_kill, nd) {
+ if (tk->tsk == tsk)
+ return true;
+ }
+
+ return false;
+}
+void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
+ struct vm_area_struct *vma, struct list_head *to_kill,
+ unsigned long ksm_addr)
+{
+ if (!task_in_to_kill_list(to_kill, tsk))
+ __add_to_kill(tsk, p, vma, to_kill, ksm_addr, FSDAX_INVALID_PGOFF);
+}
+#endif
/*
* Kill the processes that have been collected earlier.
*
@@ -559,8 +587,7 @@ static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
* processes sharing the same error page,if the process is "early kill", the
* task_struct of the dedicated thread will also be returned.
*/
-static struct task_struct *task_early_kill(struct task_struct *tsk,
- int force_early)
+struct task_struct *task_early_kill(struct task_struct *tsk, int force_early)
{
if (!tsk->mm)
return NULL;
@@ -605,7 +632,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
continue;
if (!page_mapped_in_vma(page, vma))
continue;
- add_to_kill(t, page, FSDAX_INVALID_PGOFF, vma, to_kill);
+ add_to_kill_anon_file(t, page, vma, to_kill);
}
}
read_unlock(&tasklist_lock);
@@ -641,8 +668,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
* to be informed of all such data corruptions.
*/
if (vma->vm_mm == t->mm)
- add_to_kill(t, page, FSDAX_INVALID_PGOFF, vma,
- to_kill);
+ add_to_kill_anon_file(t, page, vma, to_kill);
}
}
read_unlock(&tasklist_lock);
@@ -650,6 +676,13 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
}
#ifdef CONFIG_FS_DAX
+static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p,
+ struct vm_area_struct *vma,
+ struct list_head *to_kill, pgoff_t pgoff)
+{
+ __add_to_kill(tsk, p, vma, to_kill, 0, pgoff);
+}
+
/*
* Collect processes when the error hit a fsdax page.
*/
@@ -669,7 +702,7 @@ static void collect_procs_fsdax(struct page *page,
continue;
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
if (vma->vm_mm == t->mm)
- add_to_kill(t, page, pgoff, vma, to_kill);
+ add_to_kill_fsdax(t, page, vma, to_kill, pgoff);
}
}
read_unlock(&tasklist_lock);
@@ -685,8 +718,9 @@ static void collect_procs(struct page *page, struct list_head *tokill,
{
if (!page->mapping)
return;
-
- if (PageAnon(page))
+ if (unlikely(PageKsm(page)))
+ collect_procs_ksm(page, tokill, force_early);
+ else if (PageAnon(page))
collect_procs_anon(page, tokill, force_early);
else
collect_procs_file(page, tokill, force_early);
@@ -1541,11 +1575,6 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (!page_mapped(hpage))
return true;
- if (PageKsm(p)) {
- pr_err("%#lx: can't handle KSM pages.\n", pfn);
- return false;
- }
-
if (PageSwapCache(p)) {
pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
ttu &= ~TTU_HWPOISON;