diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-04-22 19:10:43 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-04-22 19:10:43 +0200 |
commit | 281b9d9a4b02229b602a14f7540206b0fbe4134f (patch) | |
tree | dd5c1aee3a7effe79c1bcc2fd035500d38b6d373 /mm | |
parent | mm/vmalloc: huge vmalloc backing pages should be split rather than compound (diff) | |
parent | mm/mmu_notifier.c: fix race in mmu_interval_notifier_remove() (diff) | |
download | linux-281b9d9a4b02229b602a14f7540206b0fbe4134f.tar.xz linux-281b9d9a4b02229b602a14f7540206b0fbe4134f.zip |
Merge branch 'akpm' (patches from Andrew)
Merge misc fixes from Andrew Morton:
"13 patches.
Subsystems affected by this patch series: mm (memory-failure, memcg,
userfaultfd, hugetlbfs, mremap, oom-kill, kasan, hmm), and kcov"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
mm/mmu_notifier.c: fix race in mmu_interval_notifier_remove()
kcov: don't generate a warning on vm_insert_page()'s failure
MAINTAINERS: add Vincenzo Frascino to KASAN reviewers
oom_kill.c: futex: delay the OOM reaper to allow time for proper futex cleanup
selftest/vm: add skip support to mremap_test
selftest/vm: support xfail in mremap_test
selftest/vm: verify remap destination address in mremap_test
selftest/vm: verify mmap addr in mremap_test
mm, hugetlb: allow for "high" userspace addresses
userfaultfd: mark uffd_wp regardless of VM_WRITE flag
memcg: sync flush only if periodic flush is delayed
mm/memory-failure.c: skip huge_zero_page in memory_failure()
mm/hwpoison: fix race between hugetlb free/demotion and memory_failure_hugetlb()
Diffstat (limited to 'mm')
-rw-r--r-- | mm/hugetlb.c | 10 | ||||
-rw-r--r-- | mm/memcontrol.c | 12 | ||||
-rw-r--r-- | mm/memory-failure.c | 158 | ||||
-rw-r--r-- | mm/mmap.c | 8 | ||||
-rw-r--r-- | mm/mmu_notifier.c | 14 | ||||
-rw-r--r-- | mm/oom_kill.c | 54 | ||||
-rw-r--r-- | mm/userfaultfd.c | 15 | ||||
-rw-r--r-- | mm/workingset.c | 2 |
8 files changed, 200 insertions, 73 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f8ca7cca3c1a..3fc721789743 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6785,6 +6785,16 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb) return ret; } +int get_huge_page_for_hwpoison(unsigned long pfn, int flags) +{ + int ret; + + spin_lock_irq(&hugetlb_lock); + ret = __get_huge_page_for_hwpoison(pfn, flags); + spin_unlock_irq(&hugetlb_lock); + return ret; +} + void putback_active_hugepage(struct page *page) { spin_lock_irq(&hugetlb_lock); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 725f76723220..598fece89e2b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -587,6 +587,9 @@ static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); static DEFINE_SPINLOCK(stats_flush_lock); static DEFINE_PER_CPU(unsigned int, stats_updates); static atomic_t stats_flush_threshold = ATOMIC_INIT(0); +static u64 flush_next_time; + +#define FLUSH_TIME (2UL*HZ) /* * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can @@ -637,6 +640,7 @@ static void __mem_cgroup_flush_stats(void) if (!spin_trylock_irqsave(&stats_flush_lock, flag)) return; + flush_next_time = jiffies_64 + 2*FLUSH_TIME; cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup); atomic_set(&stats_flush_threshold, 0); spin_unlock_irqrestore(&stats_flush_lock, flag); @@ -648,10 +652,16 @@ void mem_cgroup_flush_stats(void) __mem_cgroup_flush_stats(); } +void mem_cgroup_flush_stats_delayed(void) +{ + if (time_after64(jiffies_64, flush_next_time)) + mem_cgroup_flush_stats(); +} + static void flush_memcg_stats_dwork(struct work_struct *w) { __mem_cgroup_flush_stats(); - queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); } /** diff --git a/mm/memory-failure.c b/mm/memory-failure.c index dcb6bb9cf731..27760c19bad7 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1498,50 +1498,113 @@ static int try_to_split_thp_page(struct page *page, const char *msg) return 0; } -static int memory_failure_hugetlb(unsigned long pfn, int flags) +/* + * Called from hugetlb code with hugetlb_lock held. + * + * Return values: + * 0 - free hugepage + * 1 - in-use hugepage + * 2 - not a hugepage + * -EBUSY - the hugepage is busy (try to retry) + * -EHWPOISON - the hugepage is already hwpoisoned + */ +int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) +{ + struct page *page = pfn_to_page(pfn); + struct page *head = compound_head(page); + int ret = 2; /* fallback to normal page handling */ + bool count_increased = false; + + if (!PageHeadHuge(head)) + goto out; + + if (flags & MF_COUNT_INCREASED) { + ret = 1; + count_increased = true; + } else if (HPageFreed(head) || HPageMigratable(head)) { + ret = get_page_unless_zero(head); + if (ret) + count_increased = true; + } else { + ret = -EBUSY; + goto out; + } + + if (TestSetPageHWPoison(head)) { + ret = -EHWPOISON; + goto out; + } + + return ret; +out: + if (count_increased) + put_page(head); + return ret; +} + +#ifdef CONFIG_HUGETLB_PAGE +/* + * Taking refcount of hugetlb pages needs extra care about race conditions + * with basic operations like hugepage allocation/free/demotion. + * So some of prechecks for hwpoison (pinning, and testing/setting + * PageHWPoison) should be done in single hugetlb_lock range. + */ +static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb) { - struct page *p = pfn_to_page(pfn); - struct page *head = compound_head(p); int res; + struct page *p = pfn_to_page(pfn); + struct page *head; unsigned long page_flags; + bool retry = true; - if (TestSetPageHWPoison(head)) { - pr_err("Memory failure: %#lx: already hardware poisoned\n", - pfn); - res = -EHWPOISON; - if (flags & MF_ACTION_REQUIRED) + *hugetlb = 1; +retry: + res = get_huge_page_for_hwpoison(pfn, flags); + if (res == 2) { /* fallback to normal page handling */ + *hugetlb = 0; + return 0; + } else if (res == -EHWPOISON) { + pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); + if (flags & MF_ACTION_REQUIRED) { + head = compound_head(p); res = kill_accessing_process(current, page_to_pfn(head), flags); + } + return res; + } else if (res == -EBUSY) { + if (retry) { + retry = false; + goto retry; + } + action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); return res; } + head = compound_head(p); + lock_page(head); + + if (hwpoison_filter(p)) { + ClearPageHWPoison(head); + res = -EOPNOTSUPP; + goto out; + } + num_poisoned_pages_inc(); - if (!(flags & MF_COUNT_INCREASED)) { - res = get_hwpoison_page(p, flags); - if (!res) { - lock_page(head); - if (hwpoison_filter(p)) { - if (TestClearPageHWPoison(head)) - num_poisoned_pages_dec(); - unlock_page(head); - return -EOPNOTSUPP; - } - unlock_page(head); - res = MF_FAILED; - if (__page_handle_poison(p)) { - page_ref_inc(p); - res = MF_RECOVERED; - } - action_result(pfn, MF_MSG_FREE_HUGE, res); - return res == MF_RECOVERED ? 0 : -EBUSY; - } else if (res < 0) { - action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); - return -EBUSY; + /* + * Handling free hugepage. The possible race with hugepage allocation + * or demotion can be prevented by PageHWPoison flag. + */ + if (res == 0) { + unlock_page(head); + res = MF_FAILED; + if (__page_handle_poison(p)) { + page_ref_inc(p); + res = MF_RECOVERED; } + action_result(pfn, MF_MSG_FREE_HUGE, res); + return res == MF_RECOVERED ? 0 : -EBUSY; } - lock_page(head); - /* * The page could have changed compound pages due to race window. * If this happens just bail out. @@ -1554,14 +1617,6 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) page_flags = head->flags; - if (hwpoison_filter(p)) { - if (TestClearPageHWPoison(head)) - num_poisoned_pages_dec(); - put_page(p); - res = -EOPNOTSUPP; - goto out; - } - /* * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so * simply disable it. In order to make it work properly, we need @@ -1588,6 +1643,12 @@ out: unlock_page(head); return res; } +#else +static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb) +{ + return 0; +} +#endif static int memory_failure_dev_pagemap(unsigned long pfn, int flags, struct dev_pagemap *pgmap) @@ -1712,6 +1773,7 @@ int memory_failure(unsigned long pfn, int flags) int res = 0; unsigned long page_flags; bool retry = true; + int hugetlb = 0; if (!sysctl_memory_failure_recovery) panic("Memory failure on page %lx", pfn); @@ -1739,10 +1801,9 @@ int memory_failure(unsigned long pfn, int flags) } try_again: - if (PageHuge(p)) { - res = memory_failure_hugetlb(pfn, flags); + res = try_memory_failure_hugetlb(pfn, flags, &hugetlb); + if (hugetlb) goto unlock_mutex; - } if (TestSetPageHWPoison(p)) { pr_err("Memory failure: %#lx: already hardware poisoned\n", @@ -1800,6 +1861,19 @@ try_again: if (PageTransHuge(hpage)) { /* + * Bail out before SetPageHasHWPoisoned() if hpage is + * huge_zero_page, although PG_has_hwpoisoned is not + * checked in set_huge_zero_page(). + * + * TODO: Handle memory failure of huge_zero_page thoroughly. + */ + if (is_huge_zero_page(hpage)) { + action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); + res = -EBUSY; + goto unlock_mutex; + } + + /* * The flag must be set after the refcount is bumped * otherwise it may race with THP split. * And the flag can't be set in get_hwpoison_page() since diff --git a/mm/mmap.c b/mm/mmap.c index 3aa839f81e63..313b57d55a63 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2117,14 +2117,6 @@ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info) return addr; } -#ifndef arch_get_mmap_end -#define arch_get_mmap_end(addr) (TASK_SIZE) -#endif - -#ifndef arch_get_mmap_base -#define arch_get_mmap_base(addr, base) (base) -#endif - /* Get an address range which is currently unmapped. * For shmat() with addr=0. * diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 459d195d2ff6..f45ff1b7626a 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -1036,6 +1036,18 @@ int mmu_interval_notifier_insert_locked( } EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked); +static bool +mmu_interval_seq_released(struct mmu_notifier_subscriptions *subscriptions, + unsigned long seq) +{ + bool ret; + + spin_lock(&subscriptions->lock); + ret = subscriptions->invalidate_seq != seq; + spin_unlock(&subscriptions->lock); + return ret; +} + /** * mmu_interval_notifier_remove - Remove a interval notifier * @interval_sub: Interval subscription to unregister @@ -1083,7 +1095,7 @@ void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub) lock_map_release(&__mmu_notifier_invalidate_range_start_map); if (seq) wait_event(subscriptions->wq, - READ_ONCE(subscriptions->invalidate_seq) != seq); + mmu_interval_seq_released(subscriptions, seq)); /* pairs with mmgrab in mmu_interval_notifier_insert() */ mmdrop(mm); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7ec38194f8e1..49d7df39b02d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -632,7 +632,7 @@ done: */ set_bit(MMF_OOM_SKIP, &mm->flags); - /* Drop a reference taken by wake_oom_reaper */ + /* Drop a reference taken by queue_oom_reaper */ put_task_struct(tsk); } @@ -644,12 +644,12 @@ static int oom_reaper(void *unused) struct task_struct *tsk = NULL; wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL); - spin_lock(&oom_reaper_lock); + spin_lock_irq(&oom_reaper_lock); if (oom_reaper_list != NULL) { tsk = oom_reaper_list; oom_reaper_list = tsk->oom_reaper_list; } - spin_unlock(&oom_reaper_lock); + spin_unlock_irq(&oom_reaper_lock); if (tsk) oom_reap_task(tsk); @@ -658,22 +658,48 @@ static int oom_reaper(void *unused) return 0; } -static void wake_oom_reaper(struct task_struct *tsk) +static void wake_oom_reaper(struct timer_list *timer) { - /* mm is already queued? */ - if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) - return; + struct task_struct *tsk = container_of(timer, struct task_struct, + oom_reaper_timer); + struct mm_struct *mm = tsk->signal->oom_mm; + unsigned long flags; - get_task_struct(tsk); + /* The victim managed to terminate on its own - see exit_mmap */ + if (test_bit(MMF_OOM_SKIP, &mm->flags)) { + put_task_struct(tsk); + return; + } - spin_lock(&oom_reaper_lock); + spin_lock_irqsave(&oom_reaper_lock, flags); tsk->oom_reaper_list = oom_reaper_list; oom_reaper_list = tsk; - spin_unlock(&oom_reaper_lock); + spin_unlock_irqrestore(&oom_reaper_lock, flags); trace_wake_reaper(tsk->pid); wake_up(&oom_reaper_wait); } +/* + * Give the OOM victim time to exit naturally before invoking the oom_reaping. + * The timers timeout is arbitrary... the longer it is, the longer the worst + * case scenario for the OOM can take. If it is too small, the oom_reaper can + * get in the way and release resources needed by the process exit path. + * e.g. The futex robust list can sit in Anon|Private memory that gets reaped + * before the exit path is able to wake the futex waiters. + */ +#define OOM_REAPER_DELAY (2*HZ) +static void queue_oom_reaper(struct task_struct *tsk) +{ + /* mm is already queued? */ + if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) + return; + + get_task_struct(tsk); + timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0); + tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY; + add_timer(&tsk->oom_reaper_timer); +} + static int __init oom_init(void) { oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); @@ -681,7 +707,7 @@ static int __init oom_init(void) } subsys_initcall(oom_init) #else -static inline void wake_oom_reaper(struct task_struct *tsk) +static inline void queue_oom_reaper(struct task_struct *tsk) { } #endif /* CONFIG_MMU */ @@ -932,7 +958,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) rcu_read_unlock(); if (can_oom_reap) - wake_oom_reaper(victim); + queue_oom_reaper(victim); mmdrop(mm); put_task_struct(victim); @@ -968,7 +994,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) task_lock(victim); if (task_will_free_mem(victim)) { mark_oom_victim(victim); - wake_oom_reaper(victim); + queue_oom_reaper(victim); task_unlock(victim); put_task_struct(victim); return; @@ -1067,7 +1093,7 @@ bool out_of_memory(struct oom_control *oc) */ if (task_will_free_mem(current)) { mark_oom_victim(current); - wake_oom_reaper(current); + queue_oom_reaper(current); return true; } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 0cb8e5ef1713..e9bb6db002aa 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -72,12 +72,15 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, _dst_pte = pte_mkdirty(_dst_pte); if (page_in_cache && !vm_shared) writable = false; - if (writable) { - if (wp_copy) - _dst_pte = pte_mkuffd_wp(_dst_pte); - else - _dst_pte = pte_mkwrite(_dst_pte); - } + + /* + * Always mark a PTE as write-protected when needed, regardless of + * VM_WRITE, which the user might change. + */ + if (wp_copy) + _dst_pte = pte_mkuffd_wp(_dst_pte); + else if (writable) + _dst_pte = pte_mkwrite(_dst_pte); dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); diff --git a/mm/workingset.c b/mm/workingset.c index 8a3828acc0bf..592569a8974c 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -355,7 +355,7 @@ void workingset_refault(struct folio *folio, void *shadow) mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats_delayed(); /* * Compare the distance to the existing workingset size. We * don't activate pages that couldn't stay resident even if |