diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/madvise.c | 12 | ||||
-rw-r--r-- | mm/memcontrol.c | 103 | ||||
-rw-r--r-- | mm/mmu_notifier.c | 27 | ||||
-rw-r--r-- | mm/nommu.c | 10 | ||||
-rw-r--r-- | mm/slub.c | 41 | ||||
-rw-r--r-- | mm/sparse.c | 8 | ||||
-rw-r--r-- | mm/vmalloc.c | 11 |
7 files changed, 143 insertions, 69 deletions
diff --git a/mm/madvise.c b/mm/madvise.c index 43b47d3fae02..4bb30ed6c8d2 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -335,12 +335,14 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, } page = pmd_page(orig_pmd); + + /* Do not interfere with other mappings of this page */ + if (page_mapcount(page) != 1) + goto huge_unlock; + if (next - addr != HPAGE_PMD_SIZE) { int err; - if (page_mapcount(page) != 1) - goto huge_unlock; - get_page(page); spin_unlock(ptl); lock_page(page); @@ -426,6 +428,10 @@ regular_page: continue; } + /* Do not interfere with other mappings of this page */ + if (page_mapcount(page) != 1) + continue; + VM_BUG_ON_PAGE(PageTransCompound(page), page); if (pte_young(ptent)) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2058b8da18db..7a4bd8b9adc2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2297,28 +2297,41 @@ static void high_work_func(struct work_struct *work) #define MEMCG_DELAY_SCALING_SHIFT 14 /* - * Scheduled by try_charge() to be executed from the userland return path - * and reclaims memory over the high limit. + * Get the number of jiffies that we should penalise a mischievous cgroup which + * is exceeding its memory.high by checking both it and its ancestors. */ -void mem_cgroup_handle_over_high(void) +static unsigned long calculate_high_delay(struct mem_cgroup *memcg, + unsigned int nr_pages) { - unsigned long usage, high, clamped_high; - unsigned long pflags; - unsigned long penalty_jiffies, overage; - unsigned int nr_pages = current->memcg_nr_pages_over_high; - struct mem_cgroup *memcg; + unsigned long penalty_jiffies; + u64 max_overage = 0; - if (likely(!nr_pages)) - return; + do { + unsigned long usage, high; + u64 overage; - memcg = get_mem_cgroup_from_mm(current->mm); - reclaim_high(memcg, nr_pages, GFP_KERNEL); - current->memcg_nr_pages_over_high = 0; + usage = page_counter_read(&memcg->memory); + high = READ_ONCE(memcg->high); + + /* + * Prevent division by 0 in overage calculation by acting as if + * it was a threshold of 1 page + */ + high = max(high, 1UL); + + overage = usage - high; + overage <<= MEMCG_DELAY_PRECISION_SHIFT; + overage = div64_u64(overage, high); + + if (overage > max_overage) + max_overage = overage; + } while ((memcg = parent_mem_cgroup(memcg)) && + !mem_cgroup_is_root(memcg)); + + if (!max_overage) + return 0; /* - * memory.high is breached and reclaim is unable to keep up. Throttle - * allocators proactively to slow down excessive growth. - * * We use overage compared to memory.high to calculate the number of * jiffies to sleep (penalty_jiffies). Ideally this value should be * fairly lenient on small overages, and increasingly harsh when the @@ -2326,24 +2339,9 @@ void mem_cgroup_handle_over_high(void) * its crazy behaviour, so we exponentially increase the delay based on * overage amount. */ - - usage = page_counter_read(&memcg->memory); - high = READ_ONCE(memcg->high); - - if (usage <= high) - goto out; - - /* - * Prevent division by 0 in overage calculation by acting as if it was a - * threshold of 1 page - */ - clamped_high = max(high, 1UL); - - overage = div_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT, - clamped_high); - - penalty_jiffies = ((u64)overage * overage * HZ) - >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT); + penalty_jiffies = max_overage * max_overage * HZ; + penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT; + penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT; /* * Factor in the task's own contribution to the overage, such that four @@ -2360,7 +2358,32 @@ void mem_cgroup_handle_over_high(void) * application moving forwards and also permit diagnostics, albeit * extremely slowly. */ - penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES); + return min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES); +} + +/* + * Scheduled by try_charge() to be executed from the userland return path + * and reclaims memory over the high limit. + */ +void mem_cgroup_handle_over_high(void) +{ + unsigned long penalty_jiffies; + unsigned long pflags; + unsigned int nr_pages = current->memcg_nr_pages_over_high; + struct mem_cgroup *memcg; + + if (likely(!nr_pages)) + return; + + memcg = get_mem_cgroup_from_mm(current->mm); + reclaim_high(memcg, nr_pages, GFP_KERNEL); + current->memcg_nr_pages_over_high = 0; + + /* + * memory.high is breached and reclaim is unable to keep up. Throttle + * allocators proactively to slow down excessive growth. + */ + penalty_jiffies = calculate_high_delay(memcg, nr_pages); /* * Don't sleep if the amount of jiffies this memcg owes us is so low @@ -4027,7 +4050,7 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; unsigned long usage; - int i, j, size; + int i, j, size, entries; mutex_lock(&memcg->thresholds_lock); @@ -4047,14 +4070,20 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, __mem_cgroup_threshold(memcg, type == _MEMSWAP); /* Calculate new number of threshold */ - size = 0; + size = entries = 0; for (i = 0; i < thresholds->primary->size; i++) { if (thresholds->primary->entries[i].eventfd != eventfd) size++; + else + entries++; } new = thresholds->spare; + /* If no items related to eventfd have been cleared, nothing to do */ + if (!entries) + goto unlock; + /* Set thresholds array to NULL if we don't have thresholds */ if (!size) { kfree(new); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index ef3973a5d34a..06852b896fa6 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -307,7 +307,8 @@ static void mn_hlist_release(struct mmu_notifier_subscriptions *subscriptions, * ->release returns. */ id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist) + hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist, + srcu_read_lock_held(&srcu)) /* * If ->release runs before mmu_notifier_unregister it must be * handled, as it's the only way for the driver to flush all @@ -370,7 +371,8 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(subscription, - &mm->notifier_subscriptions->list, hlist) { + &mm->notifier_subscriptions->list, hlist, + srcu_read_lock_held(&srcu)) { if (subscription->ops->clear_flush_young) young |= subscription->ops->clear_flush_young( subscription, mm, start, end); @@ -389,7 +391,8 @@ int __mmu_notifier_clear_young(struct mm_struct *mm, id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(subscription, - &mm->notifier_subscriptions->list, hlist) { + &mm->notifier_subscriptions->list, hlist, + srcu_read_lock_held(&srcu)) { if (subscription->ops->clear_young) young |= subscription->ops->clear_young(subscription, mm, start, end); @@ -407,7 +410,8 @@ int __mmu_notifier_test_young(struct mm_struct *mm, id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(subscription, - &mm->notifier_subscriptions->list, hlist) { + &mm->notifier_subscriptions->list, hlist, + srcu_read_lock_held(&srcu)) { if (subscription->ops->test_young) { young = subscription->ops->test_young(subscription, mm, address); @@ -428,7 +432,8 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(subscription, - &mm->notifier_subscriptions->list, hlist) { + &mm->notifier_subscriptions->list, hlist, + srcu_read_lock_held(&srcu)) { if (subscription->ops->change_pte) subscription->ops->change_pte(subscription, mm, address, pte); @@ -476,7 +481,8 @@ static int mn_hlist_invalidate_range_start( int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist) { + hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist, + srcu_read_lock_held(&srcu)) { const struct mmu_notifier_ops *ops = subscription->ops; if (ops->invalidate_range_start) { @@ -528,7 +534,8 @@ mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions, int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist) { + hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist, + srcu_read_lock_held(&srcu)) { /* * Call invalidate_range here too to avoid the need for the * subsystem of having to register an invalidate_range_end @@ -582,7 +589,8 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm, id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(subscription, - &mm->notifier_subscriptions->list, hlist) { + &mm->notifier_subscriptions->list, hlist, + srcu_read_lock_held(&srcu)) { if (subscription->ops->invalidate_range) subscription->ops->invalidate_range(subscription, mm, start, end); @@ -714,7 +722,8 @@ find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops) spin_lock(&mm->notifier_subscriptions->lock); hlist_for_each_entry_rcu(subscription, - &mm->notifier_subscriptions->list, hlist) { + &mm->notifier_subscriptions->list, hlist, + lockdep_is_held(&mm->notifier_subscriptions->lock)) { if (subscription->ops != ops) continue; diff --git a/mm/nommu.c b/mm/nommu.c index bd2b4e5ef144..318df4e236c9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -370,10 +370,14 @@ void vm_unmap_aliases(void) EXPORT_SYMBOL_GPL(vm_unmap_aliases); /* - * Implement a stub for vmalloc_sync_all() if the architecture chose not to - * have one. + * Implement a stub for vmalloc_sync_[un]mapping() if the architecture + * chose not to have one. */ -void __weak vmalloc_sync_all(void) +void __weak vmalloc_sync_mappings(void) +{ +} + +void __weak vmalloc_sync_unmappings(void) { } diff --git a/mm/slub.c b/mm/slub.c index 17dc00e33115..6589b41d5a60 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1973,8 +1973,6 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, if (node == NUMA_NO_NODE) searchnode = numa_mem_id(); - else if (!node_present_pages(node)) - searchnode = node_to_mem_node(node); object = get_partial_node(s, get_node(s, searchnode), c, flags); if (object || node != NUMA_NO_NODE) @@ -2563,17 +2561,27 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, struct page *page; page = c->page; - if (!page) + if (!page) { + /* + * if the node is not online or has no normal memory, just + * ignore the node constraint + */ + if (unlikely(node != NUMA_NO_NODE && + !node_state(node, N_NORMAL_MEMORY))) + node = NUMA_NO_NODE; goto new_slab; + } redo: if (unlikely(!node_match(page, node))) { - int searchnode = node; - - if (node != NUMA_NO_NODE && !node_present_pages(node)) - searchnode = node_to_mem_node(node); - - if (unlikely(!node_match(page, searchnode))) { + /* + * same as above but node_match() being false already + * implies node != NUMA_NO_NODE + */ + if (!node_state(node, N_NORMAL_MEMORY)) { + node = NUMA_NO_NODE; + goto redo; + } else { stat(s, ALLOC_NODE_MISMATCH); deactivate_slab(s, page, c->freelist, c); goto new_slab; @@ -2997,11 +3005,13 @@ redo: barrier(); if (likely(page == c->page)) { - set_freepointer(s, tail_obj, c->freelist); + void **freelist = READ_ONCE(c->freelist); + + set_freepointer(s, tail_obj, freelist); if (unlikely(!this_cpu_cmpxchg_double( s->cpu_slab->freelist, s->cpu_slab->tid, - c->freelist, tid, + freelist, tid, head, next_tid(tid)))) { note_cmpxchg_failure("slab_free", s, tid); @@ -3175,6 +3185,15 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, if (unlikely(!object)) { /* + * We may have removed an object from c->freelist using + * the fastpath in the previous iteration; in that case, + * c->tid has not been bumped yet. + * Since ___slab_alloc() may reenable interrupts while + * allocating memory, we should bump c->tid now. + */ + c->tid = next_tid(c->tid); + + /* * Invoking slow path likely have side-effect * of re-populating per CPU c->freelist */ diff --git a/mm/sparse.c b/mm/sparse.c index 596b2a45b100..aadb7298dcef 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -734,6 +734,7 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, struct mem_section *ms = __pfn_to_section(pfn); bool section_is_early = early_section(ms); struct page *memmap = NULL; + bool empty; unsigned long *subsection_map = ms->usage ? &ms->usage->subsection_map[0] : NULL; @@ -764,7 +765,8 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified */ bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION); - if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) { + empty = bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION); + if (empty) { unsigned long section_nr = pfn_to_section_nr(pfn); /* @@ -779,13 +781,15 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, ms->usage = NULL; } memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); - ms->section_mem_map = (unsigned long)NULL; } if (section_is_early && memmap) free_map_bootmem(memmap); else depopulate_section_memmap(pfn, nr_pages, altmap); + + if (empty) + ms->section_mem_map = (unsigned long)NULL; } static struct page * __meminit section_activate(int nid, unsigned long pfn, diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1f46c3b86f9f..6b8eeb0ecee5 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1295,7 +1295,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) * First make sure the mappings are removed from all page-tables * before they are freed. */ - vmalloc_sync_all(); + vmalloc_sync_unmappings(); /* * TODO: to calculate a flush range without looping. @@ -3128,16 +3128,19 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, EXPORT_SYMBOL(remap_vmalloc_range); /* - * Implement a stub for vmalloc_sync_all() if the architecture chose not to - * have one. + * Implement stubs for vmalloc_sync_[un]mappings () if the architecture chose + * not to have one. * * The purpose of this function is to make sure the vmalloc area * mappings are identical in all page-tables in the system. */ -void __weak vmalloc_sync_all(void) +void __weak vmalloc_sync_mappings(void) { } +void __weak vmalloc_sync_unmappings(void) +{ +} static int f(pte_t *pte, unsigned long addr, void *data) { |