diff options
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 213 |
1 files changed, 88 insertions, 125 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8dab23a71fc4..508bcea7df56 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -103,11 +103,6 @@ static bool do_memsw_account(void) return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap; } -/* memcg and lruvec stats flushing */ -static void flush_memcg_stats_dwork(struct work_struct *w); -static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); -static DEFINE_SPINLOCK(stats_flush_lock); - #define THRESHOLDS_EVENTS_TARGET 128 #define SOFTLIMIT_EVENTS_TARGET 1024 @@ -239,7 +234,7 @@ enum res_type { iter != NULL; \ iter = mem_cgroup_iter(NULL, iter, NULL)) -static inline bool should_force_charge(void) +static inline bool task_is_dying(void) { return tsk_is_oom_victim(current) || fatal_signal_pending(current) || (current->flags & PF_EXITING); @@ -613,6 +608,58 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) return mz; } +/* + * memcg and lruvec stats flushing + * + * Many codepaths leading to stats update or read are performance sensitive and + * adding stats flushing in such codepaths is not desirable. So, to optimize the + * flushing the kernel does: + * + * 1) Periodically and asynchronously flush the stats every 2 seconds to not let + * rstat update tree grow unbounded. + * + * 2) Flush the stats synchronously on reader side only when there are more than + * (MEMCG_CHARGE_BATCH * nr_cpus) update events. Though this optimization + * will let stats be out of sync by atmost (MEMCG_CHARGE_BATCH * nr_cpus) but + * only for 2 seconds due to (1). + */ +static void flush_memcg_stats_dwork(struct work_struct *w); +static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); +static DEFINE_SPINLOCK(stats_flush_lock); +static DEFINE_PER_CPU(unsigned int, stats_updates); +static atomic_t stats_flush_threshold = ATOMIC_INIT(0); + +static inline void memcg_rstat_updated(struct mem_cgroup *memcg) +{ + cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); + if (!(__this_cpu_inc_return(stats_updates) % MEMCG_CHARGE_BATCH)) + atomic_inc(&stats_flush_threshold); +} + +static void __mem_cgroup_flush_stats(void) +{ + unsigned long flag; + + if (!spin_trylock_irqsave(&stats_flush_lock, flag)) + return; + + cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup); + atomic_set(&stats_flush_threshold, 0); + spin_unlock_irqrestore(&stats_flush_lock, flag); +} + +void mem_cgroup_flush_stats(void) +{ + if (atomic_read(&stats_flush_threshold) > num_online_cpus()) + __mem_cgroup_flush_stats(); +} + +static void flush_memcg_stats_dwork(struct work_struct *w) +{ + mem_cgroup_flush_stats(); + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); +} + /** * __mod_memcg_state - update cgroup memory statistics * @memcg: the memory cgroup @@ -625,7 +672,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) return; __this_cpu_add(memcg->vmstats_percpu->state[idx], val); - cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); + memcg_rstat_updated(memcg); } /* idx can be of type enum memcg_stat_item or node_stat_item. */ @@ -653,10 +700,12 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, memcg = pn->memcg; /* Update memcg */ - __mod_memcg_state(memcg, idx, val); + __this_cpu_add(memcg->vmstats_percpu->state[idx], val); /* Update lruvec */ __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); + + memcg_rstat_updated(memcg); } /** @@ -758,7 +807,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, return; __this_cpu_add(memcg->vmstats_percpu->events[idx], count); - cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); + memcg_rstat_updated(memcg); } static unsigned long memcg_events(struct mem_cgroup *memcg, int event) @@ -1415,7 +1464,7 @@ static char *memory_stat_format(struct mem_cgroup *memcg) * * Current memory state: */ - cgroup_rstat_flush(memcg->css.cgroup); + mem_cgroup_flush_stats(); for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { u64 size; @@ -1576,7 +1625,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, * A few threads which were not waiting at mutex_lock_killable() can * fail to bail out. Therefore, check again after holding oom_lock. */ - ret = should_force_charge() || out_of_memory(&oc); + ret = task_is_dying() || out_of_memory(&oc); unlock: mutex_unlock(&oom_lock); @@ -2544,6 +2593,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, struct page_counter *counter; enum oom_status oom_status; unsigned long nr_reclaimed; + bool passed_oom = false; bool may_swap = true; bool drained = false; unsigned long pflags; @@ -2579,15 +2629,6 @@ retry: goto force; /* - * Unlike in global OOM situations, memcg is not in a physical - * memory shortage. Allow dying and OOM-killed tasks to - * bypass the last charges so that they can exit quickly and - * free their memory. - */ - if (unlikely(should_force_charge())) - goto force; - - /* * Prevent unbounded recursion when reclaim operations need to * allocate memory. This might exceed the limits temporarily, * but we prefer facilitating memory reclaim and getting back @@ -2644,8 +2685,9 @@ retry: if (gfp_mask & __GFP_RETRY_MAYFAIL) goto nomem; - if (fatal_signal_pending(current)) - goto force; + /* Avoid endless loop for tasks bypassed by the oom killer */ + if (passed_oom && task_is_dying()) + goto nomem; /* * keep retrying as long as the memcg oom killer is able to make @@ -2654,14 +2696,10 @@ retry: */ oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages * PAGE_SIZE)); - switch (oom_status) { - case OOM_SUCCESS: + if (oom_status == OOM_SUCCESS) { + passed_oom = true; nr_retries = MAX_RECLAIM_RETRIES; goto retry; - case OOM_FAILED: - goto force; - default: - goto nomem; } nomem: if (!(gfp_mask & __GFP_NOFAIL)) @@ -2736,8 +2774,7 @@ static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, return try_charge_memcg(memcg, gfp_mask, nr_pages); } -#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU) -static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) +static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { if (mem_cgroup_is_root(memcg)) return; @@ -2746,7 +2783,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); } -#endif static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) { @@ -2965,7 +3001,6 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, unsigned int nr_pages) { - struct page_counter *counter; struct mem_cgroup *memcg; int ret; @@ -2975,21 +3010,8 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, if (ret) goto out; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && - !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) { - - /* - * Enforce __GFP_NOFAIL allocation because callers are not - * prepared to see failures and likely do not have any failure - * handling code. - */ - if (gfp & __GFP_NOFAIL) { - page_counter_charge(&memcg->kmem, nr_pages); - goto out; - } - cancel_charge(memcg, nr_pages); - ret = -ENOMEM; - } + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + page_counter_charge(&memcg->kmem, nr_pages); out: css_put(&memcg->css); @@ -3481,19 +3503,11 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) /* try to free all pages in this cgroup */ while (nr_retries && page_counter_read(&memcg->memory)) { - int progress; - if (signal_pending(current)) return -EINTR; - progress = try_to_free_mem_cgroup_pages(memcg, 1, - GFP_KERNEL, true); - if (!progress) { + if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true)) nr_retries--; - /* maybe some writeback is necessary */ - congestion_wait(BLK_RW_ASYNC, HZ/10); - } - } return 0; @@ -3534,8 +3548,7 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) unsigned long val; if (mem_cgroup_is_root(memcg)) { - /* mem_cgroup_threshold() calls here from irqsafe context */ - cgroup_rstat_flush_irqsafe(memcg->css.cgroup); + mem_cgroup_flush_stats(); val = memcg_page_state(memcg, NR_FILE_PAGES) + memcg_page_state(memcg, NR_ANON_MAPPED); if (swap) @@ -3610,7 +3623,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) return 0; BUG_ON(memcg->kmemcg_id >= 0); - BUG_ON(memcg->kmem_state); memcg_id = memcg_alloc_cache_id(); if (memcg_id < 0) @@ -3627,22 +3639,18 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) static_branch_enable(&memcg_kmem_enabled_key); memcg->kmemcg_id = memcg_id; - memcg->kmem_state = KMEM_ONLINE; return 0; } static void memcg_offline_kmem(struct mem_cgroup *memcg) { - struct cgroup_subsys_state *css; - struct mem_cgroup *parent, *child; + struct mem_cgroup *parent; int kmemcg_id; - if (memcg->kmem_state != KMEM_ONLINE) + if (memcg->kmemcg_id == -1) return; - memcg->kmem_state = KMEM_ALLOCATED; - parent = parent_mem_cgroup(memcg); if (!parent) parent = root_mem_cgroup; @@ -3653,31 +3661,15 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) BUG_ON(kmemcg_id < 0); /* - * Change kmemcg_id of this cgroup and all its descendants to the - * parent's id, and then move all entries from this cgroup's list_lrus - * to ones of the parent. After we have finished, all list_lrus - * corresponding to this cgroup are guaranteed to remain empty. The - * ordering is imposed by list_lru_node->lock taken by + * After we have finished memcg_reparent_objcgs(), all list_lrus + * corresponding to this cgroup are guaranteed to remain empty. + * The ordering is imposed by list_lru_node->lock taken by * memcg_drain_all_list_lrus(). */ - rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */ - css_for_each_descendant_pre(css, &memcg->css) { - child = mem_cgroup_from_css(css); - BUG_ON(child->kmemcg_id != kmemcg_id); - child->kmemcg_id = parent->kmemcg_id; - } - rcu_read_unlock(); - memcg_drain_all_list_lrus(kmemcg_id, parent); memcg_free_cache_id(kmemcg_id); -} - -static void memcg_free_kmem(struct mem_cgroup *memcg) -{ - /* css_alloc() failed, offlining didn't happen */ - if (unlikely(memcg->kmem_state == KMEM_ONLINE)) - memcg_offline_kmem(memcg); + memcg->kmemcg_id = -1; } #else static int memcg_online_kmem(struct mem_cgroup *memcg) @@ -3687,22 +3679,8 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) static void memcg_offline_kmem(struct mem_cgroup *memcg) { } -static void memcg_free_kmem(struct mem_cgroup *memcg) -{ -} #endif /* CONFIG_MEMCG_KMEM */ -static int memcg_update_kmem_max(struct mem_cgroup *memcg, - unsigned long max) -{ - int ret; - - mutex_lock(&memcg_max_mutex); - ret = page_counter_set_max(&memcg->kmem, max); - mutex_unlock(&memcg_max_mutex); - return ret; -} - static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) { int ret; @@ -3768,10 +3746,8 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, ret = mem_cgroup_resize_max(memcg, nr_pages, true); break; case _KMEM: - pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. " - "Please report your usecase to linux-mm@kvack.org if you " - "depend on this functionality.\n"); - ret = memcg_update_kmem_max(memcg, nr_pages); + /* kmem.limit_in_bytes is deprecated. */ + ret = -EOPNOTSUPP; break; case _TCP: ret = memcg_update_tcp_max(memcg, nr_pages); @@ -3916,7 +3892,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) int nid; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - cgroup_rstat_flush(memcg->css.cgroup); + mem_cgroup_flush_stats(); for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { seq_printf(m, "%s=%lu", stat->name, @@ -3988,7 +3964,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); - cgroup_rstat_flush(memcg->css.cgroup); + mem_cgroup_flush_stats(); for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { unsigned long nr; @@ -4491,7 +4467,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - cgroup_rstat_flush_irqsafe(memcg->css.cgroup); + mem_cgroup_flush_stats(); *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); @@ -5324,7 +5300,9 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) cancel_work_sync(&memcg->high_work); mem_cgroup_remove_from_trees(memcg); free_shrinker_info(memcg); - memcg_free_kmem(memcg); + + /* Need to offline kmem if online_css() fails */ + memcg_offline_kmem(memcg); mem_cgroup_free(memcg); } @@ -5357,21 +5335,6 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) memcg_wb_domain_size_changed(memcg); } -void mem_cgroup_flush_stats(void) -{ - if (!spin_trylock(&stats_flush_lock)) - return; - - cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup); - spin_unlock(&stats_flush_lock); -} - -static void flush_memcg_stats_dwork(struct work_struct *w) -{ - mem_cgroup_flush_stats(); - queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); -} - static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); @@ -5561,7 +5524,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, #endif static struct page *mc_handle_file_pte(struct vm_area_struct *vma, - unsigned long addr, pte_t ptent, swp_entry_t *entry) + unsigned long addr, pte_t ptent) { if (!vma->vm_file) /* anonymous vma */ return NULL; @@ -5736,7 +5699,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, else if (is_swap_pte(ptent)) page = mc_handle_swap_pte(vma, ptent, &ent); else if (pte_none(ptent)) - page = mc_handle_file_pte(vma, addr, ptent, &ent); + page = mc_handle_file_pte(vma, addr, ptent); if (!page && !ent.val) return ret; @@ -6391,7 +6354,7 @@ static int memory_numa_stat_show(struct seq_file *m, void *v) int i; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - cgroup_rstat_flush(memcg->css.cgroup); + mem_cgroup_flush_stats(); for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { int nid; |