diff options
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 490 |
1 files changed, 256 insertions, 234 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d563fb515766..7845c64a2c57 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -25,6 +25,7 @@ * Copyright (C) 2020 Alibaba, Inc, Alex Shi */ +#include <linux/cgroup-defs.h> #include <linux/page_counter.h> #include <linux/memcontrol.h> #include <linux/cgroup.h> @@ -41,6 +42,7 @@ #include <linux/rcupdate.h> #include <linux/limits.h> #include <linux/export.h> +#include <linux/list.h> #include <linux/mutex.h> #include <linux/rbtree.h> #include <linux/slab.h> @@ -93,9 +95,6 @@ static bool cgroup_memory_nobpf __ro_after_init; static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); #endif -#define THRESHOLDS_EVENTS_TARGET 128 -#define SOFTLIMIT_EVENTS_TARGET 1024 - static inline bool task_is_dying(void) { return tsk_is_oom_victim(current) || fatal_signal_pending(current) || @@ -305,6 +304,12 @@ static const unsigned int memcg_node_stat_items[] = { #ifdef CONFIG_SWAP NR_SWAPCACHE, #endif +#ifdef CONFIG_NUMA_BALANCING + PGPROMOTE_SUCCESS, +#endif + PGDEMOTE_KSWAPD, + PGDEMOTE_DIRECT, + PGDEMOTE_KHUGEPAGED, }; static const unsigned int memcg_stat_items[] = { @@ -320,24 +325,27 @@ static const unsigned int memcg_stat_items[] = { #define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items) #define MEMCG_VMSTAT_SIZE (NR_MEMCG_NODE_STAT_ITEMS + \ ARRAY_SIZE(memcg_stat_items)) -static int8_t mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly; +#define BAD_STAT_IDX(index) ((u32)(index) >= U8_MAX) +static u8 mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly; static void init_memcg_stats(void) { - int8_t i, j = 0; + u8 i, j = 0; + + BUILD_BUG_ON(MEMCG_NR_STAT >= U8_MAX); - BUILD_BUG_ON(MEMCG_NR_STAT >= S8_MAX); + memset(mem_cgroup_stats_index, U8_MAX, sizeof(mem_cgroup_stats_index)); - for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i) - mem_cgroup_stats_index[memcg_node_stat_items[i]] = ++j; + for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i, ++j) + mem_cgroup_stats_index[memcg_node_stat_items[i]] = j; - for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i) - mem_cgroup_stats_index[memcg_stat_items[i]] = ++j; + for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i, ++j) + mem_cgroup_stats_index[memcg_stat_items[i]] = j; } static inline int memcg_stats_index(int idx) { - return mem_cgroup_stats_index[idx] - 1; + return mem_cgroup_stats_index[idx]; } struct lruvec_stats_percpu { @@ -369,7 +377,7 @@ unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) return node_page_state(lruvec_pgdat(lruvec), idx); i = memcg_stats_index(idx); - if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return 0; pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); @@ -392,7 +400,7 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec, return node_page_state(lruvec_pgdat(lruvec), idx); i = memcg_stats_index(idx); - if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return 0; pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); @@ -406,8 +414,10 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec, /* Subset of vm_event_item to report for memcg event stats */ static const unsigned int memcg_vm_event_stat[] = { +#ifdef CONFIG_MEMCG_V1 PGPGIN, PGPGOUT, +#endif PGSCAN_KSWAPD, PGSCAN_DIRECT, PGSCAN_KHUGEPAGED, @@ -432,24 +442,32 @@ static const unsigned int memcg_vm_event_stat[] = { THP_SWPOUT, THP_SWPOUT_FALLBACK, #endif +#ifdef CONFIG_NUMA_BALANCING + NUMA_PAGE_MIGRATE, + NUMA_PTE_UPDATES, + NUMA_HINT_FAULTS, +#endif }; #define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat) -static int8_t mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly; +static u8 mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly; static void init_memcg_events(void) { - int8_t i; + u8 i; - BUILD_BUG_ON(NR_VM_EVENT_ITEMS >= S8_MAX); + BUILD_BUG_ON(NR_VM_EVENT_ITEMS >= U8_MAX); + + memset(mem_cgroup_events_index, U8_MAX, + sizeof(mem_cgroup_events_index)); for (i = 0; i < NR_MEMCG_EVENTS; ++i) - mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1; + mem_cgroup_events_index[memcg_vm_event_stat[i]] = i; } static inline int memcg_events_index(enum vm_event_item idx) { - return mem_cgroup_events_index[idx] - 1; + return mem_cgroup_events_index[idx]; } struct memcg_vmstats_percpu { @@ -469,10 +487,6 @@ struct memcg_vmstats_percpu { /* Delta calculation for lockless upward propagation */ long state_prev[MEMCG_VMSTAT_SIZE]; unsigned long events_prev[NR_MEMCG_EVENTS]; - - /* Cgroup1: threshold notifications & softlimit tree updates */ - unsigned long nr_page_events; - unsigned long targets[MEM_CGROUP_NTARGETS]; } ____cacheline_aligned; struct memcg_vmstats { @@ -621,7 +635,7 @@ unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) long x; int i = memcg_stats_index(idx); - if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return 0; x = READ_ONCE(memcg->vmstats->state[i]); @@ -662,7 +676,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, if (mem_cgroup_disabled()) return; - if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return; __this_cpu_add(memcg->vmstats_percpu->state[i], val); @@ -675,7 +689,7 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) long x; int i = memcg_stats_index(idx); - if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return 0; x = READ_ONCE(memcg->vmstats->state_local[i]); @@ -694,7 +708,7 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec, struct mem_cgroup *memcg; int i = memcg_stats_index(idx); - if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return; pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); @@ -810,7 +824,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, if (mem_cgroup_disabled()) return; - if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return; memcg_stats_lock(); @@ -823,7 +837,7 @@ unsigned long memcg_events(struct mem_cgroup *memcg, int event) { int i = memcg_events_index(event); - if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, event)) + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, event)) return 0; return READ_ONCE(memcg->vmstats->events[i]); @@ -833,50 +847,12 @@ unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) { int i = memcg_events_index(event); - if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, event)) + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, event)) return 0; return READ_ONCE(memcg->vmstats->events_local[i]); } -void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, int nr_pages) -{ - /* pagein of a big page is an event. So, ignore page size */ - if (nr_pages > 0) - __count_memcg_events(memcg, PGPGIN, 1); - else { - __count_memcg_events(memcg, PGPGOUT, 1); - nr_pages = -nr_pages; /* for event */ - } - - __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages); -} - -bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, - enum mem_cgroup_events_target target) -{ - unsigned long val, next; - - val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events); - next = __this_cpu_read(memcg->vmstats_percpu->targets[target]); - /* from time_after() in jiffies.h */ - if ((long)(next - val) < 0) { - switch (target) { - case MEM_CGROUP_TARGET_THRESH: - next = val + THRESHOLDS_EVENTS_TARGET; - break; - case MEM_CGROUP_TARGET_SOFTLIMIT: - next = val + SOFTLIMIT_EVENTS_TARGET; - break; - default: - break; - } - __this_cpu_write(memcg->vmstats_percpu->targets[target], next); - return true; - } - return false; -} - struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) { /* @@ -971,6 +947,24 @@ again: } /** + * get_mem_cgroup_from_folio - Obtain a reference on a given folio's memcg. + * @folio: folio from which memcg should be extracted. + */ +struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio) +{ + struct mem_cgroup *memcg = folio_memcg(folio); + + if (mem_cgroup_disabled()) + return NULL; + + rcu_read_lock(); + if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css))) + memcg = root_mem_cgroup; + rcu_read_unlock(); + return memcg; +} + +/** * mem_cgroup_iter - iterate over memory cgroup hierarchy * @root: hierarchy root * @prev: previously returned memcg, NULL on first invocation @@ -992,9 +986,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup_reclaim_cookie *reclaim) { struct mem_cgroup_reclaim_iter *iter; - struct cgroup_subsys_state *css = NULL; - struct mem_cgroup *memcg = NULL; - struct mem_cgroup *pos = NULL; + struct cgroup_subsys_state *css; + struct mem_cgroup *pos; + struct mem_cgroup *next; if (mem_cgroup_disabled()) return NULL; @@ -1003,81 +997,67 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, root = root_mem_cgroup; rcu_read_lock(); +restart: + next = NULL; if (reclaim) { - struct mem_cgroup_per_node *mz; + int gen; + int nid = reclaim->pgdat->node_id; - mz = root->nodeinfo[reclaim->pgdat->node_id]; - iter = &mz->iter; + iter = &root->nodeinfo[nid]->iter; + gen = atomic_read(&iter->generation); /* * On start, join the current reclaim iteration cycle. * Exit when a concurrent walker completes it. */ if (!prev) - reclaim->generation = iter->generation; - else if (reclaim->generation != iter->generation) + reclaim->generation = gen; + else if (reclaim->generation != gen) goto out_unlock; - while (1) { - pos = READ_ONCE(iter->position); - if (!pos || css_tryget(&pos->css)) - break; - /* - * css reference reached zero, so iter->position will - * be cleared by ->css_released. However, we should not - * rely on this happening soon, because ->css_released - * is called from a work queue, and by busy-waiting we - * might block it. So we clear iter->position right - * away. - */ - (void)cmpxchg(&iter->position, pos, NULL); - } - } else if (prev) { + pos = READ_ONCE(iter->position); + } else pos = prev; - } - if (pos) - css = &pos->css; - - for (;;) { - css = css_next_descendant_pre(css, &root->css); - if (!css) { - /* - * Reclaimers share the hierarchy walk, and a - * new one might jump in right at the end of - * the hierarchy - make sure they see at least - * one group and restart from the beginning. - */ - if (!prev) - continue; - break; - } + css = pos ? &pos->css : NULL; + while ((css = css_next_descendant_pre(css, &root->css))) { /* * Verify the css and acquire a reference. The root * is provided by the caller, so we know it's alive * and kicking, and don't take an extra reference. */ - if (css == &root->css || css_tryget(css)) { - memcg = mem_cgroup_from_css(css); + if (css == &root->css || css_tryget(css)) break; - } } + next = mem_cgroup_from_css(css); + if (reclaim) { /* * The position could have already been updated by a competing * thread, so check that the value hasn't changed since we read * it to avoid reclaiming from the same cgroup twice. */ - (void)cmpxchg(&iter->position, pos, memcg); + if (cmpxchg(&iter->position, pos, next) != pos) { + if (css && css != &root->css) + css_put(css); + goto restart; + } - if (pos) - css_put(&pos->css); + if (!next) { + atomic_inc(&iter->generation); - if (!memcg) - iter->generation++; + /* + * Reclaimers share the hierarchy walk, and a + * new one might jump in right at the end of + * the hierarchy - make sure they see at least + * one group and restart from the beginning. + */ + if (!prev) + goto restart; + } } out_unlock: @@ -1085,7 +1065,7 @@ out_unlock: if (prev && prev != root) css_put(&prev->css); - return memcg; + return next; } /** @@ -1375,6 +1355,13 @@ static const struct memory_stat memory_stats[] = { { "workingset_restore_anon", WORKINGSET_RESTORE_ANON }, { "workingset_restore_file", WORKINGSET_RESTORE_FILE }, { "workingset_nodereclaim", WORKINGSET_NODERECLAIM }, + + { "pgdemote_kswapd", PGDEMOTE_KSWAPD }, + { "pgdemote_direct", PGDEMOTE_DIRECT }, + { "pgdemote_khugepaged", PGDEMOTE_KHUGEPAGED }, +#ifdef CONFIG_NUMA_BALANCING + { "pgpromote_success", PGPROMOTE_SUCCESS }, +#endif }; /* The actual unit of the state item, not the same as the output unit */ @@ -1399,6 +1386,9 @@ static int memcg_page_state_output_unit(int item) /* * Workingset state is actually in pages, but we export it to userspace * as a scalar count of events, so special case it here. + * + * Demotion and promotion activities are exported in pages, consistent + * with their global counterparts. */ switch (item) { case WORKINGSET_REFAULT_ANON: @@ -1408,6 +1398,12 @@ static int memcg_page_state_output_unit(int item) case WORKINGSET_RESTORE_ANON: case WORKINGSET_RESTORE_FILE: case WORKINGSET_NODERECLAIM: + case PGDEMOTE_KSWAPD: + case PGDEMOTE_DIRECT: + case PGDEMOTE_KHUGEPAGED: +#ifdef CONFIG_NUMA_BALANCING + case PGPROMOTE_SUCCESS: +#endif return 1; default: return memcg_page_state_unit(item); @@ -1466,10 +1462,11 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) memcg_events(memcg, PGSTEAL_KHUGEPAGED)); for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) { +#ifdef CONFIG_MEMCG_V1 if (memcg_vm_event_stat[i] == PGPGIN || memcg_vm_event_stat[i] == PGPGOUT) continue; - +#endif seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg_vm_event_stat[i]), memcg_events(memcg, memcg_vm_event_stat[i])); @@ -2366,7 +2363,7 @@ void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) { - VM_BUG_ON_FOLIO(folio_memcg(folio), folio); + VM_BUG_ON_FOLIO(folio_memcg_charged(folio), folio); /* * Any of the following ensures page's memcg stability: * @@ -2388,11 +2385,7 @@ void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg) { css_get(&memcg->css); commit_charge(folio, memcg); - - local_irq_disable(); - mem_cgroup_charge_statistics(memcg, folio_nr_pages(folio)); - memcg1_check_events(memcg, folio_nid(folio)); - local_irq_enable(); + memcg1_commit_charge(folio, memcg); } static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg, @@ -2446,37 +2439,7 @@ struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) /* * Returns a pointer to the memory cgroup to which the kernel object is charged. - * - * A passed kernel object can be a slab object, vmalloc object or a generic - * kernel page, so different mechanisms for getting the memory cgroup pointer - * should be used. - * - * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller - * can not know for sure how the kernel object is implemented. - * mem_cgroup_from_obj() can be safely used in such cases. - * - * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), - * cgroup_mutex, etc. - */ -struct mem_cgroup *mem_cgroup_from_obj(void *p) -{ - struct folio *folio; - - if (mem_cgroup_disabled()) - return NULL; - - if (unlikely(is_vmalloc_addr(p))) - folio = page_folio(vmalloc_to_page(p)); - else - folio = virt_to_folio(p); - - return mem_cgroup_from_obj_folio(folio, p); -} - -/* - * Returns a pointer to the memory cgroup to which the kernel object is charged. - * Similar to mem_cgroup_from_obj(), but faster and not suitable for objects, - * allocated using vmalloc(). + * It is not suitable for objects allocated using vmalloc(). * * A passed kernel object must be a slab object or a generic kernel page. * @@ -3057,12 +3020,11 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void split_page_memcg(struct page *head, int old_order, int new_order) { struct folio *folio = page_folio(head); - struct mem_cgroup *memcg = folio_memcg(folio); int i; unsigned int old_nr = 1 << old_order; unsigned int new_nr = 1 << new_order; - if (mem_cgroup_disabled() || !memcg) + if (mem_cgroup_disabled() || !folio_memcg_charged(folio)) return; for (i = new_nr; i < old_nr; i += new_nr) @@ -3071,7 +3033,7 @@ void split_page_memcg(struct page *head, int old_order, int new_order) if (folio_memcg_kmem(folio)) obj_cgroup_get_many(__folio_objcg(folio), old_nr / new_nr - 1); else - css_get_many(&memcg->css, old_nr / new_nr - 1); + css_get_many(&folio_memcg(folio)->css, old_nr / new_nr - 1); } unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) @@ -3385,29 +3347,12 @@ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) */ #define MEM_CGROUP_ID_MAX ((1UL << MEM_CGROUP_ID_SHIFT) - 1) -static DEFINE_IDR(mem_cgroup_idr); -static DEFINE_SPINLOCK(memcg_idr_lock); - -static int mem_cgroup_alloc_id(void) -{ - int ret; - - idr_preload(GFP_KERNEL); - spin_lock(&memcg_idr_lock); - ret = idr_alloc(&mem_cgroup_idr, NULL, 1, MEM_CGROUP_ID_MAX + 1, - GFP_NOWAIT); - spin_unlock(&memcg_idr_lock); - idr_preload_end(); - return ret; -} +static DEFINE_XARRAY_ALLOC1(mem_cgroup_ids); static void mem_cgroup_id_remove(struct mem_cgroup *memcg) { if (memcg->id.id > 0) { - spin_lock(&memcg_idr_lock); - idr_remove(&mem_cgroup_idr, memcg->id.id); - spin_unlock(&memcg_idr_lock); - + xa_erase(&mem_cgroup_ids, memcg->id.id); memcg->id.id = 0; } } @@ -3442,7 +3387,7 @@ static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) struct mem_cgroup *mem_cgroup_from_id(unsigned short id) { WARN_ON_ONCE(!rcu_read_lock_held()); - return idr_find(&mem_cgroup_idr, id); + return xa_load(&mem_cgroup_ids, id); } #ifdef CONFIG_SHRINKER_DEBUG @@ -3517,6 +3462,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); + memcg1_free_events(memcg); kfree(memcg->vmstats); free_percpu(memcg->vmstats_percpu); kfree(memcg); @@ -3535,17 +3481,17 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) struct mem_cgroup *memcg; int node, cpu; int __maybe_unused i; - long error = -ENOMEM; + long error; memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL); if (!memcg) - return ERR_PTR(error); + return ERR_PTR(-ENOMEM); - memcg->id.id = mem_cgroup_alloc_id(); - if (memcg->id.id < 0) { - error = memcg->id.id; + error = xa_alloc(&mem_cgroup_ids, &memcg->id.id, NULL, + XA_LIMIT(1, MEM_CGROUP_ID_MAX), GFP_KERNEL); + if (error) goto fail; - } + error = -ENOMEM; memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), GFP_KERNEL_ACCOUNT); @@ -3557,6 +3503,9 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) if (!memcg->vmstats_percpu) goto fail; + if (!memcg1_alloc_events(memcg)) + goto fail; + for_each_possible_cpu(cpu) { if (parent) pstatc = per_cpu_ptr(parent->vmstats_percpu, cpu); @@ -3574,6 +3523,9 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) INIT_WORK(&memcg->high_work, high_work_func); vmpressure_init(&memcg->vmpressure); + INIT_LIST_HEAD(&memcg->memory_peaks); + INIT_LIST_HEAD(&memcg->swap_peaks); + spin_lock_init(&memcg->peaks_lock); memcg->socket_pressure = jiffies; memcg1_memcg_init(memcg); memcg->kmemcg_id = -1; @@ -3619,21 +3571,21 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (parent) { WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent)); - page_counter_init(&memcg->memory, &parent->memory); - page_counter_init(&memcg->swap, &parent->swap); + page_counter_init(&memcg->memory, &parent->memory, true); + page_counter_init(&memcg->swap, &parent->swap, false); #ifdef CONFIG_MEMCG_V1 WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable)); - page_counter_init(&memcg->kmem, &parent->kmem); - page_counter_init(&memcg->tcpmem, &parent->tcpmem); + page_counter_init(&memcg->kmem, &parent->kmem, false); + page_counter_init(&memcg->tcpmem, &parent->tcpmem, false); #endif } else { init_memcg_stats(); init_memcg_events(); - page_counter_init(&memcg->memory, NULL); - page_counter_init(&memcg->swap, NULL); + page_counter_init(&memcg->memory, NULL, true); + page_counter_init(&memcg->swap, NULL, false); #ifdef CONFIG_MEMCG_V1 - page_counter_init(&memcg->kmem, NULL); - page_counter_init(&memcg->tcpmem, NULL); + page_counter_init(&memcg->kmem, NULL, false); + page_counter_init(&memcg->tcpmem, NULL, false); #endif root_mem_cgroup = memcg; return &memcg->css; @@ -3682,9 +3634,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) * publish it here at the end of onlining. This matches the * regular ID destruction during offlining. */ - spin_lock(&memcg_idr_lock); - idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); - spin_unlock(&memcg_idr_lock); + xa_store(&mem_cgroup_ids, memcg->id.id, memcg, GFP_KERNEL); return 0; offline_kmem: @@ -3967,14 +3917,91 @@ static u64 memory_current_read(struct cgroup_subsys_state *css, return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; } -static u64 memory_peak_read(struct cgroup_subsys_state *css, - struct cftype *cft) +#define OFP_PEAK_UNSET (((-1UL))) + +static int peak_show(struct seq_file *sf, void *v, struct page_counter *pc) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct cgroup_of_peak *ofp = of_peak(sf->private); + u64 fd_peak = READ_ONCE(ofp->value), peak; + + /* User wants global or local peak? */ + if (fd_peak == OFP_PEAK_UNSET) + peak = pc->watermark; + else + peak = max(fd_peak, READ_ONCE(pc->local_watermark)); + + seq_printf(sf, "%llu\n", peak * PAGE_SIZE); + return 0; +} + +static int memory_peak_show(struct seq_file *sf, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); - return (u64)memcg->memory.watermark * PAGE_SIZE; + return peak_show(sf, v, &memcg->memory); } +static int peak_open(struct kernfs_open_file *of) +{ + struct cgroup_of_peak *ofp = of_peak(of); + + ofp->value = OFP_PEAK_UNSET; + return 0; +} + +static void peak_release(struct kernfs_open_file *of) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + struct cgroup_of_peak *ofp = of_peak(of); + + if (ofp->value == OFP_PEAK_UNSET) { + /* fast path (no writes on this fd) */ + return; + } + spin_lock(&memcg->peaks_lock); + list_del(&ofp->list); + spin_unlock(&memcg->peaks_lock); +} + +static ssize_t peak_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off, struct page_counter *pc, + struct list_head *watchers) +{ + unsigned long usage; + struct cgroup_of_peak *peer_ctx; + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + struct cgroup_of_peak *ofp = of_peak(of); + + spin_lock(&memcg->peaks_lock); + + usage = page_counter_read(pc); + WRITE_ONCE(pc->local_watermark, usage); + + list_for_each_entry(peer_ctx, watchers, list) + if (usage > peer_ctx->value) + WRITE_ONCE(peer_ctx->value, usage); + + /* initial write, register watcher */ + if (ofp->value == -1) + list_add(&ofp->list, watchers); + + WRITE_ONCE(ofp->value, usage); + spin_unlock(&memcg->peaks_lock); + + return nbytes; +} + +static ssize_t memory_peak_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + + return peak_write(of, buf, nbytes, off, &memcg->memory, + &memcg->memory_peaks); +} + +#undef OFP_PEAK_UNSET + static int memory_min_show(struct seq_file *m, void *v) { return seq_puts_memcg_tunable(m, @@ -4324,7 +4351,10 @@ static struct cftype memory_files[] = { { .name = "peak", .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = memory_peak_read, + .open = peak_open, + .release = peak_release, + .seq_show = memory_peak_show, + .write = memory_peak_write, }, { .name = "min", @@ -4528,14 +4558,15 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, /* * mem_cgroup_swapin_uncharge_swap - uncharge swap slot - * @entry: swap entry for which the page is charged + * @entry: the first swap entry for which the pages are charged + * @nr_pages: number of pages which will be uncharged * * Call this function after successfully adding the charged page to swapcache. * * Note: This function assumes the page for which swap slot is being uncharged * is order 0 page. */ -void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) +void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) { /* * Cgroup1's unified memory+swap counter has been charged with the @@ -4555,7 +4586,7 @@ void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) * let's not wait for it. The page already received a * memory+swap charge, drop the swap entry duplicate. */ - mem_cgroup_uncharge_swap(entry, 1); + mem_cgroup_uncharge_swap(entry, nr_pages); } } @@ -4574,8 +4605,6 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug) static void uncharge_batch(const struct uncharge_gather *ug) { - unsigned long flags; - if (ug->nr_memory) { page_counter_uncharge(&ug->memcg->memory, ug->nr_memory); if (do_memsw_account()) @@ -4587,11 +4616,7 @@ static void uncharge_batch(const struct uncharge_gather *ug) memcg1_oom_recover(ug->memcg); } - local_irq_save(flags); - __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); - __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory); - memcg1_check_events(ug->memcg, ug->nid); - local_irq_restore(flags); + memcg1_uncharge_batch(ug->memcg, ug->pgpgout, ug->nr_memory, ug->nid); /* drop reference from uncharge_folio */ css_put(&ug->memcg->css); @@ -4606,7 +4631,8 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); VM_BUG_ON_FOLIO(folio_order(folio) > 1 && !folio_test_hugetlb(folio) && - !list_empty(&folio->_deferred_list), folio); + !list_empty(&folio->_deferred_list) && + folio_test_partially_mapped(folio), folio); /* * Nobody should be changing or seriously looking at @@ -4664,7 +4690,7 @@ void __mem_cgroup_uncharge(struct folio *folio) struct uncharge_gather ug; /* Don't touch folio->lru of any random page, pre-check: */ - if (!folio_memcg(folio)) + if (!folio_memcg_charged(folio)) return; uncharge_gather_clear(&ug); @@ -4698,7 +4724,6 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new) { struct mem_cgroup *memcg; long nr_pages = folio_nr_pages(new); - unsigned long flags; VM_BUG_ON_FOLIO(!folio_test_locked(old), old); VM_BUG_ON_FOLIO(!folio_test_locked(new), new); @@ -4709,7 +4734,7 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new) return; /* Page cache replacement: new folio already charged? */ - if (folio_memcg(new)) + if (folio_memcg_charged(new)) return; memcg = folio_memcg(old); @@ -4726,11 +4751,7 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new) css_get(&memcg->css); commit_charge(new, memcg); - - local_irq_save(flags); - mem_cgroup_charge_statistics(memcg, nr_pages); - memcg1_check_events(memcg, folio_nid(new)); - local_irq_restore(flags); + memcg1_commit_charge(new, memcg); } /** @@ -4966,17 +4987,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) page_counter_uncharge(&memcg->memsw, nr_entries); } - /* - * Interrupts should be disabled here because the caller holds the - * i_pages lock which is taken with interrupts-off. It is - * important here to have the interrupts disabled because it is the - * only synchronisation we have for updating the per-CPU variables. - */ - memcg_stats_lock(); - mem_cgroup_charge_statistics(memcg, -nr_entries); - memcg_stats_unlock(); - memcg1_check_events(memcg, folio_nid(folio)); - + memcg1_swapout(folio, memcg); css_put(&memcg->css); } @@ -5116,12 +5127,20 @@ static u64 swap_current_read(struct cgroup_subsys_state *css, return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; } -static u64 swap_peak_read(struct cgroup_subsys_state *css, - struct cftype *cft) +static int swap_peak_show(struct seq_file *sf, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); + + return peak_show(sf, v, &memcg->swap); +} + +static ssize_t swap_peak_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); - return (u64)memcg->swap.watermark * PAGE_SIZE; + return peak_write(of, buf, nbytes, off, &memcg->swap, + &memcg->swap_peaks); } static int swap_high_show(struct seq_file *m, void *v) @@ -5205,7 +5224,10 @@ static struct cftype swap_files[] = { { .name = "swap.peak", .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = swap_peak_read, + .open = peak_open, + .release = peak_release, + .seq_show = swap_peak_show, + .write = swap_peak_write, }, { .name = "swap.events", |