summaryrefslogtreecommitdiffstats
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c490
1 files changed, 256 insertions, 234 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d563fb515766..7845c64a2c57 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -25,6 +25,7 @@
* Copyright (C) 2020 Alibaba, Inc, Alex Shi
*/
+#include <linux/cgroup-defs.h>
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
@@ -41,6 +42,7 @@
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
+#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
@@ -93,9 +95,6 @@ static bool cgroup_memory_nobpf __ro_after_init;
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif
-#define THRESHOLDS_EVENTS_TARGET 128
-#define SOFTLIMIT_EVENTS_TARGET 1024
-
static inline bool task_is_dying(void)
{
return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
@@ -305,6 +304,12 @@ static const unsigned int memcg_node_stat_items[] = {
#ifdef CONFIG_SWAP
NR_SWAPCACHE,
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ PGPROMOTE_SUCCESS,
+#endif
+ PGDEMOTE_KSWAPD,
+ PGDEMOTE_DIRECT,
+ PGDEMOTE_KHUGEPAGED,
};
static const unsigned int memcg_stat_items[] = {
@@ -320,24 +325,27 @@ static const unsigned int memcg_stat_items[] = {
#define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items)
#define MEMCG_VMSTAT_SIZE (NR_MEMCG_NODE_STAT_ITEMS + \
ARRAY_SIZE(memcg_stat_items))
-static int8_t mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly;
+#define BAD_STAT_IDX(index) ((u32)(index) >= U8_MAX)
+static u8 mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly;
static void init_memcg_stats(void)
{
- int8_t i, j = 0;
+ u8 i, j = 0;
+
+ BUILD_BUG_ON(MEMCG_NR_STAT >= U8_MAX);
- BUILD_BUG_ON(MEMCG_NR_STAT >= S8_MAX);
+ memset(mem_cgroup_stats_index, U8_MAX, sizeof(mem_cgroup_stats_index));
- for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i)
- mem_cgroup_stats_index[memcg_node_stat_items[i]] = ++j;
+ for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i, ++j)
+ mem_cgroup_stats_index[memcg_node_stat_items[i]] = j;
- for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i)
- mem_cgroup_stats_index[memcg_stat_items[i]] = ++j;
+ for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i, ++j)
+ mem_cgroup_stats_index[memcg_stat_items[i]] = j;
}
static inline int memcg_stats_index(int idx)
{
- return mem_cgroup_stats_index[idx] - 1;
+ return mem_cgroup_stats_index[idx];
}
struct lruvec_stats_percpu {
@@ -369,7 +377,7 @@ unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
return node_page_state(lruvec_pgdat(lruvec), idx);
i = memcg_stats_index(idx);
- if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx))
+ if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return 0;
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
@@ -392,7 +400,7 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
return node_page_state(lruvec_pgdat(lruvec), idx);
i = memcg_stats_index(idx);
- if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx))
+ if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return 0;
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
@@ -406,8 +414,10 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
/* Subset of vm_event_item to report for memcg event stats */
static const unsigned int memcg_vm_event_stat[] = {
+#ifdef CONFIG_MEMCG_V1
PGPGIN,
PGPGOUT,
+#endif
PGSCAN_KSWAPD,
PGSCAN_DIRECT,
PGSCAN_KHUGEPAGED,
@@ -432,24 +442,32 @@ static const unsigned int memcg_vm_event_stat[] = {
THP_SWPOUT,
THP_SWPOUT_FALLBACK,
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ NUMA_PAGE_MIGRATE,
+ NUMA_PTE_UPDATES,
+ NUMA_HINT_FAULTS,
+#endif
};
#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat)
-static int8_t mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly;
+static u8 mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly;
static void init_memcg_events(void)
{
- int8_t i;
+ u8 i;
- BUILD_BUG_ON(NR_VM_EVENT_ITEMS >= S8_MAX);
+ BUILD_BUG_ON(NR_VM_EVENT_ITEMS >= U8_MAX);
+
+ memset(mem_cgroup_events_index, U8_MAX,
+ sizeof(mem_cgroup_events_index));
for (i = 0; i < NR_MEMCG_EVENTS; ++i)
- mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1;
+ mem_cgroup_events_index[memcg_vm_event_stat[i]] = i;
}
static inline int memcg_events_index(enum vm_event_item idx)
{
- return mem_cgroup_events_index[idx] - 1;
+ return mem_cgroup_events_index[idx];
}
struct memcg_vmstats_percpu {
@@ -469,10 +487,6 @@ struct memcg_vmstats_percpu {
/* Delta calculation for lockless upward propagation */
long state_prev[MEMCG_VMSTAT_SIZE];
unsigned long events_prev[NR_MEMCG_EVENTS];
-
- /* Cgroup1: threshold notifications & softlimit tree updates */
- unsigned long nr_page_events;
- unsigned long targets[MEM_CGROUP_NTARGETS];
} ____cacheline_aligned;
struct memcg_vmstats {
@@ -621,7 +635,7 @@ unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
long x;
int i = memcg_stats_index(idx);
- if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx))
+ if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return 0;
x = READ_ONCE(memcg->vmstats->state[i]);
@@ -662,7 +676,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
if (mem_cgroup_disabled())
return;
- if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx))
+ if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return;
__this_cpu_add(memcg->vmstats_percpu->state[i], val);
@@ -675,7 +689,7 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
long x;
int i = memcg_stats_index(idx);
- if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx))
+ if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return 0;
x = READ_ONCE(memcg->vmstats->state_local[i]);
@@ -694,7 +708,7 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
struct mem_cgroup *memcg;
int i = memcg_stats_index(idx);
- if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx))
+ if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return;
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
@@ -810,7 +824,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
if (mem_cgroup_disabled())
return;
- if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx))
+ if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return;
memcg_stats_lock();
@@ -823,7 +837,7 @@ unsigned long memcg_events(struct mem_cgroup *memcg, int event)
{
int i = memcg_events_index(event);
- if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, event))
+ if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, event))
return 0;
return READ_ONCE(memcg->vmstats->events[i]);
@@ -833,50 +847,12 @@ unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
int i = memcg_events_index(event);
- if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, event))
+ if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, event))
return 0;
return READ_ONCE(memcg->vmstats->events_local[i]);
}
-void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
-{
- /* pagein of a big page is an event. So, ignore page size */
- if (nr_pages > 0)
- __count_memcg_events(memcg, PGPGIN, 1);
- else {
- __count_memcg_events(memcg, PGPGOUT, 1);
- nr_pages = -nr_pages; /* for event */
- }
-
- __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
-}
-
-bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
- enum mem_cgroup_events_target target)
-{
- unsigned long val, next;
-
- val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
- next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
- /* from time_after() in jiffies.h */
- if ((long)(next - val) < 0) {
- switch (target) {
- case MEM_CGROUP_TARGET_THRESH:
- next = val + THRESHOLDS_EVENTS_TARGET;
- break;
- case MEM_CGROUP_TARGET_SOFTLIMIT:
- next = val + SOFTLIMIT_EVENTS_TARGET;
- break;
- default:
- break;
- }
- __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
- return true;
- }
- return false;
-}
-
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
/*
@@ -971,6 +947,24 @@ again:
}
/**
+ * get_mem_cgroup_from_folio - Obtain a reference on a given folio's memcg.
+ * @folio: folio from which memcg should be extracted.
+ */
+struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
+{
+ struct mem_cgroup *memcg = folio_memcg(folio);
+
+ if (mem_cgroup_disabled())
+ return NULL;
+
+ rcu_read_lock();
+ if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
+ memcg = root_mem_cgroup;
+ rcu_read_unlock();
+ return memcg;
+}
+
+/**
* mem_cgroup_iter - iterate over memory cgroup hierarchy
* @root: hierarchy root
* @prev: previously returned memcg, NULL on first invocation
@@ -992,9 +986,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup_reclaim_cookie *reclaim)
{
struct mem_cgroup_reclaim_iter *iter;
- struct cgroup_subsys_state *css = NULL;
- struct mem_cgroup *memcg = NULL;
- struct mem_cgroup *pos = NULL;
+ struct cgroup_subsys_state *css;
+ struct mem_cgroup *pos;
+ struct mem_cgroup *next;
if (mem_cgroup_disabled())
return NULL;
@@ -1003,81 +997,67 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
root = root_mem_cgroup;
rcu_read_lock();
+restart:
+ next = NULL;
if (reclaim) {
- struct mem_cgroup_per_node *mz;
+ int gen;
+ int nid = reclaim->pgdat->node_id;
- mz = root->nodeinfo[reclaim->pgdat->node_id];
- iter = &mz->iter;
+ iter = &root->nodeinfo[nid]->iter;
+ gen = atomic_read(&iter->generation);
/*
* On start, join the current reclaim iteration cycle.
* Exit when a concurrent walker completes it.
*/
if (!prev)
- reclaim->generation = iter->generation;
- else if (reclaim->generation != iter->generation)
+ reclaim->generation = gen;
+ else if (reclaim->generation != gen)
goto out_unlock;
- while (1) {
- pos = READ_ONCE(iter->position);
- if (!pos || css_tryget(&pos->css))
- break;
- /*
- * css reference reached zero, so iter->position will
- * be cleared by ->css_released. However, we should not
- * rely on this happening soon, because ->css_released
- * is called from a work queue, and by busy-waiting we
- * might block it. So we clear iter->position right
- * away.
- */
- (void)cmpxchg(&iter->position, pos, NULL);
- }
- } else if (prev) {
+ pos = READ_ONCE(iter->position);
+ } else
pos = prev;
- }
- if (pos)
- css = &pos->css;
-
- for (;;) {
- css = css_next_descendant_pre(css, &root->css);
- if (!css) {
- /*
- * Reclaimers share the hierarchy walk, and a
- * new one might jump in right at the end of
- * the hierarchy - make sure they see at least
- * one group and restart from the beginning.
- */
- if (!prev)
- continue;
- break;
- }
+ css = pos ? &pos->css : NULL;
+ while ((css = css_next_descendant_pre(css, &root->css))) {
/*
* Verify the css and acquire a reference. The root
* is provided by the caller, so we know it's alive
* and kicking, and don't take an extra reference.
*/
- if (css == &root->css || css_tryget(css)) {
- memcg = mem_cgroup_from_css(css);
+ if (css == &root->css || css_tryget(css))
break;
- }
}
+ next = mem_cgroup_from_css(css);
+
if (reclaim) {
/*
* The position could have already been updated by a competing
* thread, so check that the value hasn't changed since we read
* it to avoid reclaiming from the same cgroup twice.
*/
- (void)cmpxchg(&iter->position, pos, memcg);
+ if (cmpxchg(&iter->position, pos, next) != pos) {
+ if (css && css != &root->css)
+ css_put(css);
+ goto restart;
+ }
- if (pos)
- css_put(&pos->css);
+ if (!next) {
+ atomic_inc(&iter->generation);
- if (!memcg)
- iter->generation++;
+ /*
+ * Reclaimers share the hierarchy walk, and a
+ * new one might jump in right at the end of
+ * the hierarchy - make sure they see at least
+ * one group and restart from the beginning.
+ */
+ if (!prev)
+ goto restart;
+ }
}
out_unlock:
@@ -1085,7 +1065,7 @@ out_unlock:
if (prev && prev != root)
css_put(&prev->css);
- return memcg;
+ return next;
}
/**
@@ -1375,6 +1355,13 @@ static const struct memory_stat memory_stats[] = {
{ "workingset_restore_anon", WORKINGSET_RESTORE_ANON },
{ "workingset_restore_file", WORKINGSET_RESTORE_FILE },
{ "workingset_nodereclaim", WORKINGSET_NODERECLAIM },
+
+ { "pgdemote_kswapd", PGDEMOTE_KSWAPD },
+ { "pgdemote_direct", PGDEMOTE_DIRECT },
+ { "pgdemote_khugepaged", PGDEMOTE_KHUGEPAGED },
+#ifdef CONFIG_NUMA_BALANCING
+ { "pgpromote_success", PGPROMOTE_SUCCESS },
+#endif
};
/* The actual unit of the state item, not the same as the output unit */
@@ -1399,6 +1386,9 @@ static int memcg_page_state_output_unit(int item)
/*
* Workingset state is actually in pages, but we export it to userspace
* as a scalar count of events, so special case it here.
+ *
+ * Demotion and promotion activities are exported in pages, consistent
+ * with their global counterparts.
*/
switch (item) {
case WORKINGSET_REFAULT_ANON:
@@ -1408,6 +1398,12 @@ static int memcg_page_state_output_unit(int item)
case WORKINGSET_RESTORE_ANON:
case WORKINGSET_RESTORE_FILE:
case WORKINGSET_NODERECLAIM:
+ case PGDEMOTE_KSWAPD:
+ case PGDEMOTE_DIRECT:
+ case PGDEMOTE_KHUGEPAGED:
+#ifdef CONFIG_NUMA_BALANCING
+ case PGPROMOTE_SUCCESS:
+#endif
return 1;
default:
return memcg_page_state_unit(item);
@@ -1466,10 +1462,11 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
memcg_events(memcg, PGSTEAL_KHUGEPAGED));
for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) {
+#ifdef CONFIG_MEMCG_V1
if (memcg_vm_event_stat[i] == PGPGIN ||
memcg_vm_event_stat[i] == PGPGOUT)
continue;
-
+#endif
seq_buf_printf(s, "%s %lu\n",
vm_event_name(memcg_vm_event_stat[i]),
memcg_events(memcg, memcg_vm_event_stat[i]));
@@ -2366,7 +2363,7 @@ void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
{
- VM_BUG_ON_FOLIO(folio_memcg(folio), folio);
+ VM_BUG_ON_FOLIO(folio_memcg_charged(folio), folio);
/*
* Any of the following ensures page's memcg stability:
*
@@ -2388,11 +2385,7 @@ void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
{
css_get(&memcg->css);
commit_charge(folio, memcg);
-
- local_irq_disable();
- mem_cgroup_charge_statistics(memcg, folio_nr_pages(folio));
- memcg1_check_events(memcg, folio_nid(folio));
- local_irq_enable();
+ memcg1_commit_charge(folio, memcg);
}
static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg,
@@ -2446,37 +2439,7 @@ struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
/*
* Returns a pointer to the memory cgroup to which the kernel object is charged.
- *
- * A passed kernel object can be a slab object, vmalloc object or a generic
- * kernel page, so different mechanisms for getting the memory cgroup pointer
- * should be used.
- *
- * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller
- * can not know for sure how the kernel object is implemented.
- * mem_cgroup_from_obj() can be safely used in such cases.
- *
- * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
- * cgroup_mutex, etc.
- */
-struct mem_cgroup *mem_cgroup_from_obj(void *p)
-{
- struct folio *folio;
-
- if (mem_cgroup_disabled())
- return NULL;
-
- if (unlikely(is_vmalloc_addr(p)))
- folio = page_folio(vmalloc_to_page(p));
- else
- folio = virt_to_folio(p);
-
- return mem_cgroup_from_obj_folio(folio, p);
-}
-
-/*
- * Returns a pointer to the memory cgroup to which the kernel object is charged.
- * Similar to mem_cgroup_from_obj(), but faster and not suitable for objects,
- * allocated using vmalloc().
+ * It is not suitable for objects allocated using vmalloc().
*
* A passed kernel object must be a slab object or a generic kernel page.
*
@@ -3057,12 +3020,11 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
void split_page_memcg(struct page *head, int old_order, int new_order)
{
struct folio *folio = page_folio(head);
- struct mem_cgroup *memcg = folio_memcg(folio);
int i;
unsigned int old_nr = 1 << old_order;
unsigned int new_nr = 1 << new_order;
- if (mem_cgroup_disabled() || !memcg)
+ if (mem_cgroup_disabled() || !folio_memcg_charged(folio))
return;
for (i = new_nr; i < old_nr; i += new_nr)
@@ -3071,7 +3033,7 @@ void split_page_memcg(struct page *head, int old_order, int new_order)
if (folio_memcg_kmem(folio))
obj_cgroup_get_many(__folio_objcg(folio), old_nr / new_nr - 1);
else
- css_get_many(&memcg->css, old_nr / new_nr - 1);
+ css_get_many(&folio_memcg(folio)->css, old_nr / new_nr - 1);
}
unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
@@ -3385,29 +3347,12 @@ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
*/
#define MEM_CGROUP_ID_MAX ((1UL << MEM_CGROUP_ID_SHIFT) - 1)
-static DEFINE_IDR(mem_cgroup_idr);
-static DEFINE_SPINLOCK(memcg_idr_lock);
-
-static int mem_cgroup_alloc_id(void)
-{
- int ret;
-
- idr_preload(GFP_KERNEL);
- spin_lock(&memcg_idr_lock);
- ret = idr_alloc(&mem_cgroup_idr, NULL, 1, MEM_CGROUP_ID_MAX + 1,
- GFP_NOWAIT);
- spin_unlock(&memcg_idr_lock);
- idr_preload_end();
- return ret;
-}
+static DEFINE_XARRAY_ALLOC1(mem_cgroup_ids);
static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
{
if (memcg->id.id > 0) {
- spin_lock(&memcg_idr_lock);
- idr_remove(&mem_cgroup_idr, memcg->id.id);
- spin_unlock(&memcg_idr_lock);
-
+ xa_erase(&mem_cgroup_ids, memcg->id.id);
memcg->id.id = 0;
}
}
@@ -3442,7 +3387,7 @@ static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
{
WARN_ON_ONCE(!rcu_read_lock_held());
- return idr_find(&mem_cgroup_idr, id);
+ return xa_load(&mem_cgroup_ids, id);
}
#ifdef CONFIG_SHRINKER_DEBUG
@@ -3517,6 +3462,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
+ memcg1_free_events(memcg);
kfree(memcg->vmstats);
free_percpu(memcg->vmstats_percpu);
kfree(memcg);
@@ -3535,17 +3481,17 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
struct mem_cgroup *memcg;
int node, cpu;
int __maybe_unused i;
- long error = -ENOMEM;
+ long error;
memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL);
if (!memcg)
- return ERR_PTR(error);
+ return ERR_PTR(-ENOMEM);
- memcg->id.id = mem_cgroup_alloc_id();
- if (memcg->id.id < 0) {
- error = memcg->id.id;
+ error = xa_alloc(&mem_cgroup_ids, &memcg->id.id, NULL,
+ XA_LIMIT(1, MEM_CGROUP_ID_MAX), GFP_KERNEL);
+ if (error)
goto fail;
- }
+ error = -ENOMEM;
memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats),
GFP_KERNEL_ACCOUNT);
@@ -3557,6 +3503,9 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
if (!memcg->vmstats_percpu)
goto fail;
+ if (!memcg1_alloc_events(memcg))
+ goto fail;
+
for_each_possible_cpu(cpu) {
if (parent)
pstatc = per_cpu_ptr(parent->vmstats_percpu, cpu);
@@ -3574,6 +3523,9 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
INIT_WORK(&memcg->high_work, high_work_func);
vmpressure_init(&memcg->vmpressure);
+ INIT_LIST_HEAD(&memcg->memory_peaks);
+ INIT_LIST_HEAD(&memcg->swap_peaks);
+ spin_lock_init(&memcg->peaks_lock);
memcg->socket_pressure = jiffies;
memcg1_memcg_init(memcg);
memcg->kmemcg_id = -1;
@@ -3619,21 +3571,21 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (parent) {
WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
- page_counter_init(&memcg->memory, &parent->memory);
- page_counter_init(&memcg->swap, &parent->swap);
+ page_counter_init(&memcg->memory, &parent->memory, true);
+ page_counter_init(&memcg->swap, &parent->swap, false);
#ifdef CONFIG_MEMCG_V1
WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable));
- page_counter_init(&memcg->kmem, &parent->kmem);
- page_counter_init(&memcg->tcpmem, &parent->tcpmem);
+ page_counter_init(&memcg->kmem, &parent->kmem, false);
+ page_counter_init(&memcg->tcpmem, &parent->tcpmem, false);
#endif
} else {
init_memcg_stats();
init_memcg_events();
- page_counter_init(&memcg->memory, NULL);
- page_counter_init(&memcg->swap, NULL);
+ page_counter_init(&memcg->memory, NULL, true);
+ page_counter_init(&memcg->swap, NULL, false);
#ifdef CONFIG_MEMCG_V1
- page_counter_init(&memcg->kmem, NULL);
- page_counter_init(&memcg->tcpmem, NULL);
+ page_counter_init(&memcg->kmem, NULL, false);
+ page_counter_init(&memcg->tcpmem, NULL, false);
#endif
root_mem_cgroup = memcg;
return &memcg->css;
@@ -3682,9 +3634,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
* publish it here at the end of onlining. This matches the
* regular ID destruction during offlining.
*/
- spin_lock(&memcg_idr_lock);
- idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
- spin_unlock(&memcg_idr_lock);
+ xa_store(&mem_cgroup_ids, memcg->id.id, memcg, GFP_KERNEL);
return 0;
offline_kmem:
@@ -3967,14 +3917,91 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
}
-static u64 memory_peak_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
+#define OFP_PEAK_UNSET (((-1UL)))
+
+static int peak_show(struct seq_file *sf, void *v, struct page_counter *pc)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct cgroup_of_peak *ofp = of_peak(sf->private);
+ u64 fd_peak = READ_ONCE(ofp->value), peak;
+
+ /* User wants global or local peak? */
+ if (fd_peak == OFP_PEAK_UNSET)
+ peak = pc->watermark;
+ else
+ peak = max(fd_peak, READ_ONCE(pc->local_watermark));
+
+ seq_printf(sf, "%llu\n", peak * PAGE_SIZE);
+ return 0;
+}
+
+static int memory_peak_show(struct seq_file *sf, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
- return (u64)memcg->memory.watermark * PAGE_SIZE;
+ return peak_show(sf, v, &memcg->memory);
}
+static int peak_open(struct kernfs_open_file *of)
+{
+ struct cgroup_of_peak *ofp = of_peak(of);
+
+ ofp->value = OFP_PEAK_UNSET;
+ return 0;
+}
+
+static void peak_release(struct kernfs_open_file *of)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ struct cgroup_of_peak *ofp = of_peak(of);
+
+ if (ofp->value == OFP_PEAK_UNSET) {
+ /* fast path (no writes on this fd) */
+ return;
+ }
+ spin_lock(&memcg->peaks_lock);
+ list_del(&ofp->list);
+ spin_unlock(&memcg->peaks_lock);
+}
+
+static ssize_t peak_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
+ loff_t off, struct page_counter *pc,
+ struct list_head *watchers)
+{
+ unsigned long usage;
+ struct cgroup_of_peak *peer_ctx;
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ struct cgroup_of_peak *ofp = of_peak(of);
+
+ spin_lock(&memcg->peaks_lock);
+
+ usage = page_counter_read(pc);
+ WRITE_ONCE(pc->local_watermark, usage);
+
+ list_for_each_entry(peer_ctx, watchers, list)
+ if (usage > peer_ctx->value)
+ WRITE_ONCE(peer_ctx->value, usage);
+
+ /* initial write, register watcher */
+ if (ofp->value == -1)
+ list_add(&ofp->list, watchers);
+
+ WRITE_ONCE(ofp->value, usage);
+ spin_unlock(&memcg->peaks_lock);
+
+ return nbytes;
+}
+
+static ssize_t memory_peak_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+
+ return peak_write(of, buf, nbytes, off, &memcg->memory,
+ &memcg->memory_peaks);
+}
+
+#undef OFP_PEAK_UNSET
+
static int memory_min_show(struct seq_file *m, void *v)
{
return seq_puts_memcg_tunable(m,
@@ -4324,7 +4351,10 @@ static struct cftype memory_files[] = {
{
.name = "peak",
.flags = CFTYPE_NOT_ON_ROOT,
- .read_u64 = memory_peak_read,
+ .open = peak_open,
+ .release = peak_release,
+ .seq_show = memory_peak_show,
+ .write = memory_peak_write,
},
{
.name = "min",
@@ -4528,14 +4558,15 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
/*
* mem_cgroup_swapin_uncharge_swap - uncharge swap slot
- * @entry: swap entry for which the page is charged
+ * @entry: the first swap entry for which the pages are charged
+ * @nr_pages: number of pages which will be uncharged
*
* Call this function after successfully adding the charged page to swapcache.
*
* Note: This function assumes the page for which swap slot is being uncharged
* is order 0 page.
*/
-void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
+void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
{
/*
* Cgroup1's unified memory+swap counter has been charged with the
@@ -4555,7 +4586,7 @@ void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
* let's not wait for it. The page already received a
* memory+swap charge, drop the swap entry duplicate.
*/
- mem_cgroup_uncharge_swap(entry, 1);
+ mem_cgroup_uncharge_swap(entry, nr_pages);
}
}
@@ -4574,8 +4605,6 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug)
static void uncharge_batch(const struct uncharge_gather *ug)
{
- unsigned long flags;
-
if (ug->nr_memory) {
page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
if (do_memsw_account())
@@ -4587,11 +4616,7 @@ static void uncharge_batch(const struct uncharge_gather *ug)
memcg1_oom_recover(ug->memcg);
}
- local_irq_save(flags);
- __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
- __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
- memcg1_check_events(ug->memcg, ug->nid);
- local_irq_restore(flags);
+ memcg1_uncharge_batch(ug->memcg, ug->pgpgout, ug->nr_memory, ug->nid);
/* drop reference from uncharge_folio */
css_put(&ug->memcg->css);
@@ -4606,7 +4631,8 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
!folio_test_hugetlb(folio) &&
- !list_empty(&folio->_deferred_list), folio);
+ !list_empty(&folio->_deferred_list) &&
+ folio_test_partially_mapped(folio), folio);
/*
* Nobody should be changing or seriously looking at
@@ -4664,7 +4690,7 @@ void __mem_cgroup_uncharge(struct folio *folio)
struct uncharge_gather ug;
/* Don't touch folio->lru of any random page, pre-check: */
- if (!folio_memcg(folio))
+ if (!folio_memcg_charged(folio))
return;
uncharge_gather_clear(&ug);
@@ -4698,7 +4724,6 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new)
{
struct mem_cgroup *memcg;
long nr_pages = folio_nr_pages(new);
- unsigned long flags;
VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
@@ -4709,7 +4734,7 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new)
return;
/* Page cache replacement: new folio already charged? */
- if (folio_memcg(new))
+ if (folio_memcg_charged(new))
return;
memcg = folio_memcg(old);
@@ -4726,11 +4751,7 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new)
css_get(&memcg->css);
commit_charge(new, memcg);
-
- local_irq_save(flags);
- mem_cgroup_charge_statistics(memcg, nr_pages);
- memcg1_check_events(memcg, folio_nid(new));
- local_irq_restore(flags);
+ memcg1_commit_charge(new, memcg);
}
/**
@@ -4966,17 +4987,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
page_counter_uncharge(&memcg->memsw, nr_entries);
}
- /*
- * Interrupts should be disabled here because the caller holds the
- * i_pages lock which is taken with interrupts-off. It is
- * important here to have the interrupts disabled because it is the
- * only synchronisation we have for updating the per-CPU variables.
- */
- memcg_stats_lock();
- mem_cgroup_charge_statistics(memcg, -nr_entries);
- memcg_stats_unlock();
- memcg1_check_events(memcg, folio_nid(folio));
-
+ memcg1_swapout(folio, memcg);
css_put(&memcg->css);
}
@@ -5116,12 +5127,20 @@ static u64 swap_current_read(struct cgroup_subsys_state *css,
return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
}
-static u64 swap_peak_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
+static int swap_peak_show(struct seq_file *sf, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
+
+ return peak_show(sf, v, &memcg->swap);
+}
+
+static ssize_t swap_peak_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- return (u64)memcg->swap.watermark * PAGE_SIZE;
+ return peak_write(of, buf, nbytes, off, &memcg->swap,
+ &memcg->swap_peaks);
}
static int swap_high_show(struct seq_file *m, void *v)
@@ -5205,7 +5224,10 @@ static struct cftype swap_files[] = {
{
.name = "swap.peak",
.flags = CFTYPE_NOT_ON_ROOT,
- .read_u64 = swap_peak_read,
+ .open = peak_open,
+ .release = peak_release,
+ .seq_show = swap_peak_show,
+ .write = swap_peak_write,
},
{
.name = "swap.events",