diff options
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 737 |
1 files changed, 305 insertions, 432 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 29501f040568..a2c7bcb0e6eb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -80,7 +80,7 @@ int do_swap_account __read_mostly; #ifdef CONFIG_MEMCG_SWAP_ENABLED static int really_do_swap_account __initdata = 1; #else -static int really_do_swap_account __initdata = 0; +static int really_do_swap_account __initdata; #endif #else @@ -357,10 +357,9 @@ struct mem_cgroup { struct cg_proto tcp_mem; #endif #if defined(CONFIG_MEMCG_KMEM) - /* analogous to slab_common's slab_caches list. per-memcg */ + /* analogous to slab_common's slab_caches list, but per-memcg; + * protected by memcg_slab_mutex */ struct list_head memcg_slab_caches; - /* Not a spinlock, we can take a lot of time walking the list */ - struct mutex slab_caches_mutex; /* Index in the kmem_cache->memcg_params->memcg_caches array */ int kmemcg_id; #endif @@ -527,18 +526,14 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { - /* - * The ID of the root cgroup is 0, but memcg treat 0 as an - * invalid ID, so we return (cgroup_id + 1). 
- */ - return memcg->css.cgroup->id + 1; + return memcg->css.id; } static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) { struct cgroup_subsys_state *css; - css = css_from_id(id - 1, &memory_cgrp_subsys); + css = css_from_id(id, &memory_cgrp_subsys); return mem_cgroup_from_css(css); } @@ -571,7 +566,8 @@ void sock_update_memcg(struct sock *sk) memcg = mem_cgroup_from_task(current); cg_proto = sk->sk_prot->proto_cgroup(memcg); if (!mem_cgroup_is_root(memcg) && - memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) { + memcg_proto_active(cg_proto) && + css_tryget_online(&memcg->css)) { sk->sk_cgrp = cg_proto; } rcu_read_unlock(); @@ -677,9 +673,11 @@ static void disarm_static_keys(struct mem_cgroup *memcg) static void drain_all_stock_async(struct mem_cgroup *memcg); static struct mem_cgroup_per_zone * -mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) +mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) { - VM_BUG_ON((unsigned)nid >= nr_node_ids); + int nid = zone_to_nid(zone); + int zid = zone_idx(zone); + return &memcg->nodeinfo[nid]->zoneinfo[zid]; } @@ -689,12 +687,12 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) } static struct mem_cgroup_per_zone * -page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) +mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) { int nid = page_to_nid(page); int zid = page_zonenum(page); - return mem_cgroup_zoneinfo(memcg, nid, zid); + return &memcg->nodeinfo[nid]->zoneinfo[zid]; } static struct mem_cgroup_tree_per_zone * @@ -712,11 +710,9 @@ soft_limit_tree_from_page(struct page *page) return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; } -static void -__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz, - unsigned long long new_usage_in_excess) +static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, + struct 
mem_cgroup_tree_per_zone *mctz, + unsigned long long new_usage_in_excess) { struct rb_node **p = &mctz->rb_root.rb_node; struct rb_node *parent = NULL; @@ -746,10 +742,8 @@ __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, mz->on_tree = true; } -static void -__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) +static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz) { if (!mz->on_tree) return; @@ -757,13 +751,11 @@ __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, mz->on_tree = false; } -static void -mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) +static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz) { spin_lock(&mctz->lock); - __mem_cgroup_remove_exceeded(memcg, mz, mctz); + __mem_cgroup_remove_exceeded(mz, mctz); spin_unlock(&mctz->lock); } @@ -773,16 +765,14 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) unsigned long long excess; struct mem_cgroup_per_zone *mz; struct mem_cgroup_tree_per_zone *mctz; - int nid = page_to_nid(page); - int zid = page_zonenum(page); - mctz = soft_limit_tree_from_page(page); + mctz = soft_limit_tree_from_page(page); /* * Necessary to update all ancestors when hierarchy is used. * because their event counter is not touched. 
*/ for (; memcg; memcg = parent_mem_cgroup(memcg)) { - mz = mem_cgroup_zoneinfo(memcg, nid, zid); + mz = mem_cgroup_page_zoneinfo(memcg, page); excess = res_counter_soft_limit_excess(&memcg->res); /* * We have to update the tree if mz is on RB-tree or @@ -792,12 +782,12 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) spin_lock(&mctz->lock); /* if on-tree, remove it */ if (mz->on_tree) - __mem_cgroup_remove_exceeded(memcg, mz, mctz); + __mem_cgroup_remove_exceeded(mz, mctz); /* * Insert again. mz->usage_in_excess will be updated. * If excess is 0, no tree ops. */ - __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); + __mem_cgroup_insert_exceeded(mz, mctz, excess); spin_unlock(&mctz->lock); } } @@ -805,15 +795,15 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) { - int node, zone; - struct mem_cgroup_per_zone *mz; struct mem_cgroup_tree_per_zone *mctz; + struct mem_cgroup_per_zone *mz; + int nid, zid; - for_each_node(node) { - for (zone = 0; zone < MAX_NR_ZONES; zone++) { - mz = mem_cgroup_zoneinfo(memcg, node, zone); - mctz = soft_limit_tree_node_zone(node, zone); - mem_cgroup_remove_exceeded(memcg, mz, mctz); + for_each_node(nid) { + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; + mctz = soft_limit_tree_node_zone(nid, zid); + mem_cgroup_remove_exceeded(mz, mctz); } } } @@ -836,9 +826,9 @@ retry: * we will to add it back at the end of reclaim to its correct * position in the tree. 
*/ - __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); + __mem_cgroup_remove_exceeded(mz, mctz); if (!res_counter_soft_limit_excess(&mz->memcg->res) || - !css_tryget(&mz->memcg->css)) + !css_tryget_online(&mz->memcg->css)) goto retry; done: return mz; @@ -947,8 +937,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, __this_cpu_add(memcg->stat->nr_page_events, nr_pages); } -unsigned long -mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) +unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { struct mem_cgroup_per_zone *mz; @@ -956,46 +945,38 @@ mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) return mz->lru_size[lru]; } -static unsigned long -mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, - unsigned int lru_mask) -{ - struct mem_cgroup_per_zone *mz; - enum lru_list lru; - unsigned long ret = 0; - - mz = mem_cgroup_zoneinfo(memcg, nid, zid); - - for_each_lru(lru) { - if (BIT(lru) & lru_mask) - ret += mz->lru_size[lru]; - } - return ret; -} - -static unsigned long -mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid, unsigned int lru_mask) +static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, + int nid, + unsigned int lru_mask) { - u64 total = 0; + unsigned long nr = 0; int zid; - for (zid = 0; zid < MAX_NR_ZONES; zid++) - total += mem_cgroup_zone_nr_lru_pages(memcg, - nid, zid, lru_mask); + VM_BUG_ON((unsigned)nid >= nr_node_ids); - return total; + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + struct mem_cgroup_per_zone *mz; + enum lru_list lru; + + for_each_lru(lru) { + if (!(BIT(lru) & lru_mask)) + continue; + mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; + nr += mz->lru_size[lru]; + } + } + return nr; } static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, unsigned int lru_mask) { + unsigned long nr = 0; int nid; - u64 total = 0; for_each_node_state(nid, N_MEMORY) - total += 
mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); - return total; + nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); + return nr; } static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, @@ -1077,10 +1058,19 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) rcu_read_lock(); do { - memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); - if (unlikely(!memcg)) + /* + * Page cache insertions can happen without an + * actual mm context, e.g. during disk probing + * on boot, loopback IO, acct() writes etc. + */ + if (unlikely(!mm)) memcg = root_mem_cgroup; - } while (!css_tryget(&memcg->css)); + else { + memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!memcg)) + memcg = root_mem_cgroup; + } + } while (!css_tryget_online(&memcg->css)); rcu_read_unlock(); return memcg; } @@ -1117,7 +1107,8 @@ skip_node: */ if (next_css) { if ((next_css == &root->css) || - ((next_css->flags & CSS_ONLINE) && css_tryget(next_css))) + ((next_css->flags & CSS_ONLINE) && + css_tryget_online(next_css))) return mem_cgroup_from_css(next_css); prev_css = next_css; @@ -1163,7 +1154,7 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter, * would be returned all the time. 
*/ if (position && position != root && - !css_tryget(&position->css)) + !css_tryget_online(&position->css)) position = NULL; } return position; @@ -1234,11 +1225,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, int uninitialized_var(seq); if (reclaim) { - int nid = zone_to_nid(reclaim->zone); - int zid = zone_idx(reclaim->zone); struct mem_cgroup_per_zone *mz; - mz = mem_cgroup_zoneinfo(root, nid, zid); + mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); iter = &mz->reclaim_iter[reclaim->priority]; if (prev && reclaim->generation != iter->generation) { iter->last_visited = NULL; @@ -1345,7 +1334,7 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, goto out; } - mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone)); + mz = mem_cgroup_zone_zoneinfo(memcg, zone); lruvec = &mz->lruvec; out: /* @@ -1404,7 +1393,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup) pc->mem_cgroup = memcg = root_mem_cgroup; - mz = page_cgroup_zoneinfo(memcg, page); + mz = mem_cgroup_page_zoneinfo(memcg, page); lruvec = &mz->lruvec; out: /* @@ -1542,7 +1531,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) int mem_cgroup_swappiness(struct mem_cgroup *memcg) { /* root ? */ - if (!css_parent(&memcg->css)) + if (mem_cgroup_disabled() || !memcg->css.parent) return vm_swappiness; return memcg->swappiness; @@ -1586,23 +1575,12 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg) } /* - * 2 routines for checking "mem" is under move_account() or not. + * A routine for checking "mem" is under move_account() or not. * - * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This - * is used for avoiding races in accounting. If true, - * pc->mem_cgroup may be overwritten. - * - * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or - * under hierarchy of moving cgroups. 
This is for - * waiting at hith-memory prressure caused by "move". + * Checking a cgroup is mc.from or mc.to or under hierarchy of + * moving cgroups. This is for waiting at high-memory pressure + * caused by "move". */ - -static bool mem_cgroup_stolen(struct mem_cgroup *memcg) -{ - VM_BUG_ON(!rcu_read_lock_held()); - return atomic_read(&memcg->moving_account) > 0; -} - static bool mem_cgroup_under_move(struct mem_cgroup *memcg) { struct mem_cgroup *from; @@ -1645,7 +1623,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) * Take this lock when * - a code tries to modify page's memcg while it's USED. * - a code tries to modify page state accounting in a memcg. - * see mem_cgroup_stolen(), too. */ static void move_lock_mem_cgroup(struct mem_cgroup *memcg, unsigned long *flags) @@ -2280,12 +2257,11 @@ cleanup: } /* - * Currently used to update mapped file statistics, but the routine can be - * generalized to update other statistics as well. + * Used to update mapped file or writeback or other statistics. * * Notes: Race condition * - * We usually use page_cgroup_lock() for accessing page_cgroup member but + * We usually use lock_page_cgroup() for accessing page_cgroup member but * it tends to be costly. But considering some conditions, we doesn't need * to do so _always_. * @@ -2299,8 +2275,8 @@ cleanup: * by flags. * * Considering "move", this is an only case we see a race. To make the race - * small, we check mm->moving_account and detect there are possibility of race - * If there is, we take a lock. + * small, we check memcg->moving_account and detect whether there is a + * possibility of a race. If there is, we take a lock. */ void __mem_cgroup_begin_update_page_stat(struct page *page, @@ -2318,9 +2294,10 @@ again: * If this memory cgroup is not under account moving, we don't * need to take move_lock_mem_cgroup(). 
Because we already hold * rcu_read_lock(), any calls to move_account will be delayed until - * rcu_read_unlock() if mem_cgroup_stolen() == true. + * rcu_read_unlock(). */ - if (!mem_cgroup_stolen(memcg)) + VM_BUG_ON(!rcu_read_lock_held()); + if (atomic_read(&memcg->moving_account) <= 0) return; move_lock_mem_cgroup(memcg, flags); @@ -2428,7 +2405,7 @@ static void drain_stock(struct memcg_stock_pcp *stock) */ static void drain_local_stock(struct work_struct *dummy) { - struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); + struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock); drain_stock(stock); clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); } @@ -2675,7 +2652,8 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg, * free their memory. */ if (unlikely(test_thread_flag(TIF_MEMDIE) || - fatal_signal_pending(current))) + fatal_signal_pending(current) || + current->flags & PF_EXITING)) goto bypass; if (unlikely(task_in_memcg_oom(current))) @@ -2789,9 +2767,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, /* * A helper function to get mem_cgroup from ID. must be called under - * rcu_read_lock(). The caller is responsible for calling css_tryget if - * the mem_cgroup is used for charging. (dropping refcnt from swap can be - * called against removed memcg.) + * rcu_read_lock(). The caller is responsible for calling + * css_tryget_online() if the mem_cgroup is used for charging. (dropping + * refcnt from swap can be called against removed memcg.) 
*/ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) { @@ -2814,14 +2792,14 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { memcg = pc->mem_cgroup; - if (memcg && !css_tryget(&memcg->css)) + if (memcg && !css_tryget_online(&memcg->css)) memcg = NULL; } else if (PageSwapCache(page)) { ent.val = page_private(page); id = lookup_swap_cgroup_id(ent); rcu_read_lock(); memcg = mem_cgroup_lookup(id); - if (memcg && !css_tryget(&memcg->css)) + if (memcg && !css_tryget_online(&memcg->css)) memcg = NULL; rcu_read_unlock(); } @@ -2903,6 +2881,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, static DEFINE_MUTEX(set_limit_mutex); #ifdef CONFIG_MEMCG_KMEM +/* + * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or + * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. + */ +static DEFINE_MUTEX(memcg_slab_mutex); + static DEFINE_MUTEX(activate_kmem_mutex); static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) @@ -2935,10 +2919,10 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) print_slabinfo_header(m); - mutex_lock(&memcg->slab_caches_mutex); + mutex_lock(&memcg_slab_mutex); list_for_each_entry(params, &memcg->memcg_slab_caches, list) cache_show(memcg_params_to_cache(params), m); - mutex_unlock(&memcg->slab_caches_mutex); + mutex_unlock(&memcg_slab_mutex); return 0; } @@ -3040,8 +3024,6 @@ void memcg_update_array_size(int num) memcg_limited_groups_array_size = memcg_caches_array_size(num); } -static void kmem_cache_destroy_work_func(struct work_struct *w); - int memcg_update_cache_size(struct kmem_cache *s, int num_groups) { struct memcg_cache_params *cur_params = s->memcg_params; @@ -3094,29 +3076,6 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) return 0; } -char *memcg_create_cache_name(struct mem_cgroup *memcg, - struct kmem_cache *root_cache) -{ - static char *buf = NULL; - - 
/* - * We need a mutex here to protect the shared buffer. Since this is - * expected to be called only on cache creation, we can employ the - * slab_mutex for that purpose. - */ - lockdep_assert_held(&slab_mutex); - - if (!buf) { - buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); - if (!buf) - return NULL; - } - - cgroup_name(memcg->css.cgroup, buf, NAME_MAX + 1); - return kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, - memcg_cache_id(memcg), buf); -} - int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, struct kmem_cache *root_cache) { @@ -3138,8 +3097,6 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, if (memcg) { s->memcg_params->memcg = memcg; s->memcg_params->root_cache = root_cache; - INIT_WORK(&s->memcg_params->destroy, - kmem_cache_destroy_work_func); css_get(&memcg->css); } else s->memcg_params->is_root_cache = true; @@ -3156,24 +3113,37 @@ void memcg_free_cache_params(struct kmem_cache *s) kfree(s->memcg_params); } -void memcg_register_cache(struct kmem_cache *s) +static void memcg_register_cache(struct mem_cgroup *memcg, + struct kmem_cache *root_cache) { - struct kmem_cache *root; - struct mem_cgroup *memcg; + static char memcg_name_buf[NAME_MAX + 1]; /* protected by + memcg_slab_mutex */ + struct kmem_cache *cachep; int id; - if (is_root_cache(s)) + lockdep_assert_held(&memcg_slab_mutex); + + id = memcg_cache_id(memcg); + + /* + * Since per-memcg caches are created asynchronously on first + * allocation (see memcg_kmem_get_cache()), several threads can try to + * create the same cache, but only one of them may succeed. + */ + if (cache_from_memcg_idx(root_cache, id)) return; + cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); + cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf); /* - * Holding the slab_mutex assures nobody will touch the memcg_caches - * array while we are modifying it. 
+ * If we could not create a memcg cache, do not complain, because + * that's not critical at all as we can always proceed with the root + * cache. */ - lockdep_assert_held(&slab_mutex); + if (!cachep) + return; - root = s->memcg_params->root_cache; - memcg = s->memcg_params->memcg; - id = memcg_cache_id(memcg); + list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); /* * Since readers won't lock (see cache_from_memcg_idx()), we need a @@ -3182,49 +3152,30 @@ void memcg_register_cache(struct kmem_cache *s) */ smp_wmb(); - /* - * Initialize the pointer to this cache in its parent's memcg_params - * before adding it to the memcg_slab_caches list, otherwise we can - * fail to convert memcg_params_to_cache() while traversing the list. - */ - VM_BUG_ON(root->memcg_params->memcg_caches[id]); - root->memcg_params->memcg_caches[id] = s; - - mutex_lock(&memcg->slab_caches_mutex); - list_add(&s->memcg_params->list, &memcg->memcg_slab_caches); - mutex_unlock(&memcg->slab_caches_mutex); + BUG_ON(root_cache->memcg_params->memcg_caches[id]); + root_cache->memcg_params->memcg_caches[id] = cachep; } -void memcg_unregister_cache(struct kmem_cache *s) +static void memcg_unregister_cache(struct kmem_cache *cachep) { - struct kmem_cache *root; + struct kmem_cache *root_cache; struct mem_cgroup *memcg; int id; - if (is_root_cache(s)) - return; + lockdep_assert_held(&memcg_slab_mutex); - /* - * Holding the slab_mutex assures nobody will touch the memcg_caches - * array while we are modifying it. 
- */ - lockdep_assert_held(&slab_mutex); + BUG_ON(is_root_cache(cachep)); - root = s->memcg_params->root_cache; - memcg = s->memcg_params->memcg; + root_cache = cachep->memcg_params->root_cache; + memcg = cachep->memcg_params->memcg; id = memcg_cache_id(memcg); - mutex_lock(&memcg->slab_caches_mutex); - list_del(&s->memcg_params->list); - mutex_unlock(&memcg->slab_caches_mutex); + BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); + root_cache->memcg_params->memcg_caches[id] = NULL; - /* - * Clear the pointer to this cache in its parent's memcg_params only - * after removing it from the memcg_slab_caches list, otherwise we can - * fail to convert memcg_params_to_cache() while traversing the list. - */ - VM_BUG_ON(root->memcg_params->memcg_caches[id] != s); - root->memcg_params->memcg_caches[id] = NULL; + list_del(&cachep->memcg_params->list); + + kmem_cache_destroy(cachep); } /* @@ -3258,144 +3209,61 @@ static inline void memcg_resume_kmem_account(void) current->memcg_kmem_skip_account--; } -static void kmem_cache_destroy_work_func(struct work_struct *w) -{ - struct kmem_cache *cachep; - struct memcg_cache_params *p; - - p = container_of(w, struct memcg_cache_params, destroy); - - cachep = memcg_params_to_cache(p); - - /* - * If we get down to 0 after shrink, we could delete right away. - * However, memcg_release_pages() already puts us back in the workqueue - * in that case. If we proceed deleting, we'll get a dangling - * reference, and removing the object from the workqueue in that case - * is unnecessary complication. We are not a fast path. - * - * Note that this case is fundamentally different from racing with - * shrink_slab(): if memcg_cgroup_destroy_cache() is called in - * kmem_cache_shrink, not only we would be reinserting a dead cache - * into the queue, but doing so from inside the worker racing to - * destroy it. 
- * - * So if we aren't down to zero, we'll just schedule a worker and try - * again - */ - if (atomic_read(&cachep->memcg_params->nr_pages) != 0) - kmem_cache_shrink(cachep); - else - kmem_cache_destroy(cachep); -} - -void mem_cgroup_destroy_cache(struct kmem_cache *cachep) -{ - if (!cachep->memcg_params->dead) - return; - - /* - * There are many ways in which we can get here. - * - * We can get to a memory-pressure situation while the delayed work is - * still pending to run. The vmscan shrinkers can then release all - * cache memory and get us to destruction. If this is the case, we'll - * be executed twice, which is a bug (the second time will execute over - * bogus data). In this case, cancelling the work should be fine. - * - * But we can also get here from the worker itself, if - * kmem_cache_shrink is enough to shake all the remaining objects and - * get the page count to 0. In this case, we'll deadlock if we try to - * cancel the work (the worker runs with an internal lock held, which - * is the same lock we would hold for cancel_work_sync().) - * - * Since we can't possibly know who got us here, just refrain from - * running if there is already work pending - */ - if (work_pending(&cachep->memcg_params->destroy)) - return; - /* - * We have to defer the actual destroying to a workqueue, because - * we might currently be in a context that cannot sleep. - */ - schedule_work(&cachep->memcg_params->destroy); -} - -int __kmem_cache_destroy_memcg_children(struct kmem_cache *s) +int __memcg_cleanup_cache_params(struct kmem_cache *s) { struct kmem_cache *c; int i, failed = 0; - /* - * If the cache is being destroyed, we trust that there is no one else - * requesting objects from it. Even if there are, the sanity checks in - * kmem_cache_destroy should caught this ill-case. - * - * Still, we don't want anyone else freeing memcg_caches under our - * noses, which can happen if a new memcg comes to life. 
As usual, - * we'll take the activate_kmem_mutex to protect ourselves against - * this. - */ - mutex_lock(&activate_kmem_mutex); + mutex_lock(&memcg_slab_mutex); for_each_memcg_cache_index(i) { c = cache_from_memcg_idx(s, i); if (!c) continue; - /* - * We will now manually delete the caches, so to avoid races - * we need to cancel all pending destruction workers and - * proceed with destruction ourselves. - * - * kmem_cache_destroy() will call kmem_cache_shrink internally, - * and that could spawn the workers again: it is likely that - * the cache still have active pages until this very moment. - * This would lead us back to mem_cgroup_destroy_cache. - * - * But that will not execute at all if the "dead" flag is not - * set, so flip it down to guarantee we are in control. - */ - c->memcg_params->dead = false; - cancel_work_sync(&c->memcg_params->destroy); - kmem_cache_destroy(c); + memcg_unregister_cache(c); if (cache_from_memcg_idx(s, i)) failed++; } - mutex_unlock(&activate_kmem_mutex); + mutex_unlock(&memcg_slab_mutex); return failed; } -static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +static void memcg_unregister_all_caches(struct mem_cgroup *memcg) { struct kmem_cache *cachep; - struct memcg_cache_params *params; + struct memcg_cache_params *params, *tmp; if (!memcg_kmem_is_active(memcg)) return; - mutex_lock(&memcg->slab_caches_mutex); - list_for_each_entry(params, &memcg->memcg_slab_caches, list) { + mutex_lock(&memcg_slab_mutex); + list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { cachep = memcg_params_to_cache(params); - cachep->memcg_params->dead = true; - schedule_work(&cachep->memcg_params->destroy); + kmem_cache_shrink(cachep); + if (atomic_read(&cachep->memcg_params->nr_pages) == 0) + memcg_unregister_cache(cachep); } - mutex_unlock(&memcg->slab_caches_mutex); + mutex_unlock(&memcg_slab_mutex); } -struct create_work { +struct memcg_register_cache_work { struct mem_cgroup *memcg; struct kmem_cache *cachep; 
struct work_struct work; }; -static void memcg_create_cache_work_func(struct work_struct *w) +static void memcg_register_cache_func(struct work_struct *w) { - struct create_work *cw = container_of(w, struct create_work, work); + struct memcg_register_cache_work *cw = + container_of(w, struct memcg_register_cache_work, work); struct mem_cgroup *memcg = cw->memcg; struct kmem_cache *cachep = cw->cachep; - kmem_cache_create_memcg(memcg, cachep); + mutex_lock(&memcg_slab_mutex); + memcg_register_cache(memcg, cachep); + mutex_unlock(&memcg_slab_mutex); + css_put(&memcg->css); kfree(cw); } @@ -3403,12 +3271,12 @@ static void memcg_create_cache_work_func(struct work_struct *w) /* * Enqueue the creation of a per-memcg kmem_cache. */ -static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { - struct create_work *cw; + struct memcg_register_cache_work *cw; - cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); + cw = kmalloc(sizeof(*cw), GFP_NOWAIT); if (cw == NULL) { css_put(&memcg->css); return; @@ -3417,17 +3285,17 @@ static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, cw->memcg = memcg; cw->cachep = cachep; - INIT_WORK(&cw->work, memcg_create_cache_work_func); + INIT_WORK(&cw->work, memcg_register_cache_func); schedule_work(&cw->work); } -static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void memcg_schedule_register_cache(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { /* * We need to stop accounting when we kmalloc, because if the * corresponding kmalloc cache is not yet created, the first allocation - * in __memcg_create_cache_enqueue will recurse. + * in __memcg_schedule_register_cache will recurse. * * However, it is better to enclose the whole function. 
Depending on * the debugging options enabled, INIT_WORK(), for instance, can @@ -3436,9 +3304,27 @@ static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, * the safest choice is to do it like this, wrapping the whole function. */ memcg_stop_kmem_account(); - __memcg_create_cache_enqueue(memcg, cachep); + __memcg_schedule_register_cache(memcg, cachep); memcg_resume_kmem_account(); } + +int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) +{ + int res; + + res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, + PAGE_SIZE << order); + if (!res) + atomic_add(1 << order, &cachep->memcg_params->nr_pages); + return res; +} + +void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) +{ + memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order); + atomic_sub(1 << order, &cachep->memcg_params->nr_pages); +} + /* * Return the kmem_cache we're supposed to use for a slab allocation. * We try to use the current memcg's version of the cache. @@ -3477,7 +3363,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, } /* The corresponding put will be done in the workqueue. */ - if (!css_tryget(&memcg->css)) + if (!css_tryget_online(&memcg->css)) goto out; rcu_read_unlock(); @@ -3489,22 +3375,16 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, * * However, there are some clashes that can arrive from locking. * For instance, because we acquire the slab_mutex while doing - * kmem_cache_dup, this means no further allocation could happen - * with the slab_mutex held. - * - * Also, because cache creation issue get_online_cpus(), this - * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, - * that ends up reversed during cpu hotplug. (cpuset allocates - * a bunch of GFP_KERNEL memory during cpuup). Due to all that, - * better to defer everything. + * memcg_create_kmem_cache, this means no further allocation + * could happen with the slab_mutex held. 
So it's better to + * defer everything. */ - memcg_create_cache_enqueue(memcg, cachep); + memcg_schedule_register_cache(memcg, cachep); return cachep; out: rcu_read_unlock(); return cachep; } -EXPORT_SYMBOL(__memcg_kmem_get_cache); /* * We need to verify if the allocation against current->mm->owner's memcg is @@ -3531,11 +3411,12 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) /* * Disabling accounting is only relevant for some specific memcg * internal allocations. Therefore we would initially not have such - * check here, since direct calls to the page allocator that are marked - * with GFP_KMEMCG only happen outside memcg core. We are mostly - * concerned with cache allocations, and by having this test at - * memcg_kmem_get_cache, we are already able to relay the allocation to - * the root cache and bypass the memcg cache altogether. + * check here, since direct calls to the page allocator that are + * accounted to kmemcg (alloc_kmem_pages and friends) only happen + * outside memcg core. We are mostly concerned with cache allocations, + * and by having this test at memcg_kmem_get_cache, we are already able + * to relay the allocation to the root cache and bypass the memcg cache + * altogether. * * There is one exception, though: the SLUB allocator does not create * large order caches, but rather service large kmallocs directly from @@ -3622,7 +3503,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) memcg_uncharge_kmem(memcg, PAGE_SIZE << order); } #else -static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) { } #endif /* CONFIG_MEMCG_KMEM */ @@ -3958,17 +3839,9 @@ int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, return 0; } - /* - * Page cache insertions can happen without an actual mm - * context, e.g. during disk probing on boot. 
- */ - if (unlikely(!mm)) - memcg = root_mem_cgroup; - else { - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); - if (!memcg) - return -ENOMEM; - } + memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); + if (!memcg) + return -ENOMEM; __mem_cgroup_commit_charge(memcg, page, 1, type, false); return 0; } @@ -4250,8 +4123,8 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) memcg = mem_cgroup_lookup(id); if (memcg) { /* - * We uncharge this because swap is freed. - * This memcg can be obsolete one. We avoid calling css_tryget + * We uncharge this because swap is freed. This memcg can + * be obsolete one. We avoid calling css_tryget_online(). */ if (!mem_cgroup_is_root(memcg)) res_counter_uncharge(&memcg->memsw, PAGE_SIZE); @@ -4705,7 +4578,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, break; } while (1); } - __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); + __mem_cgroup_remove_exceeded(mz, mctz); excess = res_counter_soft_limit_excess(&mz->memcg->res); /* * One school of thought says that we should not add @@ -4716,7 +4589,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, * term TODO. */ /* If excess == 0, no tree ops */ - __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); + __mem_cgroup_insert_exceeded(mz, mctz, excess); spin_unlock(&mctz->lock); css_put(&mz->memcg->css); loop++; @@ -4783,9 +4656,9 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, if (mem_cgroup_move_parent(page, pc, memcg)) { /* found lock contention or "pc" is obsolete. */ busy = page; - cond_resched(); } else busy = NULL; + cond_resched(); } while (!list_empty(list)); } @@ -4836,18 +4709,28 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) } while (usage > 0); } +/* + * Test whether @memcg has children, dead or alive. 
Note that this + function doesn't care whether @memcg has use_hierarchy enabled and + returns %true if there are child csses according to the cgroup + hierarchy. Testing use_hierarchy is the caller's responsibility. + */ static inline bool memcg_has_children(struct mem_cgroup *memcg) { - lockdep_assert_held(&memcg_create_mutex); + bool ret; + /* - * The lock does not prevent addition or deletion to the list - * of children, but it prevents a new child from being - * initialized based on this parent in css_online(), so it's - * enough to decide whether hierarchically inherited - * attributes can still be changed or not. + * The lock does not prevent addition or deletion of children, but + * it prevents a new child from being initialized based on this + * parent in css_online(), so it's enough to decide whether + * hierarchically inherited attributes can still be changed or not. */ - return memcg->use_hierarchy && - !list_empty(&memcg->css.cgroup->children); + lockdep_assert_held(&memcg_create_mutex); + + rcu_read_lock(); + ret = css_next_child(NULL, &memcg->css); + rcu_read_unlock(); + return ret; } /* @@ -4859,11 +4742,6 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg) static int mem_cgroup_force_empty(struct mem_cgroup *memcg) { int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; - struct cgroup *cgrp = memcg->css.cgroup; - - /* returns EBUSY if there is a task or if we come here twice.
*/ - if (cgroup_has_tasks(cgrp) || !list_empty(&cgrp->children)) - return -EBUSY; /* we call try-to-free pages for make this cgroup empty */ lru_add_drain_all(); @@ -4883,20 +4761,19 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) } } - lru_add_drain(); - mem_cgroup_reparent_charges(memcg); return 0; } -static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css, - unsigned int event) +static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); if (mem_cgroup_is_root(memcg)) return -EINVAL; - return mem_cgroup_force_empty(memcg); + return mem_cgroup_force_empty(memcg) ?: nbytes; } static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, @@ -4910,7 +4787,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, { int retval = 0; struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css)); + struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); mutex_lock(&memcg_create_mutex); @@ -4927,7 +4804,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, */ if ((!parent_memcg || !parent_memcg->use_hierarchy) && (val == 1 || val == 0)) { - if (list_empty(&memcg->css.cgroup->children)) + if (!memcg_has_children(memcg)) memcg->use_hierarchy = val; else retval = -EBUSY; @@ -5044,7 +4921,8 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg, * of course permitted. 
*/ mutex_lock(&memcg_create_mutex); - if (cgroup_has_tasks(memcg->css.cgroup) || memcg_has_children(memcg)) + if (cgroup_has_tasks(memcg->css.cgroup) || + (memcg->use_hierarchy && memcg_has_children(memcg))) err = -EBUSY; mutex_unlock(&memcg_create_mutex); if (err) @@ -5061,13 +4939,14 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg, * Make sure we have enough space for this cgroup in each root cache's * memcg_params. */ + mutex_lock(&memcg_slab_mutex); err = memcg_update_all_caches(memcg_id + 1); + mutex_unlock(&memcg_slab_mutex); if (err) goto out_rmid; memcg->kmemcg_id = memcg_id; INIT_LIST_HEAD(&memcg->memcg_slab_caches); - mutex_init(&memcg->slab_caches_mutex); /* * We couldn't have accounted to this cgroup, because it hasn't got the @@ -5145,17 +5024,18 @@ static int memcg_update_kmem_limit(struct mem_cgroup *memcg, * The user of this function is... * RES_LIMIT. */ -static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, - char *buffer) +static ssize_t mem_cgroup_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); enum res_type type; int name; unsigned long long val; int ret; - type = MEMFILE_TYPE(cft->private); - name = MEMFILE_ATTR(cft->private); + buf = strstrip(buf); + type = MEMFILE_TYPE(of_cft(of)->private); + name = MEMFILE_ATTR(of_cft(of)->private); switch (name) { case RES_LIMIT: @@ -5164,7 +5044,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, break; } /* This function does all necessary parse...reuse it */ - ret = res_counter_memparse_write_strategy(buffer, &val); + ret = res_counter_memparse_write_strategy(buf, &val); if (ret) break; if (type == _MEM) @@ -5177,7 +5057,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, return -EINVAL; break; case RES_SOFT_LIMIT: - ret = 
res_counter_memparse_write_strategy(buffer, &val); + ret = res_counter_memparse_write_strategy(buf, &val); if (ret) break; /* @@ -5194,7 +5074,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, ret = -EINVAL; /* should be BUG() ? */ break; } - return ret; + return ret ?: nbytes; } static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, @@ -5207,8 +5087,8 @@ static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, if (!memcg->use_hierarchy) goto out; - while (css_parent(&memcg->css)) { - memcg = mem_cgroup_from_css(css_parent(&memcg->css)); + while (memcg->css.parent) { + memcg = mem_cgroup_from_css(memcg->css.parent); if (!memcg->use_hierarchy) break; tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); @@ -5221,14 +5101,15 @@ out: *memsw_limit = min_memsw_limit; } -static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) +static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); int name; enum res_type type; - type = MEMFILE_TYPE(event); - name = MEMFILE_ATTR(event); + type = MEMFILE_TYPE(of_cft(of)->private); + name = MEMFILE_ATTR(of_cft(of)->private); switch (name) { case RES_MAX_USAGE: @@ -5253,7 +5134,7 @@ static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) break; } - return 0; + return nbytes; } static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, @@ -5412,7 +5293,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) for_each_online_node(nid) for (zid = 0; zid < MAX_NR_ZONES; zid++) { - mz = mem_cgroup_zoneinfo(memcg, nid, zid); + mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; rstat = &mz->lruvec.reclaim_stat; recent_rotated[0] += rstat->recent_rotated[0]; @@ -5442,22 +5323,14 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, struct cftype *cft, 
u64 val) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); - - if (val > 100 || !parent) - return -EINVAL; - - mutex_lock(&memcg_create_mutex); - /* If under hierarchy, only empty-root can set this value */ - if ((parent->use_hierarchy) || memcg_has_children(memcg)) { - mutex_unlock(&memcg_create_mutex); + if (val > 100) return -EINVAL; - } - - memcg->swappiness = val; - mutex_unlock(&memcg_create_mutex); + if (css->parent) + memcg->swappiness = val; + else + vm_swappiness = val; return 0; } @@ -5789,22 +5662,15 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); /* cannot set to root cgroup and only 0 and 1 are allowed */ - if (!parent || !((val == 0) || (val == 1))) + if (!css->parent || !((val == 0) || (val == 1))) return -EINVAL; - mutex_lock(&memcg_create_mutex); - /* oom-kill-disable is a flag for subhierarchy. */ - if ((parent->use_hierarchy) || memcg_has_children(memcg)) { - mutex_unlock(&memcg_create_mutex); - return -EINVAL; - } memcg->oom_kill_disable = val; if (!val) memcg_oom_recover(memcg); - mutex_unlock(&memcg_create_mutex); + return 0; } @@ -5844,10 +5710,10 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) * which is then paired with css_put during uncharge resp. here. * * Although this might sound strange as this path is called from - * css_offline() when the referencemight have dropped down to 0 - * and shouldn't be incremented anymore (css_tryget would fail) - * we do not have other options because of the kmem allocations - * lifetime. + * css_offline() when the reference might have dropped down to 0 and + * shouldn't be incremented anymore (css_tryget_online() would + * fail) we do not have other options because of the kmem + * allocations lifetime.
*/ css_get(&memcg->css); @@ -5966,9 +5832,10 @@ static void memcg_event_ptable_queue_proc(struct file *file, * Input must be in format '<event_fd> <control_fd> <args>'. * Interpretation of args is defined by control file implementation. */ -static int memcg_write_event_control(struct cgroup_subsys_state *css, - struct cftype *cft, char *buffer) +static ssize_t memcg_write_event_control(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { + struct cgroup_subsys_state *css = of_css(of); struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_event *event; struct cgroup_subsys_state *cfile_css; @@ -5979,15 +5846,17 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css, char *endp; int ret; - efd = simple_strtoul(buffer, &endp, 10); + buf = strstrip(buf); + + efd = simple_strtoul(buf, &endp, 10); if (*endp != ' ') return -EINVAL; - buffer = endp + 1; + buf = endp + 1; - cfd = simple_strtoul(buffer, &endp, 10); + cfd = simple_strtoul(buf, &endp, 10); if ((*endp != ' ') && (*endp != '\0')) return -EINVAL; - buffer = endp + 1; + buf = endp + 1; event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) @@ -6055,8 +5924,8 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css, * automatically removed on cgroup destruction but the removal is * asynchronous, so take an extra ref on @css. 
*/ - cfile_css = css_tryget_from_dir(cfile.file->f_dentry->d_parent, - &memory_cgrp_subsys); + cfile_css = css_tryget_online_from_dir(cfile.file->f_dentry->d_parent, + &memory_cgrp_subsys); ret = -EINVAL; if (IS_ERR(cfile_css)) goto out_put_cfile; @@ -6065,7 +5934,7 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css, goto out_put_cfile; } - ret = event->register_event(memcg, event->eventfd, buffer); + ret = event->register_event(memcg, event->eventfd, buf); if (ret) goto out_put_css; @@ -6078,7 +5947,7 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css, fdput(cfile); fdput(efile); - return 0; + return nbytes; out_put_css: css_put(css); @@ -6103,25 +5972,25 @@ static struct cftype mem_cgroup_files[] = { { .name = "max_usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, { .name = "limit_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), - .write_string = mem_cgroup_write, + .write = mem_cgroup_write, .read_u64 = mem_cgroup_read_u64, }, { .name = "soft_limit_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), - .write_string = mem_cgroup_write, + .write = mem_cgroup_write, .read_u64 = mem_cgroup_read_u64, }, { .name = "failcnt", .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, { @@ -6130,7 +5999,7 @@ static struct cftype mem_cgroup_files[] = { }, { .name = "force_empty", - .trigger = mem_cgroup_force_empty_write, + .write = mem_cgroup_force_empty_write, }, { .name = "use_hierarchy", @@ -6140,7 +6009,7 @@ static struct cftype mem_cgroup_files[] = { }, { .name = "cgroup.event_control", /* XXX: for compat */ - .write_string = memcg_write_event_control, + .write = memcg_write_event_control, .flags = CFTYPE_NO_PREFIX, .mode = S_IWUGO, }, @@ -6173,7 +6042,7 @@ static struct cftype mem_cgroup_files[] = { { .name = 
"kmem.limit_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), - .write_string = mem_cgroup_write, + .write = mem_cgroup_write, .read_u64 = mem_cgroup_read_u64, }, { @@ -6184,13 +6053,13 @@ static struct cftype mem_cgroup_files[] = { { .name = "kmem.failcnt", .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, { .name = "kmem.max_usage_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, #ifdef CONFIG_SLABINFO @@ -6213,19 +6082,19 @@ static struct cftype memsw_cgroup_files[] = { { .name = "memsw.max_usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, { .name = "memsw.limit_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), - .write_string = mem_cgroup_write, + .write = mem_cgroup_write, .read_u64 = mem_cgroup_read_u64, }, { .name = "memsw.failcnt", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, { }, /* terminate */ @@ -6403,9 +6272,9 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); + struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); - if (css->cgroup->id > MEM_CGROUP_ID_MAX) + if (css->id > MEM_CGROUP_ID_MAX) return -ENOSPC; if (!parent) @@ -6490,7 +6359,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) css_for_each_descendant_post(iter, css) mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); - mem_cgroup_destroy_all_caches(memcg); + memcg_unregister_all_caches(memcg); vmpressure_cleanup(&memcg->vmpressure); } @@ -6500,7 +6369,7 @@ static void 
mem_cgroup_css_free(struct cgroup_subsys_state *css) /* * XXX: css_offline() would be where we should reparent all * memory to prepare the cgroup for destruction. However, - * memcg does not do css_tryget() and res_counter charging + * memcg does not do css_tryget_online() and res_counter charging * under the same RCU lock region, which means that charging * could race with offlining. Offlining only happens to * cgroups with no tasks in them but charges can show up @@ -6514,9 +6383,9 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) * lookup_swap_cgroup_id() * rcu_read_lock() * mem_cgroup_lookup() - * css_tryget() + * css_tryget_online() * rcu_read_unlock() - * disable css_tryget() + * disable css_tryget_online() * call_rcu() * offline_css() * reparent_charges() @@ -6686,16 +6555,20 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, pgoff = pte_to_pgoff(ptent); /* page is moved even if it's not RSS of this task(page-faulted). */ - page = find_get_page(mapping, pgoff); - #ifdef CONFIG_SWAP /* shmem/tmpfs may report page out on swap: account for that too. */ - if (radix_tree_exceptional_entry(page)) { - swp_entry_t swap = radix_to_swp_entry(page); - if (do_swap_account) - *entry = swap; - page = find_get_page(swap_address_space(swap), swap.val); - } + if (shmem_mapping(mapping)) { + page = find_get_entry(mapping, pgoff); + if (radix_tree_exceptional_entry(page)) { + swp_entry_t swp = radix_to_swp_entry(page); + if (do_swap_account) + *entry = swp; + page = find_get_page(swap_address_space(swp), swp.val); + } + } else + page = find_get_page(mapping, pgoff); +#else + page = find_get_page(mapping, pgoff); #endif return page; } |