summaryrefslogtreecommitdiffstats
path: root/kernel/cgroup.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--kernel/cgroup.c956
1 files changed, 751 insertions, 205 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3f1ca934a237..9db1a9629a5c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -26,6 +26,8 @@
* distribution for more details.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/cgroup.h>
#include <linux/cred.h>
#include <linux/ctype.h>
@@ -98,6 +100,12 @@ static DECLARE_RWSEM(css_set_rwsem);
#endif
/*
+ * Protects cgroup_idr and css_idr so that IDs can be released without
+ * grabbing cgroup_mutex.
+ */
+static DEFINE_SPINLOCK(cgroup_idr_lock);
+
+/*
* Protects cgroup_subsys->release_agent_path. Modifying it also requires
* cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
*/
@@ -179,13 +187,46 @@ static struct cftype cgroup_base_files[];
static void cgroup_put(struct cgroup *cgrp);
static int rebind_subsystems(struct cgroup_root *dst_root,
- unsigned long ss_mask);
+ unsigned int ss_mask);
static void cgroup_destroy_css_killed(struct cgroup *cgrp);
static int cgroup_destroy_locked(struct cgroup *cgrp);
+static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss);
+static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
+/* IDR wrappers which synchronize using cgroup_idr_lock */
+static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
+ gfp_t gfp_mask)
+{
+ int ret;
+
+ idr_preload(gfp_mask);
+ spin_lock(&cgroup_idr_lock);
+ ret = idr_alloc(idr, ptr, start, end, gfp_mask);
+ spin_unlock(&cgroup_idr_lock);
+ idr_preload_end();
+ return ret;
+}
+
+static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
+{
+ void *ret;
+
+ spin_lock(&cgroup_idr_lock);
+ ret = idr_replace(idr, ptr, id);
+ spin_unlock(&cgroup_idr_lock);
+ return ret;
+}
+
+static void cgroup_idr_remove(struct idr *idr, int id)
+{
+ spin_lock(&cgroup_idr_lock);
+ idr_remove(idr, id);
+ spin_unlock(&cgroup_idr_lock);
+}
+
/**
* cgroup_css - obtain a cgroup's css for the specified subsystem
* @cgrp: the cgroup of interest
@@ -208,6 +249,34 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
return &cgrp->dummy_css;
}
+/**
+ * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest (%NULL returns the dummy_css)
+ *
+ * Similar to cgroup_css() but returns the effctive css, which is defined
+ * as the matching css of the nearest ancestor including self which has @ss
+ * enabled. If @ss is associated with the hierarchy @cgrp is on, this
+ * function is guaranteed to return non-NULL css.
+ */
+static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!ss)
+ return &cgrp->dummy_css;
+
+ if (!(cgrp->root->subsys_mask & (1 << ss->id)))
+ return NULL;
+
+ while (cgrp->parent &&
+ !(cgrp->parent->child_subsys_mask & (1 << ss->id)))
+ cgrp = cgrp->parent;
+
+ return cgroup_css(cgrp, ss);
+}
+
/* convenient tests for these bits */
static inline bool cgroup_is_dead(const struct cgroup *cgrp)
{
@@ -273,7 +342,7 @@ static int notify_on_release(const struct cgroup *cgrp)
* @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
* @cgrp: the target cgroup to iterate css's of
*
- * Should be called under cgroup_mutex.
+ * Should be called under cgroup_[tree_]mutex.
*/
#define for_each_css(css, ssid, cgrp) \
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
@@ -284,6 +353,20 @@ static int notify_on_release(const struct cgroup *cgrp)
else
/**
+ * for_each_e_css - iterate all effective css's of a cgroup
+ * @css: the iteration cursor
+ * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
+ * @cgrp: the target cgroup to iterate css's of
+ *
+ * Should be called under cgroup_[tree_]mutex.
+ */
+#define for_each_e_css(css, ssid, cgrp) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
+ if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
+ ; \
+ else
+
+/**
* for_each_subsys - iterate all enabled cgroup subsystems
* @ss: the iteration cursor
* @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
@@ -296,6 +379,14 @@ static int notify_on_release(const struct cgroup *cgrp)
#define for_each_root(root) \
list_for_each_entry((root), &cgroup_roots, root_list)
+/* iterate over child cgrps, lock should be held throughout iteration */
+#define cgroup_for_each_live_child(child, cgrp) \
+ list_for_each_entry((child), &(cgrp)->children, sibling) \
+ if (({ lockdep_assert_held(&cgroup_tree_mutex); \
+ cgroup_is_dead(child); })) \
+ ; \
+ else
+
/**
* cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
* @cgrp: the cgroup to be checked for liveness
@@ -359,6 +450,43 @@ struct css_set init_css_set = {
static int css_set_count = 1; /* 1 for init_css_set */
+/**
+ * cgroup_update_populated - updated populated count of a cgroup
+ * @cgrp: the target cgroup
+ * @populated: inc or dec populated count
+ *
+ * @cgrp is either getting the first task (css_set) or losing the last.
+ * Update @cgrp->populated_cnt accordingly. The count is propagated
+ * towards root so that a given cgroup's populated_cnt is zero iff the
+ * cgroup and all its descendants are empty.
+ *
+ * @cgrp's interface file "cgroup.populated" is zero if
+ * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
+ * changes from or to zero, userland is notified that the content of the
+ * interface file has changed. This can be used to detect when @cgrp and
+ * its descendants become populated or empty.
+ */
+static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
+{
+ lockdep_assert_held(&css_set_rwsem);
+
+ do {
+ bool trigger;
+
+ if (populated)
+ trigger = !cgrp->populated_cnt++;
+ else
+ trigger = !--cgrp->populated_cnt;
+
+ if (!trigger)
+ break;
+
+ if (cgrp->populated_kn)
+ kernfs_notify(cgrp->populated_kn);
+ cgrp = cgrp->parent;
+ } while (cgrp);
+}
+
/*
* hash table for cgroup groups. This improves the performance to find
* an existing css_set. This hash doesn't (currently) take into
@@ -383,6 +511,8 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
static void put_css_set_locked(struct css_set *cset, bool taskexit)
{
struct cgrp_cset_link *link, *tmp_link;
+ struct cgroup_subsys *ss;
+ int ssid;
lockdep_assert_held(&css_set_rwsem);
@@ -390,6 +520,8 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
return;
/* This css_set is dead. unlink it and release cgroup refcounts */
+ for_each_subsys(ss, ssid)
+ list_del(&cset->e_cset_node[ssid]);
hash_del(&cset->hlist);
css_set_count--;
@@ -400,10 +532,13 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
list_del(&link->cgrp_link);
/* @cgrp can't go away while we're holding css_set_rwsem */
- if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
- if (taskexit)
- set_bit(CGRP_RELEASABLE, &cgrp->flags);
- check_for_release(cgrp);
+ if (list_empty(&cgrp->cset_links)) {
+ cgroup_update_populated(cgrp, false);
+ if (notify_on_release(cgrp)) {
+ if (taskexit)
+ set_bit(CGRP_RELEASABLE, &cgrp->flags);
+ check_for_release(cgrp);
+ }
}
kfree(link);
@@ -452,20 +587,20 @@ static bool compare_css_sets(struct css_set *cset,
{
struct list_head *l1, *l2;
- if (memcmp(template, cset->subsys, sizeof(cset->subsys))) {
- /* Not all subsystems matched */
+ /*
+ * On the default hierarchy, there can be csets which are
+ * associated with the same set of cgroups but different csses.
+ * Let's first ensure that csses match.
+ */
+ if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
return false;
- }
/*
* Compare cgroup pointers in order to distinguish between
- * different cgroups in heirarchies with no subsystems. We
- * could get by with just this check alone (and skip the
- * memcmp above) but on most setups the memcmp check will
- * avoid the need for this more expensive check on almost all
- * candidates.
+ * different cgroups in hierarchies. As different cgroups may
+ * share the same effective css, this comparison is always
+ * necessary.
*/
-
l1 = &cset->cgrp_links;
l2 = &old_cset->cgrp_links;
while (1) {
@@ -529,14 +664,17 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
* won't change, so no need for locking.
*/
for_each_subsys(ss, i) {
- if (root->cgrp.subsys_mask & (1UL << i)) {
- /* Subsystem is in this hierarchy. So we want
- * the subsystem state from the new
- * cgroup */
- template[i] = cgroup_css(cgrp, ss);
+ if (root->subsys_mask & (1UL << i)) {
+ /*
+ * @ss is in this hierarchy, so we want the
+ * effective css from @cgrp.
+ */
+ template[i] = cgroup_e_css(cgrp, ss);
} else {
- /* Subsystem is not in this hierarchy, so we
- * don't want to change the subsystem state */
+ /*
+ * @ss is not in this hierarchy, so we don't want
+ * to change the css.
+ */
template[i] = old_cset->subsys[i];
}
}
@@ -602,10 +740,18 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
struct cgrp_cset_link *link;
BUG_ON(list_empty(tmp_links));
+
+ if (cgroup_on_dfl(cgrp))
+ cset->dfl_cgrp = cgrp;
+
link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
link->cset = cset;
link->cgrp = cgrp;
+
+ if (list_empty(&cgrp->cset_links))
+ cgroup_update_populated(cgrp, true);
list_move(&link->cset_link, &cgrp->cset_links);
+
/*
* Always add links to the tail of the list so that the list
* is sorted by order of hierarchy creation
@@ -628,7 +774,9 @@ static struct css_set *find_css_set(struct css_set *old_cset,
struct css_set *cset;
struct list_head tmp_links;
struct cgrp_cset_link *link;
+ struct cgroup_subsys *ss;
unsigned long key;
+ int ssid;
lockdep_assert_held(&cgroup_mutex);
@@ -679,10 +827,14 @@ static struct css_set *find_css_set(struct css_set *old_cset,
css_set_count++;
- /* Add this cgroup group to the hash table */
+ /* Add @cset to the hash table */
key = css_set_hash(cset->subsys);
hash_add(css_set_table, &cset->hlist, key);
+ for_each_subsys(ss, ssid)
+ list_add_tail(&cset->e_cset_node[ssid],
+ &cset->subsys[ssid]->cgroup->e_csets[ssid]);
+
up_write(&css_set_rwsem);
return cset;
@@ -742,7 +894,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)
BUG_ON(!list_empty(&cgrp->children));
/* Rebind all subsystems back to the default hierarchy */
- rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask);
+ rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
/*
* Release all the links from cset_links to this hierarchy's
@@ -848,7 +1000,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
* update of a tasks cgroup pointer by cgroup_attach_task()
*/
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask);
static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
static const struct file_operations proc_cgroupstats_operations;
@@ -937,15 +1089,7 @@ static void cgroup_put(struct cgroup *cgrp)
if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp)))
return;
- /*
- * XXX: cgrp->id is only used to look up css's. As cgroup and
- * css's lifetimes will be decoupled, it should be made
- * per-subsystem and moved to css->id so that lookups are
- * successful until the target css is released.
- */
- mutex_lock(&cgroup_mutex);
- idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
- mutex_unlock(&cgroup_mutex);
+ cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
cgrp->id = -1;
call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
@@ -964,7 +1108,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
* @cgrp: target cgroup
* @subsys_mask: mask of the subsystem ids whose files should be removed
*/
-static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
+static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
{
struct cgroup_subsys *ss;
int i;
@@ -972,18 +1116,17 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
for_each_subsys(ss, i) {
struct cftype *cfts;
- if (!test_bit(i, &subsys_mask))
+ if (!(subsys_mask & (1 << i)))
continue;
list_for_each_entry(cfts, &ss->cfts, node)
cgroup_addrm_files(cgrp, cfts, false);
}
}
-static int rebind_subsystems(struct cgroup_root *dst_root,
- unsigned long ss_mask)
+static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
{
struct cgroup_subsys *ss;
- int ssid, ret;
+ int ssid, i, ret;
lockdep_assert_held(&cgroup_tree_mutex);
lockdep_assert_held(&cgroup_mutex);
@@ -992,16 +1135,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
if (!(ss_mask & (1 << ssid)))
continue;
- /* if @ss is on the dummy_root, we can always move it */
- if (ss->root == &cgrp_dfl_root)
- continue;
-
- /* if @ss has non-root cgroups attached to it, can't move */
- if (!list_empty(&ss->root->cgrp.children))
+ /* if @ss has non-root csses attached to it, can't move */
+ if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
return -EBUSY;
/* can't move between two non-dummy roots either */
- if (dst_root != &cgrp_dfl_root)
+ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
return -EBUSY;
}
@@ -1017,9 +1156,9 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
* Just warn about it and continue.
*/
if (cgrp_dfl_root_visible) {
- pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n",
- ret, ss_mask);
- pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n");
+ pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
+ ret, ss_mask);
+ pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
}
}
@@ -1036,6 +1175,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
for_each_subsys(ss, ssid) {
struct cgroup_root *src_root;
struct cgroup_subsys_state *css;
+ struct css_set *cset;
if (!(ss_mask & (1 << ssid)))
continue;
@@ -1050,8 +1190,19 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
ss->root = dst_root;
css->cgroup = &dst_root->cgrp;
- src_root->cgrp.subsys_mask &= ~(1 << ssid);
- dst_root->cgrp.subsys_mask |= 1 << ssid;
+ down_write(&css_set_rwsem);
+ hash_for_each(css_set_table, i, cset, hlist)
+ list_move_tail(&cset->e_cset_node[ss->id],
+ &dst_root->cgrp.e_csets[ss->id]);
+ up_write(&css_set_rwsem);
+
+ src_root->subsys_mask &= ~(1 << ssid);
+ src_root->cgrp.child_subsys_mask &= ~(1 << ssid);
+
+ /* default hierarchy doesn't enable controllers by default */
+ dst_root->subsys_mask |= 1 << ssid;
+ if (dst_root != &cgrp_dfl_root)
+ dst_root->cgrp.child_subsys_mask |= 1 << ssid;
if (ss->bind)
ss->bind(css);
@@ -1069,7 +1220,7 @@ static int cgroup_show_options(struct seq_file *seq,
int ssid;
for_each_subsys(ss, ssid)
- if (root->cgrp.subsys_mask & (1 << ssid))
+ if (root->subsys_mask & (1 << ssid))
seq_printf(seq, ",%s", ss->name);
if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
seq_puts(seq, ",sane_behavior");
@@ -1091,8 +1242,8 @@ static int cgroup_show_options(struct seq_file *seq,
}
struct cgroup_sb_opts {
- unsigned long subsys_mask;
- unsigned long flags;
+ unsigned int subsys_mask;
+ unsigned int flags;
char *release_agent;
bool cpuset_clone_children;
char *name;
@@ -1100,24 +1251,16 @@ struct cgroup_sb_opts {
bool none;
};
-/*
- * Convert a hierarchy specifier into a bitmask of subsystems and
- * flags. Call with cgroup_mutex held to protect the cgroup_subsys[]
- * array. This function takes refcounts on subsystems to be used, unless it
- * returns error, in which case no refcounts are taken.
- */
static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
{
char *token, *o = data;
bool all_ss = false, one_ss = false;
- unsigned long mask = (unsigned long)-1;
+ unsigned int mask = -1U;
struct cgroup_subsys *ss;
int i;
- BUG_ON(!mutex_is_locked(&cgroup_mutex));
-
#ifdef CONFIG_CPUSETS
- mask = ~(1UL << cpuset_cgrp_id);
+ mask = ~(1U << cpuset_cgrp_id);
#endif
memset(opts, 0, sizeof(*opts));
@@ -1198,7 +1341,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
/* Mutually exclusive option 'all' + subsystem name */
if (all_ss)
return -EINVAL;
- set_bit(i, &opts->subsys_mask);
+ opts->subsys_mask |= (1 << i);
one_ss = true;
break;
@@ -1210,12 +1353,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
/* Consistency checks */
if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
- pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
+ pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
opts->cpuset_clone_children || opts->release_agent ||
opts->name) {
- pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
+ pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
return -EINVAL;
}
} else {
@@ -1227,7 +1370,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
if (all_ss || (!one_ss && !opts->none && !opts->name))
for_each_subsys(ss, i)
if (!ss->disabled)
- set_bit(i, &opts->subsys_mask);
+ opts->subsys_mask |= (1 << i);
/*
* We either have to specify by name or by subsystems. (So
@@ -1258,10 +1401,10 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
int ret = 0;
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
struct cgroup_sb_opts opts;
- unsigned long added_mask, removed_mask;
+ unsigned int added_mask, removed_mask;
if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
- pr_err("cgroup: sane_behavior: remount is not allowed\n");
+ pr_err("sane_behavior: remount is not allowed\n");
return -EINVAL;
}
@@ -1273,17 +1416,17 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
if (ret)
goto out_unlock;
- if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent)
- pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
- task_tgid_nr(current), current->comm);
+ if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
+ pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
+ task_tgid_nr(current), current->comm);
- added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask;
- removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask;
+ added_mask = opts.subsys_mask & ~root->subsys_mask;
+ removed_mask = root->subsys_mask & ~opts.subsys_mask;
/* Don't allow flags or name to change at remount */
if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
(opts.name && strcmp(opts.name, root->name))) {
- pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n",
+ pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
root->flags & CGRP_ROOT_OPTION_MASK, root->name);
ret = -EINVAL;
@@ -1369,6 +1512,9 @@ out_unlock:
static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
+ struct cgroup_subsys *ss;
+ int ssid;
+
atomic_set(&cgrp->refcnt, 1);
INIT_LIST_HEAD(&cgrp->sibling);
INIT_LIST_HEAD(&cgrp->children);
@@ -1377,6 +1523,11 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->pidlists);
mutex_init(&cgrp->pidlist_mutex);
cgrp->dummy_css.cgroup = cgrp;
+
+ for_each_subsys(ss, ssid)
+ INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
+
+ init_waitqueue_head(&cgrp->offline_waitq);
}
static void init_cgroup_root(struct cgroup_root *root,
@@ -1399,7 +1550,7 @@ static void init_cgroup_root(struct cgroup_root *root,
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
-static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
+static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
@@ -1409,7 +1560,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
lockdep_assert_held(&cgroup_tree_mutex);
lockdep_assert_held(&cgroup_mutex);
- ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
+ ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
if (ret < 0)
goto out;
root_cgrp->id = ret;
@@ -1535,7 +1686,7 @@ retry:
* subsystems) then they must match.
*/
if ((opts.subsys_mask || opts.none) &&
- (opts.subsys_mask != root->cgrp.subsys_mask)) {
+ (opts.subsys_mask != root->subsys_mask)) {
if (!name_match)
continue;
ret = -EBUSY;
@@ -1544,11 +1695,11 @@ retry:
if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
- pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
+ pr_err("sane_behavior: new mount options should match the existing superblock\n");
ret = -EINVAL;
goto out_unlock;
} else {
- pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
+ pr_warn("new mount options do not match the existing superblock, will be ignored\n");
}
}
@@ -1737,7 +1888,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
/**
* cgroup_task_migrate - move a task from one cgroup to another.
- * @old_cgrp; the cgroup @tsk is being migrated from
+ * @old_cgrp: the cgroup @tsk is being migrated from
* @tsk: the task being migrated
* @new_cset: the new css_set @tsk is being attached to
*
@@ -1829,10 +1980,6 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
- /* nothing to do if this cset already belongs to the cgroup */
- if (src_cgrp == dst_cgrp)
- return;
-
if (!list_empty(&src_cset->mg_preload_node))
return;
@@ -1847,13 +1994,14 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
/**
* cgroup_migrate_prepare_dst - prepare destination css_sets for migration
- * @dst_cgrp: the destination cgroup
+ * @dst_cgrp: the destination cgroup (may be %NULL)
* @preloaded_csets: list of preloaded source css_sets
*
* Tasks are about to be moved to @dst_cgrp and all the source css_sets
* have been preloaded to @preloaded_csets. This function looks up and
- * pins all destination css_sets, links each to its source, and put them on
- * @preloaded_csets.
+ * pins all destination css_sets, links each to its source, and append them
+ * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each
+ * source css_set is assumed to be its cgroup on the default hierarchy.
*
* This function must be called after cgroup_migrate_add_src() has been
* called on each migration source css_set. After migration is performed
@@ -1864,19 +2012,42 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
struct list_head *preloaded_csets)
{
LIST_HEAD(csets);
- struct css_set *src_cset;
+ struct css_set *src_cset, *tmp_cset;
lockdep_assert_held(&cgroup_mutex);
+ /*
+ * Except for the root, child_subsys_mask must be zero for a cgroup
+ * with tasks so that child cgroups don't compete against tasks.
+ */
+ if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && dst_cgrp->parent &&
+ dst_cgrp->child_subsys_mask)
+ return -EBUSY;
+
/* look up the dst cset for each src cset and link it to src */
- list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) {
+ list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
struct css_set *dst_cset;
- dst_cset = find_css_set(src_cset, dst_cgrp);
+ dst_cset = find_css_set(src_cset,
+ dst_cgrp ?: src_cset->dfl_cgrp);
if (!dst_cset)
goto err;
WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
+
+ /*
+ * If src cset equals dst, it's noop. Drop the src.
+ * cgroup_migrate() will skip the cset too. Note that we
+ * can't handle src == dst as some nodes are used by both.
+ */
+ if (src_cset == dst_cset) {
+ src_cset->mg_src_cgrp = NULL;
+ list_del_init(&src_cset->mg_preload_node);
+ put_css_set(src_cset, false);
+ put_css_set(dst_cset, false);
+ continue;
+ }
+
src_cset->mg_dst_cset = dst_cset;
if (list_empty(&dst_cset->mg_preload_node))
@@ -1885,7 +2056,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
put_css_set(dst_cset, false);
}
- list_splice(&csets, preloaded_csets);
+ list_splice_tail(&csets, preloaded_csets);
return 0;
err:
cgroup_migrate_finish(&csets);
@@ -1966,7 +2137,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
return 0;
/* check that we can legitimately attach to the cgroup */
- for_each_css(css, i, cgrp) {
+ for_each_e_css(css, i, cgrp) {
if (css->ss->can_attach) {
ret = css->ss->can_attach(css, &tset);
if (ret) {
@@ -1996,7 +2167,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
*/
tset.csets = &tset.dst_csets;
- for_each_css(css, i, cgrp)
+ for_each_e_css(css, i, cgrp)
if (css->ss->attach)
css->ss->attach(css, &tset);
@@ -2004,7 +2175,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
goto out_release_tset;
out_cancel_attach:
- for_each_css(css, i, cgrp) {
+ for_each_e_css(css, i, cgrp) {
if (css == failed_css)
break;
if (css->ss->cancel_attach)
@@ -2218,6 +2389,332 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
return 0;
}
+static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask)
+{
+ struct cgroup_subsys *ss;
+ bool printed = false;
+ int ssid;
+
+ for_each_subsys(ss, ssid) {
+ if (ss_mask & (1 << ssid)) {
+ if (printed)
+ seq_putc(seq, ' ');
+ seq_printf(seq, "%s", ss->name);
+ printed = true;
+ }
+ }
+ if (printed)
+ seq_putc(seq, '\n');
+}
+
+/* show controllers which are currently attached to the default hierarchy */
+static int cgroup_root_controllers_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ cgroup_print_ss_mask(seq, cgrp->root->subsys_mask);
+ return 0;
+}
+
+/* show controllers which are enabled from the parent */
+static int cgroup_controllers_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ cgroup_print_ss_mask(seq, cgrp->parent->child_subsys_mask);
+ return 0;
+}
+
+/* show controllers which are enabled for a given cgroup's children */
+static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ cgroup_print_ss_mask(seq, cgrp->child_subsys_mask);
+ return 0;
+}
+
+/**
+ * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
+ * @cgrp: root of the subtree to update csses for
+ *
+ * @cgrp's child_subsys_mask has changed and its subtree's (self excluded)
+ * css associations need to be updated accordingly. This function looks up
+ * all css_sets which are attached to the subtree, creates the matching
+ * updated css_sets and migrates the tasks to the new ones.
+ */
+static int cgroup_update_dfl_csses(struct cgroup *cgrp)
+{
+ LIST_HEAD(preloaded_csets);
+ struct cgroup_subsys_state *css;
+ struct css_set *src_cset;
+ int ret;
+
+ lockdep_assert_held(&cgroup_tree_mutex);
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* look up all csses currently attached to @cgrp's subtree */
+ down_read(&css_set_rwsem);
+ css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
+ struct cgrp_cset_link *link;
+
+ /* self is not affected by child_subsys_mask change */
+ if (css->cgroup == cgrp)
+ continue;
+
+ list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
+ cgroup_migrate_add_src(link->cset, cgrp,
+ &preloaded_csets);
+ }
+ up_read(&css_set_rwsem);
+
+ /* NULL dst indicates self on default hierarchy */
+ ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
+ if (ret)
+ goto out_finish;
+
+ list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
+ struct task_struct *last_task = NULL, *task;
+
+ /* src_csets precede dst_csets, break on the first dst_cset */
+ if (!src_cset->mg_src_cgrp)
+ break;
+
+ /*
+ * All tasks in src_cset need to be migrated to the
+ * matching dst_cset. Empty it process by process. We
+ * walk tasks but migrate processes. The leader might even
+ * belong to a different cset but such src_cset would also
+ * be among the target src_csets because the default
+ * hierarchy enforces per-process membership.
+ */
+ while (true) {
+ down_read(&css_set_rwsem);
+ task = list_first_entry_or_null(&src_cset->tasks,
+ struct task_struct, cg_list);
+ if (task) {
+ task = task->group_leader;
+ WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
+ get_task_struct(task);
+ }
+ up_read(&css_set_rwsem);
+
+ if (!task)
+ break;
+
+ /* guard against possible infinite loop */
+ if (WARN(last_task == task,
+ "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
+ goto out_finish;
+ last_task = task;
+
+ threadgroup_lock(task);
+ /* raced against de_thread() from another thread? */
+ if (!thread_group_leader(task)) {
+ threadgroup_unlock(task);
+ put_task_struct(task);
+ continue;
+ }
+
+ ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
+
+ threadgroup_unlock(task);
+ put_task_struct(task);
+
+ if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
+ goto out_finish;
+ }
+ }
+
+out_finish:
+ cgroup_migrate_finish(&preloaded_csets);
+ return ret;
+}
+
+/* change the enabled child controllers for a cgroup in the default hierarchy */
+static int cgroup_subtree_control_write(struct cgroup_subsys_state *dummy_css,
+ struct cftype *cft, char *buffer)
+{
+ unsigned int enable_req = 0, disable_req = 0, enable, disable;
+ struct cgroup *cgrp = dummy_css->cgroup, *child;
+ struct cgroup_subsys *ss;
+ char *tok, *p;
+ int ssid, ret;
+
+ /*
+ * Parse input - white space separated list of subsystem names
+ * prefixed with either + or -.
+ */
+ p = buffer;
+ while ((tok = strsep(&p, " \t\n"))) {
+ for_each_subsys(ss, ssid) {
+ if (ss->disabled || strcmp(tok + 1, ss->name))
+ continue;
+
+ if (*tok == '+') {
+ enable_req |= 1 << ssid;
+ disable_req &= ~(1 << ssid);
+ } else if (*tok == '-') {
+ disable_req |= 1 << ssid;
+ enable_req &= ~(1 << ssid);
+ } else {
+ return -EINVAL;
+ }
+ break;
+ }
+ if (ssid == CGROUP_SUBSYS_COUNT)
+ return -EINVAL;
+ }
+
+ /*
+ * We're gonna grab cgroup_tree_mutex which nests outside kernfs
+ * active_ref. cgroup_lock_live_group() already provides enough
+ * protection. Ensure @cgrp stays accessible and break the
+ * active_ref protection.
+ */
+ cgroup_get(cgrp);
+ kernfs_break_active_protection(cgrp->control_kn);
+retry:
+ enable = enable_req;
+ disable = disable_req;
+
+ mutex_lock(&cgroup_tree_mutex);
+
+ for_each_subsys(ss, ssid) {
+ if (enable & (1 << ssid)) {
+ if (cgrp->child_subsys_mask & (1 << ssid)) {
+ enable &= ~(1 << ssid);
+ continue;
+ }
+
+ /*
+ * Because css offlining is asynchronous, userland
+ * might try to re-enable the same controller while
+ * the previous instance is still around. In such
+ * cases, wait till it's gone using offline_waitq.
+ */
+ cgroup_for_each_live_child(child, cgrp) {
+ wait_queue_t wait;
+
+ if (!cgroup_css(child, ss))
+ continue;
+
+ prepare_to_wait(&child->offline_waitq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&cgroup_tree_mutex);
+ schedule();
+ finish_wait(&child->offline_waitq, &wait);
+ goto retry;
+ }
+
+ /* unavailable or not enabled on the parent? */
+ if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
+ (cgrp->parent &&
+ !(cgrp->parent->child_subsys_mask & (1 << ssid)))) {
+ ret = -ENOENT;
+ goto out_unlock_tree;
+ }
+ } else if (disable & (1 << ssid)) {
+ if (!(cgrp->child_subsys_mask & (1 << ssid))) {
+ disable &= ~(1 << ssid);
+ continue;
+ }
+
+ /* a child has it enabled? */
+ cgroup_for_each_live_child(child, cgrp) {
+ if (child->child_subsys_mask & (1 << ssid)) {
+ ret = -EBUSY;
+ goto out_unlock_tree;
+ }
+ }
+ }
+ }
+
+ if (!enable && !disable) {
+ ret = 0;
+ goto out_unlock_tree;
+ }
+
+ if (!cgroup_lock_live_group(cgrp)) {
+ ret = -ENODEV;
+ goto out_unlock_tree;
+ }
+
+ /*
+ * Except for the root, child_subsys_mask must be zero for a cgroup
+ * with tasks so that child cgroups don't compete against tasks.
+ */
+ if (enable && cgrp->parent && !list_empty(&cgrp->cset_links)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ /*
+ * Create csses for enables and update child_subsys_mask. This
+ * changes cgroup_e_css() results which in turn makes the
+ * subsequent cgroup_update_dfl_csses() associate all tasks in the
+ * subtree to the updated csses.
+ */
+ for_each_subsys(ss, ssid) {
+ if (!(enable & (1 << ssid)))
+ continue;
+
+ cgroup_for_each_live_child(child, cgrp) {
+ ret = create_css(child, ss);
+ if (ret)
+ goto err_undo_css;
+ }
+ }
+
+ cgrp->child_subsys_mask |= enable;
+ cgrp->child_subsys_mask &= ~disable;
+
+ ret = cgroup_update_dfl_csses(cgrp);
+ if (ret)
+ goto err_undo_css;
+
+ /* all tasks are now migrated away from the old csses, kill them */
+ for_each_subsys(ss, ssid) {
+ if (!(disable & (1 << ssid)))
+ continue;
+
+ cgroup_for_each_live_child(child, cgrp)
+ kill_css(cgroup_css(child, ss));
+ }
+
+ kernfs_activate(cgrp->kn);
+ ret = 0;
+out_unlock:
+ mutex_unlock(&cgroup_mutex);
+out_unlock_tree:
+ mutex_unlock(&cgroup_tree_mutex);
+ kernfs_unbreak_active_protection(cgrp->control_kn);
+ cgroup_put(cgrp);
+ return ret;
+
+err_undo_css:
+ cgrp->child_subsys_mask &= ~enable;
+ cgrp->child_subsys_mask |= disable;
+
+ for_each_subsys(ss, ssid) {
+ if (!(enable & (1 << ssid)))
+ continue;
+
+ cgroup_for_each_live_child(child, cgrp) {
+ struct cgroup_subsys_state *css = cgroup_css(child, ss);
+ if (css)
+ kill_css(css);
+ }
+ }
+ goto out_unlock;
+}
+
+static int cgroup_populated_show(struct seq_file *seq, void *v)
+{
+ seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt);
+ return 0;
+}
+
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
@@ -2377,9 +2874,16 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
return PTR_ERR(kn);
ret = cgroup_kn_set_ugid(kn);
- if (ret)
+ if (ret) {
kernfs_remove(kn);
- return ret;
+ return ret;
+ }
+
+ if (cft->seq_show == cgroup_subtree_control_show)
+ cgrp->control_kn = kn;
+ else if (cft->seq_show == cgroup_populated_show)
+ cgrp->populated_kn = kn;
+ return 0;
}
/**
@@ -2415,8 +2919,8 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
if (is_add) {
ret = cgroup_add_file(cgrp, cft);
if (ret) {
- pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
- cft->name, ret);
+ pr_warn("%s: failed to add %s, err=%d\n",
+ __func__, cft->name, ret);
return ret;
}
} else {
@@ -2436,10 +2940,6 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
lockdep_assert_held(&cgroup_tree_mutex);
- /* don't bother if @ss isn't attached */
- if (ss->root == &cgrp_dfl_root)
- return 0;
-
/* add/rm files for all cgroups created before */
css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
struct cgroup *cgrp = css->cgroup;
@@ -2641,10 +3141,19 @@ css_next_child(struct cgroup_subsys_state *pos_css,
break;
}
- if (&next->sibling == &cgrp->children)
- return NULL;
+ /*
+ * @next, if not pointing to the head, can be dereferenced and is
+ * the next sibling; however, it might have @ss disabled. If so,
+ * fast-forward to the next enabled one.
+ */
+ while (&next->sibling != &cgrp->children) {
+ struct cgroup_subsys_state *next_css = cgroup_css(next, parent_css->ss);
- return cgroup_css(next, parent_css->ss);
+ if (next_css)
+ return next_css;
+ next = list_entry_rcu(next->sibling.next, struct cgroup, sibling);
+ }
+ return NULL;
}
/**
@@ -2781,27 +3290,36 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
*/
static void css_advance_task_iter(struct css_task_iter *it)
{
- struct list_head *l = it->cset_link;
+ struct list_head *l = it->cset_pos;
struct cgrp_cset_link *link;
struct css_set *cset;
/* Advance to the next non-empty css_set */
do {
l = l->next;
- if (l == &it->origin_css->cgroup->cset_links) {
- it->cset_link = NULL;
+ if (l == it->cset_head) {
+ it->cset_pos = NULL;
return;
}
- link = list_entry(l, struct cgrp_cset_link, cset_link);
- cset = link->cset;
+
+ if (it->ss) {
+ cset = container_of(l, struct css_set,
+ e_cset_node[it->ss->id]);
+ } else {
+ link = list_entry(l, struct cgrp_cset_link, cset_link);
+ cset = link->cset;
+ }
} while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
- it->cset_link = l;
+ it->cset_pos = l;
if (!list_empty(&cset->tasks))
- it->task = cset->tasks.next;
+ it->task_pos = cset->tasks.next;
else
- it->task = cset->mg_tasks.next;
+ it->task_pos = cset->mg_tasks.next;
+
+ it->tasks_head = &cset->tasks;
+ it->mg_tasks_head = &cset->mg_tasks;
}
/**
@@ -2827,8 +3345,14 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
down_read(&css_set_rwsem);
- it->origin_css = css;
- it->cset_link = &css->cgroup->cset_links;
+ it->ss = css->ss;
+
+ if (it->ss)
+ it->cset_pos = &css->cgroup->e_csets[css->ss->id];
+ else
+ it->cset_pos = &css->cgroup->cset_links;
+
+ it->cset_head = it->cset_pos;
css_advance_task_iter(it);
}
@@ -2844,12 +3368,10 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
struct task_struct *css_task_iter_next(struct css_task_iter *it)
{
struct task_struct *res;
- struct list_head *l = it->task;
- struct cgrp_cset_link *link = list_entry(it->cset_link,
- struct cgrp_cset_link, cset_link);
+ struct list_head *l = it->task_pos;
/* If the iterator cg is NULL, we have no tasks */
- if (!it->cset_link)
+ if (!it->cset_pos)
return NULL;
res = list_entry(l, struct task_struct, cg_list);
@@ -2860,13 +3382,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
*/
l = l->next;
- if (l == &link->cset->tasks)
- l = link->cset->mg_tasks.next;
+ if (l == it->tasks_head)
+ l = it->mg_tasks_head->next;
- if (l == &link->cset->mg_tasks)
+ if (l == it->mg_tasks_head)
css_advance_task_iter(it);
else
- it->task = l;
+ it->task_pos = l;
return res;
}
@@ -3388,17 +3910,6 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v)
return seq_printf(s, "%d\n", *(int *)v);
}
-/*
- * seq_operations functions for iterating on pidlists through seq_file -
- * independent of whether it's tasks or procs
- */
-static const struct seq_operations cgroup_pidlist_seq_operations = {
- .start = cgroup_pidlist_start,
- .stop = cgroup_pidlist_stop,
- .next = cgroup_pidlist_next,
- .show = cgroup_pidlist_show,
-};
-
static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@@ -3454,6 +3965,27 @@ static struct cftype cgroup_base_files[] = {
.flags = CFTYPE_ONLY_ON_ROOT,
.seq_show = cgroup_sane_behavior_show,
},
+ {
+ .name = "cgroup.controllers",
+ .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_root_controllers_show,
+ },
+ {
+ .name = "cgroup.controllers",
+ .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_controllers_show,
+ },
+ {
+ .name = "cgroup.subtree_control",
+ .flags = CFTYPE_ONLY_ON_DFL,
+ .seq_show = cgroup_subtree_control_show,
+ .write_string = cgroup_subtree_control_write,
+ },
+ {
+ .name = "cgroup.populated",
+ .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_populated_show,
+ },
/*
* Historical crazy stuff. These don't have "cgroup." prefix and
@@ -3494,7 +4026,7 @@ static struct cftype cgroup_base_files[] = {
*
* On failure, no file is added.
*/
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask)
{
struct cgroup_subsys *ss;
int i, ret = 0;
@@ -3503,7 +4035,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
for_each_subsys(ss, i) {
struct cftype *cfts;
- if (!test_bit(i, &subsys_mask))
+ if (!(subsys_mask & (1 << i)))
continue;
list_for_each_entry(cfts, &ss->cfts, node) {
@@ -3566,22 +4098,29 @@ static void css_release(struct percpu_ref *ref)
{
struct cgroup_subsys_state *css =
container_of(ref, struct cgroup_subsys_state, refcnt);
+ struct cgroup_subsys *ss = css->ss;
+
+ RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
+ cgroup_idr_remove(&ss->css_idr, css->id);
- RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL);
call_rcu(&css->rcu_head, css_free_rcu_fn);
}
-static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
- struct cgroup *cgrp)
+static void init_and_link_css(struct cgroup_subsys_state *css,
+ struct cgroup_subsys *ss, struct cgroup *cgrp)
{
+ cgroup_get(cgrp);
+
css->cgroup = cgrp;
css->ss = ss;
css->flags = 0;
- if (cgrp->parent)
+ if (cgrp->parent) {
css->parent = cgroup_css(cgrp->parent, ss);
- else
+ css_get(css->parent);
+ } else {
css->flags |= CSS_ROOT;
+ }
BUG_ON(cgroup_css(cgrp, ss));
}
@@ -3621,7 +4160,9 @@ static void offline_css(struct cgroup_subsys_state *css)
css->flags &= ~CSS_ONLINE;
css->cgroup->nr_css--;
- RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css);
+ RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
+
+ wake_up_all(&css->cgroup->offline_waitq);
}
/**
@@ -3645,31 +4186,34 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
if (IS_ERR(css))
return PTR_ERR(css);
+ init_and_link_css(css, ss, cgrp);
+
err = percpu_ref_init(&css->refcnt, css_release);
if (err)
goto err_free_css;
- init_css(css, ss, cgrp);
+ err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
+ if (err < 0)
+ goto err_free_percpu_ref;
+ css->id = err;
err = cgroup_populate_dir(cgrp, 1 << ss->id);
if (err)
- goto err_free_percpu_ref;
+ goto err_free_id;
+
+ /* @css is ready to be brought online now, make it visible */
+ cgroup_idr_replace(&ss->css_idr, css, css->id);
err = online_css(css);
if (err)
goto err_clear_dir;
- cgroup_get(cgrp);
- css_get(css->parent);
-
- cgrp->subsys_mask |= 1 << ss->id;
-
if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
parent->parent) {
- pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
- current->comm, current->pid, ss->name);
+ pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
+ current->comm, current->pid, ss->name);
if (!strcmp(ss->name, "memory"))
- pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
+ pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
ss->warned_broken_hierarchy = true;
}
@@ -3677,10 +4221,12 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
err_clear_dir:
cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
+err_free_id:
+ cgroup_idr_remove(&ss->css_idr, css->id);
err_free_percpu_ref:
percpu_ref_cancel_init(&css->refcnt);
err_free_css:
- ss->css_free(css);
+ call_rcu(&css->rcu_head, css_free_rcu_fn);
return err;
}
@@ -3699,13 +4245,6 @@ static long cgroup_create(struct cgroup *parent, const char *name,
struct cgroup_subsys *ss;
struct kernfs_node *kn;
- /*
- * XXX: The default hierarchy isn't fully implemented yet. Block
- * !root cgroup creation on it for now.
- */
- if (root == &cgrp_dfl_root)
- return -EINVAL;
-
/* allocate the cgroup and its ID, 0 is reserved for the root */
cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
if (!cgrp)
@@ -3729,7 +4268,7 @@ static long cgroup_create(struct cgroup *parent, const char *name,
* Temporarily set the pointer to NULL, so idr_find() won't return
* a half-baked cgroup.
*/
- cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
+ cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
if (cgrp->id < 0) {
err = -ENOMEM;
goto err_unlock;
@@ -3772,7 +4311,7 @@ static long cgroup_create(struct cgroup *parent, const char *name,
* @cgrp is now fully operational. If something fails after this
* point, it'll be released via the normal destruction path.
*/
- idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
+ cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
err = cgroup_kn_set_ugid(kn);
if (err)
@@ -3784,13 +4323,20 @@ static long cgroup_create(struct cgroup *parent, const char *name,
/* let's create and online css's */
for_each_subsys(ss, ssid) {
- if (root->cgrp.subsys_mask & (1 << ssid)) {
+ if (parent->child_subsys_mask & (1 << ssid)) {
err = create_css(cgrp, ss);
if (err)
goto err_destroy;
}
}
+ /*
+ * On the default hierarchy, a child doesn't automatically inherit
+ * child_subsys_mask from the parent. Each is configured manually.
+ */
+ if (!cgroup_on_dfl(cgrp))
+ cgrp->child_subsys_mask = parent->child_subsys_mask;
+
kernfs_activate(kn);
mutex_unlock(&cgroup_mutex);
@@ -3799,7 +4345,7 @@ static long cgroup_create(struct cgroup *parent, const char *name,
return 0;
err_free_id:
- idr_remove(&root->cgroup_idr, cgrp->id);
+ cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
err_unlock:
mutex_unlock(&cgroup_mutex);
err_unlock_tree:
@@ -3886,7 +4432,16 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
queue_work(cgroup_destroy_wq, &css->destroy_work);
}
-static void __kill_css(struct cgroup_subsys_state *css)
+/**
+ * kill_css - destroy a css
+ * @css: css to destroy
+ *
+ * This function initiates destruction of @css by removing cgroup interface
+ * files and putting its base reference. ->css_offline() will be invoked
+ * asynchronously once css_tryget() is guaranteed to fail and when the
+ * reference count reaches zero, @css will be released.
+ */
+static void kill_css(struct cgroup_subsys_state *css)
{
lockdep_assert_held(&cgroup_tree_mutex);
@@ -3916,28 +4471,6 @@ static void __kill_css(struct cgroup_subsys_state *css)
}
/**
- * kill_css - destroy a css
- * @css: css to destroy
- *
- * This function initiates destruction of @css by removing cgroup interface
- * files and putting its base reference. ->css_offline() will be invoked
- * asynchronously once css_tryget() is guaranteed to fail and when the
- * reference count reaches zero, @css will be released.
- */
-static void kill_css(struct cgroup_subsys_state *css)
-{
- struct cgroup *cgrp = css->cgroup;
-
- lockdep_assert_held(&cgroup_tree_mutex);
-
- /* if already killed, noop */
- if (cgrp->subsys_mask & (1 << css->ss->id)) {
- cgrp->subsys_mask &= ~(1 << css->ss->id);
- __kill_css(css);
- }
-}
-
-/**
* cgroup_destroy_locked - the first stage of cgroup destruction
* @cgrp: cgroup to be destroyed
*
@@ -4053,7 +4586,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
/**
* cgroup_destroy_css_killed - the second step of cgroup destruction
- * @work: cgroup->destroy_free_work
+ * @cgrp: the cgroup whose csses have just finished offlining
*
* This function is invoked from a work item for a cgroup which is being
* destroyed after all css's are offlined and performs the rest of
@@ -4116,7 +4649,7 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
.rename = cgroup_rename,
};
-static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
+static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
{
struct cgroup_subsys_state *css;
@@ -4125,6 +4658,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
mutex_lock(&cgroup_tree_mutex);
mutex_lock(&cgroup_mutex);
+ idr_init(&ss->css_idr);
INIT_LIST_HEAD(&ss->cfts);
/* Create the root cgroup state for this subsystem */
@@ -4132,7 +4666,14 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
/* We don't handle early failures gracefully */
BUG_ON(IS_ERR(css));
- init_css(css, ss, &cgrp_dfl_root.cgrp);
+ init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
+ if (early) {
+ /* idr_alloc() can't be called safely during early init */
+ css->id = 1;
+ } else {
+ css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
+ BUG_ON(css->id < 0);
+ }
/* Update the init_css_set to contain a subsys
* pointer to this state - since the subsystem is
@@ -4149,7 +4690,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
BUG_ON(online_css(css));
- cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id;
+ cgrp_dfl_root.subsys_mask |= 1 << ss->id;
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgroup_tree_mutex);
@@ -4183,7 +4724,7 @@ int __init cgroup_init_early(void)
ss->name = cgroup_subsys_name[i];
if (ss->early_init)
- cgroup_init_subsys(ss);
+ cgroup_init_subsys(ss, true);
}
return 0;
}
@@ -4215,8 +4756,19 @@ int __init cgroup_init(void)
mutex_unlock(&cgroup_tree_mutex);
for_each_subsys(ss, ssid) {
- if (!ss->early_init)
- cgroup_init_subsys(ss);
+ if (ss->early_init) {
+ struct cgroup_subsys_state *css =
+ init_css_set.subsys[ss->id];
+
+ css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
+ GFP_KERNEL);
+ BUG_ON(css->id < 0);
+ } else {
+ cgroup_init_subsys(ss, false);
+ }
+
+ list_add_tail(&init_css_set.e_cset_node[ssid],
+ &cgrp_dfl_root.cgrp.e_csets[ssid]);
/*
* cftype registration needs kmalloc and can't be done
@@ -4306,7 +4858,7 @@ int proc_cgroup_show(struct seq_file *m, void *v)
seq_printf(m, "%d:", root->hierarchy_id);
for_each_subsys(ss, ssid)
- if (root->cgrp.subsys_mask & (1 << ssid))
+ if (root->subsys_mask & (1 << ssid))
seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
if (strlen(root->name))
seq_printf(m, "%sname=%s", count ? "," : "",
@@ -4667,14 +5219,8 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
*/
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
- struct cgroup *cgrp;
-
- cgroup_assert_mutexes_or_rcu_locked();
-
- cgrp = idr_find(&ss->root->cgroup_idr, id);
- if (cgrp)
- return cgroup_css(cgrp, ss);
- return NULL;
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return idr_find(&ss->css_idr, id);
}
#ifdef CONFIG_CGROUP_DEBUG