diff options
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r-- | kernel/cgroup.c | 754 |
1 files changed, 386 insertions, 368 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f24f724620dd..f34c41bfaa37 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -138,6 +138,9 @@ struct cgroupfs_root { /* Hierarchy-specific flags */ unsigned long flags; + /* IDs for cgroups in this hierarchy */ + struct ida cgroup_ida; + /* The path to use for release notifications. */ char release_agent_path[PATH_MAX]; @@ -171,8 +174,8 @@ struct css_id { * The css to which this ID points. This pointer is set to valid value * after cgroup is populated. If cgroup is removed, this will be NULL. * This pointer is expected to be RCU-safe because destroy() - * is called after synchronize_rcu(). But for safe use, css_is_removed() - * css_tryget() should be used for avoiding race. + * is called after synchronize_rcu(). But for safe use, css_tryget() + * should be used for avoiding race. */ struct cgroup_subsys_state __rcu *css; /* @@ -242,6 +245,10 @@ static DEFINE_SPINLOCK(hierarchy_id_lock); */ static int need_forkexit_callback __read_mostly; +static int cgroup_destroy_locked(struct cgroup *cgrp); +static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, + struct cftype cfts[], bool is_add); + #ifdef CONFIG_PROVE_LOCKING int cgroup_lock_is_held(void) { @@ -294,11 +301,6 @@ static int notify_on_release(const struct cgroup *cgrp) return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); } -static int clone_children(const struct cgroup *cgrp) -{ - return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); -} - /* * for_each_subsys() allows you to iterate on each subsystem attached to * an active hierarchy @@ -782,12 +784,12 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * The task_lock() exception * * The need for this exception arises from the action of - * cgroup_attach_task(), which overwrites one tasks cgroup pointer with + * cgroup_attach_task(), which overwrites one task's cgroup pointer with * another. It does so using cgroup_mutex, however there are * several performance critical places that need to reference * task->cgroup without the expense of grabbing a system global * mutex. Therefore except as noted below, when dereferencing or, as - * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use + * in cgroup_attach_task(), modifying a task's cgroup pointer we use * task_lock(), which acts on a spinlock (task->alloc_lock) already in * the task_struct routinely used for such matters. * @@ -854,30 +856,6 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) return inode; } -/* - * Call subsys's pre_destroy handler. - * This is called before css refcnt check. - */ -static int cgroup_call_pre_destroy(struct cgroup *cgrp) -{ - struct cgroup_subsys *ss; - int ret = 0; - - for_each_subsys(cgrp->root, ss) { - if (!ss->pre_destroy) - continue; - - ret = ss->pre_destroy(cgrp); - if (ret) { - /* ->pre_destroy() failure is being deprecated */ - WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs); - break; - } - } - - return ret; -} - static void cgroup_diput(struct dentry *dentry, struct inode *inode) { /* is dentry a directory ? if so, kfree() associated cgroup */ @@ -898,7 +876,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) * Release the subsystem state objects. */ for_each_subsys(cgrp->root, ss) - ss->destroy(cgrp); + ss->css_free(cgrp); cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); @@ -917,6 +895,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) simple_xattrs_free(&cgrp->xattrs); + ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); kfree_rcu(cgrp, rcu_head); } else { struct cfent *cfe = __d_cfe(dentry); @@ -987,7 +966,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files, if (!test_bit(ss->subsys_id, &subsys_mask)) continue; list_for_each_entry(set, &ss->cftsets, node) - cgroup_rm_file(cgrp, set->cfts); + cgroup_addrm_files(cgrp, NULL, set->cfts, false); } if (base_files) { while (!list_empty(&cgrp->files)) @@ -1015,33 +994,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry) } /* - * A queue for waiters to do rmdir() cgroup. A tasks will sleep when - * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some - * reference to css->refcnt. In general, this refcnt is expected to goes down - * to zero, soon. - * - * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; - */ -static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); - -static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) -{ - if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) - wake_up_all(&cgroup_rmdir_waitq); -} - -void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) -{ - css_get(css); -} - -void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) -{ - cgroup_wakeup_rmdir_waiter(css->cgroup); - css_put(css); -} - -/* * Call with cgroup_mutex held. Drops reference counts on modules, including * any duplicate ones that parse_cgroupfs_options took. If this function * returns an error, no reference counts are touched. @@ -1150,7 +1102,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",xattr"); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); - if (clone_children(&root->top_cgroup)) + if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags)) seq_puts(seq, ",clone_children"); if (strlen(root->name)) seq_printf(seq, ",name=%s", root->name); @@ -1162,7 +1114,7 @@ struct cgroup_sb_opts { unsigned long subsys_mask; unsigned long flags; char *release_agent; - bool clone_children; + bool cpuset_clone_children; char *name; /* User explicitly requested empty subsystem */ bool none; @@ -1213,7 +1165,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) continue; } if (!strcmp(token, "clone_children")) { - opts->clone_children = true; + opts->cpuset_clone_children = true; continue; } if (!strcmp(token, "xattr")) { @@ -1397,14 +1349,21 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) goto out_unlock; } + /* + * Clear out the files of subsystems that should be removed, do + * this before rebind_subsystems, since rebind_subsystems may + * change this hierarchy's subsys_list. + */ + cgroup_clear_directory(cgrp->dentry, false, removed_mask); + ret = rebind_subsystems(root, opts.subsys_mask); if (ret) { + /* rebind_subsystems failed, re-populate the removed files */ + cgroup_populate_dir(cgrp, false, removed_mask); drop_parsed_module_refcounts(opts.subsys_mask); goto out_unlock; } - /* clear out any existing files and repopulate subsystem files */ - cgroup_clear_directory(cgrp->dentry, false, removed_mask); /* re-populate subsystem files */ cgroup_populate_dir(cgrp, false, added_mask); @@ -1432,6 +1391,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->children); INIT_LIST_HEAD(&cgrp->files); INIT_LIST_HEAD(&cgrp->css_sets); + INIT_LIST_HEAD(&cgrp->allcg_node); INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); @@ -1450,8 +1410,8 @@ static void init_cgroup_root(struct cgroupfs_root *root) root->number_of_cgroups = 1; cgrp->root = root; cgrp->top_cgroup = cgrp; - list_add_tail(&cgrp->allcg_node, &root->allcg_list); init_cgroup_housekeeping(cgrp); + list_add_tail(&cgrp->allcg_node, &root->allcg_list); } static bool init_root_id(struct cgroupfs_root *root) @@ -1518,12 +1478,13 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) root->subsys_mask = opts->subsys_mask; root->flags = opts->flags; + ida_init(&root->cgroup_ida); if (opts->release_agent) strcpy(root->release_agent_path, opts->release_agent); if (opts->name) strcpy(root->name, opts->name); - if (opts->clone_children) - set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags); + if (opts->cpuset_clone_children) + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags); return root; } @@ -1536,6 +1497,7 @@ static void cgroup_drop_root(struct cgroupfs_root *root) spin_lock(&hierarchy_id_lock); ida_remove(&hierarchy_ida, root->hierarchy_id); spin_unlock(&hierarchy_id_lock); + ida_destroy(&root->cgroup_ida); kfree(root); } @@ -1701,7 +1663,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, free_cg_links(&tmp_cg_links); - BUG_ON(!list_empty(&root_cgrp->sibling)); BUG_ON(!list_empty(&root_cgrp->children)); BUG_ON(root->number_of_cgroups != 1); @@ -1750,7 +1711,6 @@ static void cgroup_kill_sb(struct super_block *sb) { BUG_ON(root->number_of_cgroups != 1); BUG_ON(!list_empty(&cgrp->children)); - BUG_ON(!list_empty(&cgrp->sibling)); mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); @@ -1808,9 +1768,11 @@ static struct kobject *cgroup_kobj; */ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) { + struct dentry *dentry = cgrp->dentry; char *start; - struct dentry *dentry = rcu_dereference_check(cgrp->dentry, - cgroup_lock_is_held()); + + rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), + "cgroup_path() called without proper locking"); if (!dentry || cgrp == dummytop) { /* @@ -1821,9 +1783,9 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) return 0; } - start = buf + buflen; + start = buf + buflen - 1; - *--start = '\0'; + *start = '\0'; for (;;) { int len = dentry->d_name.len; @@ -1834,8 +1796,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) if (!cgrp) break; - dentry = rcu_dereference_check(cgrp->dentry, - cgroup_lock_is_held()); + dentry = cgrp->dentry; if (!cgrp->parent) continue; if (--start < buf) @@ -1930,9 +1891,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size); /* * cgroup_task_migrate - move a task from one cgroup to another. * - * 'guarantee' is set if the caller promises that a new css_set for the task - * will already exist. If not set, this function might sleep, and can fail with - * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked. + * Must be called with cgroup_mutex and threadgroup locked. */ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, struct task_struct *tsk, struct css_set *newcg) @@ -2025,12 +1984,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) } synchronize_rcu(); - - /* - * wake up rmdir() waiter. the rmdir should fail since the cgroup - * is no longer empty. - */ - cgroup_wakeup_rmdir_waiter(cgrp); out: if (retval) { for_each_subsys(root, ss) { @@ -2200,7 +2153,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * step 5: success! and cleanup */ synchronize_rcu(); - cgroup_wakeup_rmdir_waiter(cgrp); retval = 0; out_put_css_set_refs: if (retval) { @@ -2711,10 +2663,17 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode, /* start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); + inc_nlink(dentry->d_parent->d_inode); - /* start with the directory inode held, so that we can - * populate it without racing with another mkdir */ - mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + /* + * Control reaches here with cgroup_mutex held. + * @inode->i_mutex should nest outside cgroup_mutex but we + * want to populate it immediately without releasing + * cgroup_mutex. As @inode isn't visible to anyone else + * yet, trylock will always succeed without affecting + * lockdep checks. + */ + WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex)); } else if (S_ISREG(mode)) { inode->i_size = 0; inode->i_fop = &cgroup_file_operations; @@ -2725,32 +2684,6 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode, return 0; } -/* - * cgroup_create_dir - create a directory for an object. - * @cgrp: the cgroup we create the directory for. It must have a valid - * ->parent field. And we are going to fill its ->dentry field. - * @dentry: dentry of the new cgroup - * @mode: mode to set on new directory. - */ -static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, - umode_t mode) -{ - struct dentry *parent; - int error = 0; - - parent = cgrp->parent->dentry; - error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb); - if (!error) { - dentry->d_fsdata = cgrp; - inc_nlink(parent->d_inode); - rcu_assign_pointer(cgrp->dentry, dentry); - dget(dentry); - } - dput(dentry); - - return error; -} - /** * cgroup_file_mode - deduce file mode of a control file * @cft: the control file in question @@ -2791,12 +2724,6 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, simple_xattrs_init(&cft->xattrs); - /* does @cft->flags tell us to skip creation on @cgrp? */ - if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) - return 0; - if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) - return 0; - if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { strcpy(name, subsys->name); strcat(name, "."); @@ -2837,6 +2764,12 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, int err, ret = 0; for (cft = cfts; cft->name[0] != '\0'; cft++) { + /* does cft->flags tell us to skip this file on @cgrp? */ + if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) + continue; + if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) + continue; + if (is_add) err = cgroup_add_file(cgrp, subsys, cft); else @@ -3044,6 +2977,92 @@ static void cgroup_enable_task_cg_lists(void) write_unlock(&css_set_lock); } +/** + * cgroup_next_descendant_pre - find the next descendant for pre-order walk + * @pos: the current position (%NULL to initiate traversal) + * @cgroup: cgroup whose descendants to walk + * + * To be used by cgroup_for_each_descendant_pre(). Find the next + * descendant to visit for pre-order traversal of @cgroup's descendants. + */ +struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, + struct cgroup *cgroup) +{ + struct cgroup *next; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* if first iteration, pretend we just visited @cgroup */ + if (!pos) { + if (list_empty(&cgroup->children)) + return NULL; + pos = cgroup; + } + + /* visit the first child if exists */ + next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); + if (next) + return next; + + /* no child, visit my or the closest ancestor's next sibling */ + do { + next = list_entry_rcu(pos->sibling.next, struct cgroup, + sibling); + if (&next->sibling != &pos->parent->children) + return next; + + pos = pos->parent; + } while (pos != cgroup); + + return NULL; +} +EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); + +static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) +{ + struct cgroup *last; + + do { + last = pos; + pos = list_first_or_null_rcu(&pos->children, struct cgroup, + sibling); + } while (pos); + + return last; +} + +/** + * cgroup_next_descendant_post - find the next descendant for post-order walk + * @pos: the current position (%NULL to initiate traversal) + * @cgroup: cgroup whose descendants to walk + * + * To be used by cgroup_for_each_descendant_post(). Find the next + * descendant to visit for post-order traversal of @cgroup's descendants. + */ +struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, + struct cgroup *cgroup) +{ + struct cgroup *next; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* if first iteration, visit the leftmost descendant */ + if (!pos) { + next = cgroup_leftmost_descendant(cgroup); + return next != cgroup ? next : NULL; + } + + /* if there's an unvisited sibling, visit its leftmost descendant */ + next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); + if (&next->sibling != &pos->parent->children) + return cgroup_leftmost_descendant(next); + + /* no sibling left, visit parent */ + next = pos->parent; + return next != cgroup ? next : NULL; +} +EXPORT_SYMBOL_GPL(cgroup_next_descendant_post); + void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) __acquires(css_set_lock) { @@ -3757,7 +3776,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, if (flags & POLLHUP) { __remove_wait_queue(event->wqh, &event->wait); spin_lock(&cgrp->event_list_lock); - list_del(&event->list); + list_del_init(&event->list); spin_unlock(&cgrp->event_list_lock); /* * We are in atomic context, but cgroup_event_remove() may @@ -3894,7 +3913,7 @@ fail: static u64 cgroup_clone_children_read(struct cgroup *cgrp, struct cftype *cft) { - return clone_children(cgrp); + return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); } static int cgroup_clone_children_write(struct cgroup *cgrp, @@ -3902,9 +3921,9 @@ static int cgroup_clone_children_write(struct cgroup *cgrp, u64 val) { if (val) - set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); else - clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); + clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); return 0; } @@ -4017,19 +4036,57 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, css->flags = 0; css->id = NULL; if (cgrp == dummytop) - set_bit(CSS_ROOT, &css->flags); + css->flags |= CSS_ROOT; BUG_ON(cgrp->subsys[ss->subsys_id]); cgrp->subsys[ss->subsys_id] = css; /* - * If !clear_css_refs, css holds an extra ref to @cgrp->dentry - * which is put on the last css_put(). dput() requires process - * context, which css_put() may be called without. @css->dput_work - * will be used to invoke dput() asynchronously from css_put(). + * css holds an extra ref to @cgrp->dentry which is put on the last + * css_put(). dput() requires process context, which css_put() may + * be called without. @css->dput_work will be used to invoke + * dput() asynchronously from css_put(). */ INIT_WORK(&css->dput_work, css_dput_fn); - if (ss->__DEPRECATED_clear_css_refs) - set_bit(CSS_CLEAR_CSS_REFS, &css->flags); +} + +/* invoke ->post_create() on a new CSS and mark it online if successful */ +static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + int ret = 0; + + lockdep_assert_held(&cgroup_mutex); + + if (ss->css_online) + ret = ss->css_online(cgrp); + if (!ret) + cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; + return ret; +} + +/* if the CSS is online, invoke ->pre_destory() on it and mark it offline */ +static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) + __releases(&cgroup_mutex) __acquires(&cgroup_mutex) +{ + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + + lockdep_assert_held(&cgroup_mutex); + + if (!(css->flags & CSS_ONLINE)) + return; + + /* + * css_offline() should be called with cgroup_mutex unlocked. See + * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for + * details. This temporary unlocking should go away once + * cgroup_mutex is unexported from controllers. + */ + if (ss->css_offline) { + mutex_unlock(&cgroup_mutex); + ss->css_offline(cgrp); + mutex_lock(&cgroup_mutex); + } + + cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; } /* @@ -4049,10 +4106,27 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, struct cgroup_subsys *ss; struct super_block *sb = root->sb; + /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); if (!cgrp) return -ENOMEM; + cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); + if (cgrp->id < 0) + goto err_free_cgrp; + + /* + * Only live parents can have children. Note that the liveliness + * check isn't strictly necessary because cgroup_mkdir() and + * cgroup_rmdir() are fully synchronized by i_mutex; however, do it + * anyway so that locking is contained inside cgroup proper and we + * don't get nasty surprises if we ever grow another caller. + */ + if (!cgroup_lock_live_group(parent)) { + err = -ENODEV; + goto err_free_id; + } + /* Grab a reference on the superblock so the hierarchy doesn't * get deleted on unmount if there are child cgroups. This * can be done outside cgroup_mutex, since the sb can't @@ -4060,8 +4134,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, * fs */ atomic_inc(&sb->s_active); - mutex_lock(&cgroup_mutex); - init_cgroup_housekeeping(cgrp); cgrp->parent = parent; @@ -4071,26 +4143,51 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); - if (clone_children(parent)) - set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); + if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); for_each_subsys(root, ss) { struct cgroup_subsys_state *css; - css = ss->create(cgrp); + css = ss->css_alloc(cgrp); if (IS_ERR(css)) { err = PTR_ERR(css); - goto err_destroy; + goto err_free_all; } init_cgroup_css(css, ss, cgrp); if (ss->use_id) { err = alloc_css_id(ss, parent, cgrp); if (err) - goto err_destroy; + goto err_free_all; } - /* At error, ->destroy() callback has to free assigned ID. */ - if (clone_children(parent) && ss->post_clone) - ss->post_clone(cgrp); + } + + /* + * Create directory. cgroup_create_file() returns with the new + * directory locked on success so that it can be populated without + * dropping cgroup_mutex. + */ + err = cgroup_create_file(dentry, S_IFDIR | mode, sb); + if (err < 0) + goto err_free_all; + lockdep_assert_held(&dentry->d_inode->i_mutex); + + /* allocation complete, commit to creation */ + dentry->d_fsdata = cgrp; + cgrp->dentry = dentry; + list_add_tail(&cgrp->allcg_node, &root->allcg_list); + list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); + root->number_of_cgroups++; + + /* each css holds a ref to the cgroup's dentry */ + for_each_subsys(root, ss) + dget(dentry); + + /* creation succeeded, notify subsystems */ + for_each_subsys(root, ss) { + err = online_css(ss, cgrp); + if (err) + goto err_destroy; if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && parent->parent) { @@ -4102,50 +4199,34 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } } - list_add(&cgrp->sibling, &cgrp->parent->children); - root->number_of_cgroups++; - - err = cgroup_create_dir(cgrp, dentry, mode); - if (err < 0) - goto err_remove; - - /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */ - for_each_subsys(root, ss) - if (!ss->__DEPRECATED_clear_css_refs) - dget(dentry); - - /* The cgroup directory was pre-locked for us */ - BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); - - list_add_tail(&cgrp->allcg_node, &root->allcg_list); - err = cgroup_populate_dir(cgrp, true, root->subsys_mask); - /* If err < 0, we have a half-filled directory - oh well ;) */ + if (err) + goto err_destroy; mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); return 0; - err_remove: - - list_del(&cgrp->sibling); - root->number_of_cgroups--; - - err_destroy: - +err_free_all: for_each_subsys(root, ss) { if (cgrp->subsys[ss->subsys_id]) - ss->destroy(cgrp); + ss->css_free(cgrp); } - mutex_unlock(&cgroup_mutex); - /* Release the reference count that we took on the superblock */ deactivate_super(sb); - +err_free_id: + ida_simple_remove(&root->cgroup_ida, cgrp->id); +err_free_cgrp: kfree(cgrp); return err; + +err_destroy: + cgroup_destroy_locked(cgrp); + mutex_unlock(&cgroup_mutex); + mutex_unlock(&dentry->d_inode->i_mutex); + return err; } static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) @@ -4197,153 +4278,60 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) return 0; } -/* - * Atomically mark all (or else none) of the cgroup's CSS objects as - * CSS_REMOVED. Return true on success, or false if the cgroup has - * busy subsystems. Call with cgroup_mutex held - * - * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or - * not, cgroup removal behaves differently. - * - * If clear is set, css refcnt for the subsystem should be zero before - * cgroup removal can be committed. This is implemented by - * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be - * called multiple times until all css refcnts reach zero and is allowed to - * veto removal on any invocation. This behavior is deprecated and will be - * removed as soon as the existing user (memcg) is updated. - * - * If clear is not set, each css holds an extra reference to the cgroup's - * dentry and cgroup removal proceeds regardless of css refs. - * ->pre_destroy() will be called at least once and is not allowed to fail. - * On the last put of each css, whenever that may be, the extra dentry ref - * is put so that dentry destruction happens only after all css's are - * released. - */ -static int cgroup_clear_css_refs(struct cgroup *cgrp) +static int cgroup_destroy_locked(struct cgroup *cgrp) + __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { + struct dentry *d = cgrp->dentry; + struct cgroup *parent = cgrp->parent; + DEFINE_WAIT(wait); + struct cgroup_event *event, *tmp; struct cgroup_subsys *ss; - unsigned long flags; - bool failed = false; + LIST_HEAD(tmp_list); + + lockdep_assert_held(&d->d_inode->i_mutex); + lockdep_assert_held(&cgroup_mutex); - local_irq_save(flags); + if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) + return -EBUSY; /* - * Block new css_tryget() by deactivating refcnt. If all refcnts - * for subsystems w/ clear_css_refs set were 1 at the moment of - * deactivation, we succeeded. + * Block new css_tryget() by deactivating refcnt and mark @cgrp + * removed. This makes future css_tryget() and child creation + * attempts fail thus maintaining the removal conditions verified + * above. */ for_each_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; WARN_ON(atomic_read(&css->refcnt) < 0); atomic_add(CSS_DEACT_BIAS, &css->refcnt); - - if (ss->__DEPRECATED_clear_css_refs) - failed |= css_refcnt(css) != 1; - } - - /* - * If succeeded, set REMOVED and put all the base refs; otherwise, - * restore refcnts to positive values. Either way, all in-progress - * css_tryget() will be released. - */ - for_each_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; - - if (!failed) { - set_bit(CSS_REMOVED, &css->flags); - css_put(css); - } else { - atomic_sub(CSS_DEACT_BIAS, &css->refcnt); - } } + set_bit(CGRP_REMOVED, &cgrp->flags); - local_irq_restore(flags); - return !failed; -} - -static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) -{ - struct cgroup *cgrp = dentry->d_fsdata; - struct dentry *d; - struct cgroup *parent; - DEFINE_WAIT(wait); - struct cgroup_event *event, *tmp; - int ret; - - /* the vfs holds both inode->i_mutex already */ -again: - mutex_lock(&cgroup_mutex); - if (atomic_read(&cgrp->count) != 0) { - mutex_unlock(&cgroup_mutex); - return -EBUSY; - } - if (!list_empty(&cgrp->children)) { - mutex_unlock(&cgroup_mutex); - return -EBUSY; - } - mutex_unlock(&cgroup_mutex); - - /* - * In general, subsystem has no css->refcnt after pre_destroy(). But - * in racy cases, subsystem may have to get css->refcnt after - * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes - * make rmdir return -EBUSY too often. To avoid that, we use waitqueue - * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir - * and subsystem's reference count handling. Please see css_get/put - * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. - */ - set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); + /* tell subsystems to initate destruction */ + for_each_subsys(cgrp->root, ss) + offline_css(ss, cgrp); /* - * Call pre_destroy handlers of subsys. Notify subsystems - * that rmdir() request comes. + * Put all the base refs. Each css holds an extra reference to the + * cgroup's dentry and cgroup removal proceeds regardless of css + * refs. On the last put of each css, whenever that may be, the + * extra dentry ref is put so that dentry destruction happens only + * after all css's are released. */ - ret = cgroup_call_pre_destroy(cgrp); - if (ret) { - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - return ret; - } - - mutex_lock(&cgroup_mutex); - parent = cgrp->parent; - if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - mutex_unlock(&cgroup_mutex); - return -EBUSY; - } - prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); - if (!cgroup_clear_css_refs(cgrp)) { - mutex_unlock(&cgroup_mutex); - /* - * Because someone may call cgroup_wakeup_rmdir_waiter() before - * prepare_to_wait(), we need to check this flag. - */ - if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)) - schedule(); - finish_wait(&cgroup_rmdir_waitq, &wait); - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - if (signal_pending(current)) - return -EINTR; - goto again; - } - /* NO css_tryget() can success after here. */ - finish_wait(&cgroup_rmdir_waitq, &wait); - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); + for_each_subsys(cgrp->root, ss) + css_put(cgrp->subsys[ss->subsys_id]); raw_spin_lock(&release_list_lock); - set_bit(CGRP_REMOVED, &cgrp->flags); if (!list_empty(&cgrp->release_list)) list_del_init(&cgrp->release_list); raw_spin_unlock(&release_list_lock); /* delete this cgroup from parent->children */ - list_del_init(&cgrp->sibling); - + list_del_rcu(&cgrp->sibling); list_del_init(&cgrp->allcg_node); - d = dget(cgrp->dentry); - + dget(d); cgroup_d_remove_dir(d); dput(d); @@ -4353,21 +4341,35 @@ again: /* * Unregister events and notify userspace. * Notify userspace about cgroup removing only after rmdir of cgroup - * directory to avoid race between userspace and kernelspace + * directory to avoid race between userspace and kernelspace. Use + * a temporary list to avoid a deadlock with cgroup_event_wake(). Since + * cgroup_event_wake() is called with the wait queue head locked, + * remove_wait_queue() cannot be called while holding event_list_lock. */ spin_lock(&cgrp->event_list_lock); - list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { - list_del(&event->list); + list_splice_init(&cgrp->event_list, &tmp_list); + spin_unlock(&cgrp->event_list_lock); + list_for_each_entry_safe(event, tmp, &tmp_list, list) { + list_del_init(&event->list); remove_wait_queue(event->wqh, &event->wait); eventfd_signal(event->eventfd, 1); schedule_work(&event->remove); } - spin_unlock(&cgrp->event_list_lock); - mutex_unlock(&cgroup_mutex); return 0; } +static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) +{ + int ret; + + mutex_lock(&cgroup_mutex); + ret = cgroup_destroy_locked(dentry->d_fsdata); + mutex_unlock(&cgroup_mutex); + + return ret; +} + static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) { INIT_LIST_HEAD(&ss->cftsets); @@ -4388,13 +4390,15 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); + mutex_lock(&cgroup_mutex); + /* init base cftset */ cgroup_init_cftsets(ss); /* Create the top cgroup state for this subsystem */ list_add(&ss->sibling, &rootnode.subsys_list); ss->root = &rootnode; - css = ss->create(dummytop); + css = ss->css_alloc(dummytop); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_cgroup_css(css, ss, dummytop); @@ -4403,7 +4407,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * pointer to this state - since the subsystem is * newly registered, all tasks and hence the * init_css_set is in the subsystem's top cgroup. */ - init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; + init_css_set.subsys[ss->subsys_id] = css; need_forkexit_callback |= ss->fork || ss->exit; @@ -4413,6 +4417,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(!list_empty(&init_task.tasks)); ss->active = 1; + BUG_ON(online_css(ss, dummytop)); + + mutex_unlock(&cgroup_mutex); /* this function shouldn't be used with modular subsystems, since they * need to register a subsys_id, among other things */ @@ -4430,12 +4437,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) */ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) { - int i; struct cgroup_subsys_state *css; + int i, ret; /* check name and function validity */ if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || - ss->create == NULL || ss->destroy == NULL) + ss->css_alloc == NULL || ss->css_free == NULL) return -EINVAL; /* @@ -4464,10 +4471,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) subsys[ss->subsys_id] = ss; /* - * no ss->create seems to need anything important in the ss struct, so - * this can happen first (i.e. before the rootnode attachment). + * no ss->css_alloc seems to need anything important in the ss + * struct, so this can happen first (i.e. before the rootnode + * attachment). */ - css = ss->create(dummytop); + css = ss->css_alloc(dummytop); if (IS_ERR(css)) { /* failure case - need to deassign the subsys[] slot. */ subsys[ss->subsys_id] = NULL; @@ -4482,14 +4490,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) init_cgroup_css(css, ss, dummytop); /* init_idr must be after init_cgroup_css because it sets css->id. */ if (ss->use_id) { - int ret = cgroup_init_idr(ss, css); - if (ret) { - dummytop->subsys[ss->subsys_id] = NULL; - ss->destroy(dummytop); - subsys[ss->subsys_id] = NULL; - mutex_unlock(&cgroup_mutex); - return ret; - } + ret = cgroup_init_idr(ss, css); + if (ret) + goto err_unload; } /* @@ -4522,10 +4525,19 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) write_unlock(&css_set_lock); ss->active = 1; + ret = online_css(ss, dummytop); + if (ret) + goto err_unload; /* success! */ mutex_unlock(&cgroup_mutex); return 0; + +err_unload: + mutex_unlock(&cgroup_mutex); + /* @ss can't be mounted here as try_module_get() would fail */ + cgroup_unload_subsys(ss); + return ret; } EXPORT_SYMBOL_GPL(cgroup_load_subsys); @@ -4552,6 +4564,15 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) BUG_ON(ss->root != &rootnode); mutex_lock(&cgroup_mutex); + + offline_css(ss, dummytop); + ss->active = 0; + + if (ss->use_id) { + idr_remove_all(&ss->idr); + idr_destroy(&ss->idr); + } + /* deassign the subsys_id */ subsys[ss->subsys_id] = NULL; @@ -4567,7 +4588,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) struct css_set *cg = link->cg; hlist_del(&cg->hlist); - BUG_ON(!cg->subsys[ss->subsys_id]); cg->subsys[ss->subsys_id] = NULL; hhead = css_set_hash(cg->subsys); hlist_add_head(&cg->hlist, hhead); @@ -4575,12 +4595,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) write_unlock(&css_set_lock); /* - * remove subsystem's css from the dummytop and free it - need to free - * before marking as null because ss->destroy needs the cgrp->subsys - * pointer to find their state. note that this also takes care of - * freeing the css_id. + * remove subsystem's css from the dummytop and free it - need to + * free before marking as null because ss->css_free needs the + * cgrp->subsys pointer to find their state. note that this also + * takes care of freeing the css_id. */ - ss->destroy(dummytop); + ss->css_free(dummytop); dummytop->subsys[ss->subsys_id] = NULL; mutex_unlock(&cgroup_mutex); @@ -4624,8 +4644,8 @@ int __init cgroup_init_early(void) BUG_ON(!ss->name); BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); - BUG_ON(!ss->create); - BUG_ON(!ss->destroy); + BUG_ON(!ss->css_alloc); + BUG_ON(!ss->css_free); if (ss->subsys_id != i) { printk(KERN_ERR "cgroup: Subsys %s id == %d\n", ss->name, ss->subsys_id); @@ -4832,44 +4852,19 @@ void cgroup_fork(struct task_struct *child) } /** - * cgroup_fork_callbacks - run fork callbacks - * @child: the new task - * - * Called on a new task very soon before adding it to the - * tasklist. No need to take any locks since no-one can - * be operating on this task. - */ -void cgroup_fork_callbacks(struct task_struct *child) -{ - if (need_forkexit_callback) { - int i; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - - /* - * forkexit callbacks are only supported for - * builtin subsystems. - */ - if (!ss || ss->module) - continue; - - if (ss->fork) - ss->fork(child); - } - } -} - -/** * cgroup_post_fork - called on a new task after adding it to the task list * @child: the task in question * - * Adds the task to the list running through its css_set if necessary. - * Has to be after the task is visible on the task list in case we race - * with the first call to cgroup_iter_start() - to guarantee that the - * new task ends up on its list. + * Adds the task to the list running through its css_set if necessary and + * call the subsystem fork() callbacks. Has to be after the task is + * visible on the task list in case we race with the first call to + * cgroup_iter_start() - to guarantee that the new task ends up on its + * list. */ void cgroup_post_fork(struct task_struct *child) { + int i; + /* * use_task_css_set_links is set to 1 before we walk the tasklist * under the tasklist_lock and we read it here after we added the child @@ -4889,7 +4884,30 @@ void cgroup_post_fork(struct task_struct *child) task_unlock(child); write_unlock(&css_set_lock); } + + /* + * Call ss->fork(). This must happen after @child is linked on + * css_set; otherwise, @child might change state between ->fork() + * and addition to css_set. + */ + if (need_forkexit_callback) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + + /* + * fork/exit callbacks are supported only for + * builtin subsystems and we don't need further + * synchronization as they never go away. + */ + if (!ss || ss->module) + continue; + + if (ss->fork) + ss->fork(child); + } + } } + /** * cgroup_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process @@ -5022,15 +5040,17 @@ static void check_for_release(struct cgroup *cgrp) /* Caller must verify that the css is not for root cgroup */ bool __css_tryget(struct cgroup_subsys_state *css) { - do { - int v = css_refcnt(css); + while (true) { + int t, v; - if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) + v = css_refcnt(css); + t = atomic_cmpxchg(&css->refcnt, v, v + 1); + if (likely(t == v)) return true; + else if (t < 0) + return false; cpu_relax(); - } while (!test_bit(CSS_REMOVED, &css->flags)); - - return false; + } } EXPORT_SYMBOL_GPL(__css_tryget); @@ -5049,11 +5069,9 @@ void __css_put(struct cgroup_subsys_state *css) set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } - cgroup_wakeup_rmdir_waiter(cgrp); break; case 0: - if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) - schedule_work(&css->dput_work); + schedule_work(&css->dput_work); break; } rcu_read_unlock(); @@ -5439,7 +5457,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) } #ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state *debug_create(struct cgroup *cont) +static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont) { struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); @@ -5449,7 +5467,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup *cont) return css; } -static void debug_destroy(struct cgroup *cont) +static void debug_css_free(struct cgroup *cont) { kfree(cont->subsys[debug_subsys_id]); } @@ -5578,8 +5596,8 @@ static struct cftype debug_files[] = { struct cgroup_subsys debug_subsys = { .name = "debug", - .create = debug_create, - .destroy = debug_destroy, + .css_alloc = debug_css_alloc, + .css_free = debug_css_free, .subsys_id = debug_subsys_id, .base_cftypes = debug_files, }; |