Diffstat (limited to 'kernel')
48 files changed, 2656 insertions, 1520 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 70776aec2562..7dc8788cfd52 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -149,12 +149,14 @@ struct cgroup_root cgrp_dfl_root; */ static bool cgrp_dfl_root_visible; +/* + * Set by the boot param of the same name and makes subsystems with NULL + * ->dfl_files to use ->legacy_files on the default hierarchy. + */ +static bool cgroup_legacy_files_on_dfl; + /* some controllers are not supported in the default hierarchy */ -static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0 -#ifdef CONFIG_CGROUP_DEBUG - | (1 << debug_cgrp_id) -#endif - ; +static unsigned int cgrp_dfl_root_inhibit_ss_mask; /* The list of hierarchy roots */ @@ -180,13 +182,15 @@ static u64 css_serial_nr_next = 1; */ static int need_forkexit_callback __read_mostly; -static struct cftype cgroup_base_files[]; +static struct cftype cgroup_dfl_base_files[]; +static struct cftype cgroup_legacy_base_files[]; static void cgroup_put(struct cgroup *cgrp); static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask); static int cgroup_destroy_locked(struct cgroup *cgrp); -static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); +static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, + bool visible); static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], @@ -1037,6 +1041,58 @@ static void cgroup_put(struct cgroup *cgrp) } /** + * cgroup_refresh_child_subsys_mask - update child_subsys_mask + * @cgrp: the target cgroup + * + * On the default hierarchy, a subsystem may request other subsystems to be + * enabled together through its ->depends_on mask. In such cases, more + * subsystems than specified in "cgroup.subtree_control" may be enabled. + * + * This function determines which subsystems need to be enabled given the + * current @cgrp->subtree_control and records it in + * @cgrp->child_subsys_mask. The resulting mask is always a superset of + * @cgrp->subtree_control and follows the usual hierarchy rules. + */ +static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) +{ + struct cgroup *parent = cgroup_parent(cgrp); + unsigned int cur_ss_mask = cgrp->subtree_control; + struct cgroup_subsys *ss; + int ssid; + + lockdep_assert_held(&cgroup_mutex); + + if (!cgroup_on_dfl(cgrp)) { + cgrp->child_subsys_mask = cur_ss_mask; + return; + } + + while (true) { + unsigned int new_ss_mask = cur_ss_mask; + + for_each_subsys(ss, ssid) + if (cur_ss_mask & (1 << ssid)) + new_ss_mask |= ss->depends_on; + + /* + * Mask out subsystems which aren't available. This can + * happen only if some depended-upon subsystems were bound + * to non-default hierarchies. 
+ */ + if (parent) + new_ss_mask &= parent->child_subsys_mask; + else + new_ss_mask &= cgrp->root->subsys_mask; + + if (new_ss_mask == cur_ss_mask) + break; + cur_ss_mask = new_ss_mask; + } + + cgrp->child_subsys_mask = cur_ss_mask; +} + +/** * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced * @@ -1208,12 +1264,15 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) up_write(&css_set_rwsem); src_root->subsys_mask &= ~(1 << ssid); - src_root->cgrp.child_subsys_mask &= ~(1 << ssid); + src_root->cgrp.subtree_control &= ~(1 << ssid); + cgroup_refresh_child_subsys_mask(&src_root->cgrp); /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; - if (dst_root != &cgrp_dfl_root) - dst_root->cgrp.child_subsys_mask |= 1 << ssid; + if (dst_root != &cgrp_dfl_root) { + dst_root->cgrp.subtree_control |= 1 << ssid; + cgroup_refresh_child_subsys_mask(&dst_root->cgrp); + } if (ss->bind) ss->bind(css); @@ -1233,8 +1292,6 @@ static int cgroup_show_options(struct seq_file *seq, for_each_subsys(ss, ssid) if (root->subsys_mask & (1 << ssid)) seq_printf(seq, ",%s", ss->name); - if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) - seq_puts(seq, ",sane_behavior"); if (root->flags & CGRP_ROOT_NOPREFIX) seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) @@ -1268,6 +1325,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) bool all_ss = false, one_ss = false; unsigned int mask = -1U; struct cgroup_subsys *ss; + int nr_opts = 0; int i; #ifdef CONFIG_CPUSETS @@ -1277,6 +1335,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) memset(opts, 0, sizeof(*opts)); while ((token = strsep(&o, ",")) != NULL) { + nr_opts++; + if (!*token) return -EINVAL; if (!strcmp(token, "none")) { @@ -1361,37 +1421,33 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return -ENOENT; } - /* Consistency checks */ - if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); - - if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || - opts->cpuset_clone_children || opts->release_agent || - opts->name) { - pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); + if (nr_opts != 1) { + pr_err("sane_behavior: no other mount options allowed\n"); return -EINVAL; } - } else { - /* - * If the 'all' option was specified select all the - * subsystems, otherwise if 'none', 'name=' and a subsystem - * name options were not specified, let's default to 'all' - */ - if (all_ss || (!one_ss && !opts->none && !opts->name)) - for_each_subsys(ss, i) - if (!ss->disabled) - opts->subsys_mask |= (1 << i); - - /* - * We either have to specify by name or by subsystems. (So - * all empty hierarchies must have a name). - */ - if (!opts->subsys_mask && !opts->name) - return -EINVAL; + return 0; } /* + * If the 'all' option was specified select all the subsystems, + * otherwise if 'none', 'name=' and a subsystem name options were + * not specified, let's default to 'all' + */ + if (all_ss || (!one_ss && !opts->none && !opts->name)) + for_each_subsys(ss, i) + if (!ss->disabled) + opts->subsys_mask |= (1 << i); + + /* + * We either have to specify by name or by subsystems. (So all + * empty hierarchies must have a name). 
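Annotation: the fixed-point loop in cgroup_refresh_child_subsys_mask() above is easier to see in isolation. Below is a stand-alone sketch of the same idea — illustrative only: the subsystem IDs, the depends_on table and the available_mask parameter are invented for the example, and the kernel's for_each_subsys/locking machinery is dropped.

    #include <stdio.h>

    /* Hypothetical subsystem IDs and dependency masks, for illustration only. */
    enum { SS_A, SS_B, SS_C, NR_SS };

    static const unsigned int depends_on[NR_SS] = {
        [SS_A] = 1 << SS_B,     /* A pulls in B */
        [SS_B] = 1 << SS_C,     /* B pulls in C */
        [SS_C] = 0,
    };

    /* Expand @subtree_control until no new dependencies appear. */
    static unsigned int refresh_child_subsys_mask(unsigned int subtree_control,
                                                  unsigned int available_mask)
    {
        unsigned int cur = subtree_control;

        while (1) {
            unsigned int new_mask = cur;
            int ssid;

            for (ssid = 0; ssid < NR_SS; ssid++)
                if (cur & (1 << ssid))
                    new_mask |= depends_on[ssid];

            /* drop subsystems which aren't available on this hierarchy */
            new_mask &= available_mask;

            if (new_mask == cur)
                break;
            cur = new_mask;
        }
        return cur;
    }

    int main(void)
    {
        /* enabling only A transitively enables A, B and C -> 0x7 */
        printf("%#x\n", refresh_child_subsys_mask(1 << SS_A, ~0u));
        return 0;
    }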
+ */ + if (!opts->subsys_mask && !opts->name) + return -EINVAL; + + /* * Option noprefix was introduced just for backward compatibility * with the old cpuset, so we allow noprefix only if mounting just * the cpuset subsystem. @@ -1399,7 +1455,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) return -EINVAL; - /* Can't specify "none" and some subsystems */ if (opts->subsys_mask && opts->none) return -EINVAL; @@ -1414,8 +1469,8 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) struct cgroup_sb_opts opts; unsigned int added_mask, removed_mask; - if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { - pr_err("sane_behavior: remount is not allowed\n"); + if (root == &cgrp_dfl_root) { + pr_err("remount is not allowed\n"); return -EINVAL; } @@ -1434,11 +1489,10 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) removed_mask = root->subsys_mask & ~opts.subsys_mask; /* Don't allow flags or name to change at remount */ - if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || + if ((opts.flags ^ root->flags) || (opts.name && strcmp(opts.name, root->name))) { pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", - opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", - root->flags & CGRP_ROOT_OPTION_MASK, root->name); + opts.flags, opts.name ?: "", root->flags, root->name); ret = -EINVAL; goto out_unlock; } @@ -1563,6 +1617,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; + struct cftype *base_files; struct css_set *cset; int i, ret; @@ -1600,7 +1655,12 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) } root_cgrp->kn = root->kf_root->kn; - ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); + if (root == &cgrp_dfl_root) + base_files = cgroup_dfl_base_files; + else + base_files = cgroup_legacy_base_files; + + ret = cgroup_addrm_files(root_cgrp, base_files, true); if (ret) goto destroy_root; @@ -1638,7 +1698,7 @@ destroy_root: exit_root_id: cgroup_exit_root_id(root); cancel_ref: - percpu_ref_cancel_init(&root_cgrp->self.refcnt); + percpu_ref_exit(&root_cgrp->self.refcnt); out: free_cgrp_cset_links(&tmp_links); return ret; @@ -1672,7 +1732,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, goto out_unlock; /* look for a matching existing root */ - if (!opts.subsys_mask && !opts.none && !opts.name) { + if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) { cgrp_dfl_root_visible = true; root = &cgrp_dfl_root; cgroup_get(&root->cgrp); @@ -1730,15 +1790,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, goto out_unlock; } - if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { - if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { - pr_err("sane_behavior: new mount options should match the existing superblock\n"); - ret = -EINVAL; - goto out_unlock; - } else { - pr_warn("new mount options do not match the existing superblock, will be ignored\n"); - } - } + if (root->flags ^ opts.flags) + pr_warn("new mount options do not match the existing superblock, will be ignored\n"); /* * We want to reuse @root whose lifetime is governed by its @@ -2457,9 +2510,7 @@ static int cgroup_release_agent_show(struct seq_file *seq, void *v) static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) { - struct cgroup *cgrp = seq_css(seq)->cgroup; - - seq_printf(seq, 
"%d\n", cgroup_sane_behavior(cgrp)); + seq_puts(seq, "0\n"); return 0; } @@ -2496,7 +2547,7 @@ static int cgroup_controllers_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask); + cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control); return 0; } @@ -2505,7 +2556,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - cgroup_print_ss_mask(seq, cgrp->child_subsys_mask); + cgroup_print_ss_mask(seq, cgrp->subtree_control); return 0; } @@ -2611,6 +2662,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, loff_t off) { unsigned int enable = 0, disable = 0; + unsigned int css_enable, css_disable, old_ctrl, new_ctrl; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; @@ -2650,11 +2702,26 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, for_each_subsys(ss, ssid) { if (enable & (1 << ssid)) { - if (cgrp->child_subsys_mask & (1 << ssid)) { + if (cgrp->subtree_control & (1 << ssid)) { enable &= ~(1 << ssid); continue; } + /* unavailable or not enabled on the parent? */ + if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || + (cgroup_parent(cgrp) && + !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) { + ret = -ENOENT; + goto out_unlock; + } + + /* + * @ss is already enabled through dependency and + * we'll just make it visible. Skip draining. + */ + if (cgrp->child_subsys_mask & (1 << ssid)) + continue; + /* * Because css offlining is asynchronous, userland * might try to re-enable the same controller while @@ -2677,23 +2744,15 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, return restart_syscall(); } - - /* unavailable or not enabled on the parent? */ - if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || - (cgroup_parent(cgrp) && - !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) { - ret = -ENOENT; - goto out_unlock; - } } else if (disable & (1 << ssid)) { - if (!(cgrp->child_subsys_mask & (1 << ssid))) { + if (!(cgrp->subtree_control & (1 << ssid))) { disable &= ~(1 << ssid); continue; } /* a child has it enabled? */ cgroup_for_each_live_child(child, cgrp) { - if (child->child_subsys_mask & (1 << ssid)) { + if (child->subtree_control & (1 << ssid)) { ret = -EBUSY; goto out_unlock; } @@ -2707,7 +2766,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, } /* - * Except for the root, child_subsys_mask must be zero for a cgroup + * Except for the root, subtree_control must be zero for a cgroup * with tasks so that child cgroups don't compete against tasks. */ if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { @@ -2716,36 +2775,75 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, } /* - * Create csses for enables and update child_subsys_mask. This - * changes cgroup_e_css() results which in turn makes the - * subsequent cgroup_update_dfl_csses() associate all tasks in the - * subtree to the updated csses. + * Update subsys masks and calculate what needs to be done. More + * subsystems than specified may need to be enabled or disabled + * depending on subsystem dependencies. 
+ */ + cgrp->subtree_control |= enable; + cgrp->subtree_control &= ~disable; + + old_ctrl = cgrp->child_subsys_mask; + cgroup_refresh_child_subsys_mask(cgrp); + new_ctrl = cgrp->child_subsys_mask; + + css_enable = ~old_ctrl & new_ctrl; + css_disable = old_ctrl & ~new_ctrl; + enable |= css_enable; + disable |= css_disable; + + /* + * Create new csses or make the existing ones visible. A css is + * created invisible if it's being implicitly enabled through + * dependency. An invisible css is made visible when the userland + * explicitly enables it. */ for_each_subsys(ss, ssid) { if (!(enable & (1 << ssid))) continue; cgroup_for_each_live_child(child, cgrp) { - ret = create_css(child, ss); + if (css_enable & (1 << ssid)) + ret = create_css(child, ss, + cgrp->subtree_control & (1 << ssid)); + else + ret = cgroup_populate_dir(child, 1 << ssid); if (ret) goto err_undo_css; } } - cgrp->child_subsys_mask |= enable; - cgrp->child_subsys_mask &= ~disable; - + /* + * At this point, cgroup_e_css() results reflect the new csses + * making the following cgroup_update_dfl_csses() properly update + * css associations of all tasks in the subtree. + */ ret = cgroup_update_dfl_csses(cgrp); if (ret) goto err_undo_css; - /* all tasks are now migrated away from the old csses, kill them */ + /* + * All tasks are migrated out of disabled csses. Kill or hide + * them. A css is hidden when the userland requests it to be + * disabled while other subsystems are still depending on it. The + * css must not actively control resources and be in the vanilla + * state if it's made visible again later. Controllers which may + * be depended upon should provide ->css_reset() for this purpose. + */ for_each_subsys(ss, ssid) { if (!(disable & (1 << ssid))) continue; - cgroup_for_each_live_child(child, cgrp) - kill_css(cgroup_css(child, ss)); + cgroup_for_each_live_child(child, cgrp) { + struct cgroup_subsys_state *css = cgroup_css(child, ss); + + if (css_disable & (1 << ssid)) { + kill_css(css); + } else { + cgroup_clear_dir(child, 1 << ssid); + if (ss->css_reset) + ss->css_reset(css); + } + } } kernfs_activate(cgrp->kn); @@ -2755,8 +2853,9 @@ out_unlock: return ret ?: nbytes; err_undo_css: - cgrp->child_subsys_mask &= ~enable; - cgrp->child_subsys_mask |= disable; + cgrp->subtree_control &= ~enable; + cgrp->subtree_control |= disable; + cgroup_refresh_child_subsys_mask(cgrp); for_each_subsys(ss, ssid) { if (!(enable & (1 << ssid))) @@ -2764,8 +2863,14 @@ err_undo_css: cgroup_for_each_live_child(child, cgrp) { struct cgroup_subsys_state *css = cgroup_css(child, ss); - if (css) + + if (!css) + continue; + + if (css_enable & (1 << ssid)) kill_css(css); + else + cgroup_clear_dir(child, 1 << ssid); } } goto out_unlock; @@ -2878,9 +2983,9 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, /* * This isn't a proper migration and its usefulness is very - * limited. Disallow if sane_behavior. + * limited. Disallow on the default hierarchy. */ - if (cgroup_sane_behavior(cgrp)) + if (cgroup_on_dfl(cgrp)) return -EPERM; /* @@ -2964,9 +3069,9 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], for (cft = cfts; cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? 
*/ - if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) + if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) continue; - if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) + if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp)) continue; if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) continue; @@ -3024,6 +3129,9 @@ static void cgroup_exit_cftypes(struct cftype *cfts) kfree(cft->kf_ops); cft->kf_ops = NULL; cft->ss = NULL; + + /* revert flags set by cgroup core while adding @cfts */ + cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL); } } @@ -3109,7 +3217,7 @@ int cgroup_rm_cftypes(struct cftype *cfts) * function currently returns 0 as long as @cfts registration is successful * even if some file creation attempts on existing cgroups fail. */ -int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { int ret; @@ -3135,6 +3243,40 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) } /** + * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy + * @ss: target cgroup subsystem + * @cfts: zero-length name terminated array of cftypes + * + * Similar to cgroup_add_cftypes() but the added files are only used for + * the default hierarchy. + */ +int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +{ + struct cftype *cft; + + for (cft = cfts; cft && cft->name[0] != '\0'; cft++) + cft->flags |= __CFTYPE_ONLY_ON_DFL; + return cgroup_add_cftypes(ss, cfts); +} + +/** + * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies + * @ss: target cgroup subsystem + * @cfts: zero-length name terminated array of cftypes + * + * Similar to cgroup_add_cftypes() but the added files are only used for + * the legacy hierarchies. + */ +int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +{ + struct cftype *cft; + + for (cft = cfts; cft && cft->name[0] != '\0'; cft++) + cft->flags |= __CFTYPE_NOT_ON_DFL; + return cgroup_add_cftypes(ss, cfts); +} + +/** * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question * @@ -3699,8 +3841,9 @@ after: * * All this extra complexity was caused by the original implementation * committing to an entirely unnecessary property. In the long term, we - * want to do away with it. Explicitly scramble sort order if - * sane_behavior so that no such expectation exists in the new interface. + * want to do away with it. Explicitly scramble sort order if on the + * default hierarchy so that no such expectation exists in the new + * interface. * * Scrambling is done by swapping every two consecutive bits, which is * non-identity one-to-one mapping which disturbs sort order sufficiently. 
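Annotation: the diff only touches pid_fry()'s callers, so the scramble itself is not visible here. For reference, a "swap every two consecutive bits" permutation of the kind the comment describes can be written as below — a sketch matching the description, not necessarily the exact kernel implementation.

    #include <stdio.h>
    #include <sys/types.h>

    /*
     * Swap every two consecutive bits: even-position bits move left,
     * odd-position bits move right. This is a one-to-one mapping, so
     * sorting by the scrambled value still gives a stable, de-duplicated
     * listing, but the order no longer matches plain numeric pid order.
     */
    static pid_t pid_fry(pid_t pid)
    {
        unsigned int even = pid & 0x55555555;
        unsigned int odd  = pid & 0xAAAAAAAA;

        return (even << 1) | (odd >> 1);
    }

    int main(void)
    {
        printf("%d -> %d\n", 1234, pid_fry(1234));
        return 0;
    }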
@@ -3715,7 +3858,7 @@ static pid_t pid_fry(pid_t pid) static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid) { - if (cgroup_sane_behavior(cgrp)) + if (cgroup_on_dfl(cgrp)) return pid_fry(pid); else return pid; @@ -3818,7 +3961,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, css_task_iter_end(&it); length = n; /* now sort & (if procs) strip out duplicates */ - if (cgroup_sane_behavior(cgrp)) + if (cgroup_on_dfl(cgrp)) sort(array, length, sizeof(pid_t), fried_cmppid, NULL); else sort(array, length, sizeof(pid_t), cmppid, NULL); @@ -4040,7 +4183,8 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css, return 0; } -static struct cftype cgroup_base_files[] = { +/* cgroup core interface files for the default hierarchy */ +static struct cftype cgroup_dfl_base_files[] = { { .name = "cgroup.procs", .seq_start = cgroup_pidlist_start, @@ -4052,46 +4196,52 @@ static struct cftype cgroup_base_files[] = { .mode = S_IRUGO | S_IWUSR, }, { - .name = "cgroup.clone_children", - .flags = CFTYPE_INSANE, - .read_u64 = cgroup_clone_children_read, - .write_u64 = cgroup_clone_children_write, - }, - { - .name = "cgroup.sane_behavior", - .flags = CFTYPE_ONLY_ON_ROOT, - .seq_show = cgroup_sane_behavior_show, - }, - { .name = "cgroup.controllers", - .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT, + .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_root_controllers_show, }, { .name = "cgroup.controllers", - .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, + .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_controllers_show, }, { .name = "cgroup.subtree_control", - .flags = CFTYPE_ONLY_ON_DFL, .seq_show = cgroup_subtree_control_show, .write = cgroup_subtree_control_write, }, { .name = "cgroup.populated", - .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, + .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_populated_show, }, + { } /* terminate */ +}; - /* - * Historical crazy stuff. These don't have "cgroup." prefix and - * don't exist if sane_behavior. If you're depending on these, be - * prepared to be burned. 
- */ +/* cgroup core interface files for the legacy hierarchies */ +static struct cftype cgroup_legacy_base_files[] = { + { + .name = "cgroup.procs", + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_PROCS, + .write = cgroup_procs_write, + .mode = S_IRUGO | S_IWUSR, + }, + { + .name = "cgroup.clone_children", + .read_u64 = cgroup_clone_children_read, + .write_u64 = cgroup_clone_children_write, + }, + { + .name = "cgroup.sane_behavior", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = cgroup_sane_behavior_show, + }, { .name = "tasks", - .flags = CFTYPE_INSANE, /* use "procs" instead */ .seq_start = cgroup_pidlist_start, .seq_next = cgroup_pidlist_next, .seq_stop = cgroup_pidlist_stop, @@ -4102,13 +4252,12 @@ static struct cftype cgroup_base_files[] = { }, { .name = "notify_on_release", - .flags = CFTYPE_INSANE, .read_u64 = cgroup_read_notify_on_release, .write_u64 = cgroup_write_notify_on_release, }, { .name = "release_agent", - .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, + .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_release_agent_show, .write = cgroup_release_agent_write, .max_write_len = PATH_MAX - 1, @@ -4175,6 +4324,8 @@ static void css_free_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup *cgrp = css->cgroup; + percpu_ref_exit(&css->refcnt); + if (css->ss) { /* css free path */ if (css->parent) @@ -4314,12 +4465,14 @@ static void offline_css(struct cgroup_subsys_state *css) * create_css - create a cgroup_subsys_state * @cgrp: the cgroup new css will be associated with * @ss: the subsys of new css + * @visible: whether to create control knobs for the new css or not * * Create a new css associated with @cgrp - @ss pair. On success, the new - * css is online and installed in @cgrp with all interface files created. - * Returns 0 on success, -errno on failure. + * css is online and installed in @cgrp with all interface files created if + * @visible. Returns 0 on success, -errno on failure. 
*/ -static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) +static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, + bool visible) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); @@ -4343,9 +4496,11 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) goto err_free_percpu_ref; css->id = err; - err = cgroup_populate_dir(cgrp, 1 << ss->id); - if (err) - goto err_free_id; + if (visible) { + err = cgroup_populate_dir(cgrp, 1 << ss->id); + if (err) + goto err_free_id; + } /* @css is ready to be brought online now, make it visible */ list_add_tail_rcu(&css->sibling, &parent_css->children); @@ -4372,7 +4527,7 @@ err_list_del: err_free_id: cgroup_idr_remove(&ss->css_idr, css->id); err_free_percpu_ref: - percpu_ref_cancel_init(&css->refcnt); + percpu_ref_exit(&css->refcnt); err_free_css: call_rcu(&css->rcu_head, css_free_rcu_fn); return err; @@ -4385,6 +4540,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, struct cgroup_root *root; struct cgroup_subsys *ss; struct kernfs_node *kn; + struct cftype *base_files; int ssid, ret; parent = cgroup_kn_lock_live(parent_kn); @@ -4455,14 +4611,20 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; - ret = cgroup_addrm_files(cgrp, cgroup_base_files, true); + if (cgroup_on_dfl(cgrp)) + base_files = cgroup_dfl_base_files; + else + base_files = cgroup_legacy_base_files; + + ret = cgroup_addrm_files(cgrp, base_files, true); if (ret) goto out_destroy; /* let's create and online css's */ for_each_subsys(ss, ssid) { if (parent->child_subsys_mask & (1 << ssid)) { - ret = create_css(cgrp, ss); + ret = create_css(cgrp, ss, + parent->subtree_control & (1 << ssid)); if (ret) goto out_destroy; } @@ -4470,10 +4632,12 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, /* * On the default hierarchy, a child doesn't automatically inherit - * child_subsys_mask from the parent. Each is configured manually. + * subtree_control from the parent. Each is configured manually. */ - if (!cgroup_on_dfl(cgrp)) - cgrp->child_subsys_mask = parent->child_subsys_mask; + if (!cgroup_on_dfl(cgrp)) { + cgrp->subtree_control = parent->subtree_control; + cgroup_refresh_child_subsys_mask(cgrp); + } kernfs_activate(kn); @@ -4483,7 +4647,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, out_free_id: cgroup_idr_remove(&root->cgroup_idr, cgrp->id); out_cancel_ref: - percpu_ref_cancel_init(&cgrp->self.refcnt); + percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: kfree(cgrp); out_unlock: @@ -4736,8 +4900,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) */ int __init cgroup_init_early(void) { - static struct cgroup_sb_opts __initdata opts = - { .flags = CGRP_ROOT_SANE_BEHAVIOR }; + static struct cgroup_sb_opts __initdata opts; struct cgroup_subsys *ss; int i; @@ -4775,7 +4938,8 @@ int __init cgroup_init(void) unsigned long key; int ssid, err; - BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); + BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); + BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); mutex_lock(&cgroup_mutex); @@ -4807,9 +4971,22 @@ int __init cgroup_init(void) * disabled flag and cftype registration needs kmalloc, * both of which aren't available during early_init. 
*/ - if (!ss->disabled) { - cgrp_dfl_root.subsys_mask |= 1 << ss->id; - WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); + if (ss->disabled) + continue; + + cgrp_dfl_root.subsys_mask |= 1 << ss->id; + + if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes) + ss->dfl_cftypes = ss->legacy_cftypes; + + if (!ss->dfl_cftypes) + cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id; + + if (ss->dfl_cftypes == ss->legacy_cftypes) { + WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); + } else { + WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes)); + WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes)); } } @@ -5205,6 +5382,14 @@ static int __init cgroup_disable(char *str) } __setup("cgroup_disable=", cgroup_disable); +static int __init cgroup_set_legacy_files_on_dfl(char *str) +{ + printk("cgroup: using legacy files on the default hierarchy\n"); + cgroup_legacy_files_on_dfl = true; + return 0; +} +__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl); + /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest @@ -5399,6 +5584,6 @@ static struct cftype debug_files[] = { struct cgroup_subsys debug_cgrp_subsys = { .css_alloc = debug_css_alloc, .css_free = debug_css_free, - .base_cftypes = debug_files, + .legacy_cftypes = debug_files, }; #endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index a79e40f9d700..92b98cc0ee76 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -480,5 +480,5 @@ struct cgroup_subsys freezer_cgrp_subsys = { .css_free = freezer_css_free, .attach = freezer_attach, .fork = freezer_fork, - .base_cftypes = files, + .legacy_cftypes = files, }; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 116a4164720a..22874d7cf2c0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -76,8 +76,34 @@ struct cpuset { struct cgroup_subsys_state css; unsigned long flags; /* "unsigned long" so bitops work */ - cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ - nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ + + /* + * On default hierarchy: + * + * The user-configured masks can only be changed by writing to + * cpuset.cpus and cpuset.mems, and won't be limited by the + * parent masks. + * + * The effective masks is the real masks that apply to the tasks + * in the cpuset. They may be changed if the configured masks are + * changed or hotplug happens. + * + * effective_mask == configured_mask & parent's effective_mask, + * and if it ends up empty, it will inherit the parent's mask. + * + * + * On legacy hierachy: + * + * The user-configured masks are always the same with effective masks. + */ + + /* user-configured CPUs and Memory Nodes allow to tasks */ + cpumask_var_t cpus_allowed; + nodemask_t mems_allowed; + + /* effective CPUs and Memory Nodes allow to tasks */ + cpumask_var_t effective_cpus; + nodemask_t effective_mems; /* * This is old Memory Nodes tasks took on. 
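Annotation: the rule spelled out in the new struct cpuset comment above — effective = configured & parent's effective, falling back to the parent's mask when the intersection is empty — is the core of the cpuset side of this series. A minimal userspace model of that rule, using plain bitmasks in place of cpumask_t/nodemask_t (function and parameter names are invented):

    #include <stdio.h>

    /*
     *   effective = configured & parent's effective
     *   if that intersection is empty, inherit the parent's effective mask
     */
    static unsigned long effective_cpus(unsigned long configured,
                                        unsigned long parent_effective)
    {
        unsigned long eff = configured & parent_effective;

        return eff ? eff : parent_effective;
    }

    int main(void)
    {
        /* parent may use CPUs 0-3, child asks for CPUs 2 and 4 */
        printf("%#lx\n", effective_cpus(0x14, 0x0f));   /* -> 0x4 */
        /* child asks only for CPU 4, which the parent doesn't have */
        printf("%#lx\n", effective_cpus(0x10, 0x0f));   /* -> 0xf */
        return 0;
    }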
@@ -307,9 +333,9 @@ static struct file_system_type cpuset_fs_type = { */ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) { - while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) + while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) cs = parent_cs(cs); - cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); + cpumask_and(pmask, cs->effective_cpus, cpu_online_mask); } /* @@ -325,9 +351,9 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) */ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) { - while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) + while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY])) cs = parent_cs(cs); - nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]); + nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]); } /* @@ -376,13 +402,20 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) if (!trial) return NULL; - if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) { - kfree(trial); - return NULL; - } - cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); + if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) + goto free_cs; + if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL)) + goto free_cpus; + cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); + cpumask_copy(trial->effective_cpus, cs->effective_cpus); return trial; + +free_cpus: + free_cpumask_var(trial->cpus_allowed); +free_cs: + kfree(trial); + return NULL; } /** @@ -391,6 +424,7 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) */ static void free_trial_cpuset(struct cpuset *trial) { + free_cpumask_var(trial->effective_cpus); free_cpumask_var(trial->cpus_allowed); kfree(trial); } @@ -436,9 +470,9 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) par = parent_cs(cur); - /* We must be a subset of our parent cpuset */ + /* On legacy hiearchy, we must be a subset of our parent cpuset. */ ret = -EACCES; - if (!is_cpuset_subset(trial, par)) + if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par)) goto out; /* @@ -480,11 +514,11 @@ out: #ifdef CONFIG_SMP /* * Helper routine for generate_sched_domains(). - * Do cpusets a, b have overlapping cpus_allowed masks? + * Do cpusets a, b have overlapping effective cpus_allowed masks? */ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) { - return cpumask_intersects(a->cpus_allowed, b->cpus_allowed); + return cpumask_intersects(a->effective_cpus, b->effective_cpus); } static void @@ -601,7 +635,7 @@ static int generate_sched_domains(cpumask_var_t **domains, *dattr = SD_ATTR_INIT; update_domain_attr_tree(dattr, &top_cpuset); } - cpumask_copy(doms[0], top_cpuset.cpus_allowed); + cpumask_copy(doms[0], top_cpuset.effective_cpus); goto done; } @@ -705,7 +739,7 @@ restart: struct cpuset *b = csa[j]; if (apn == b->pn) { - cpumask_or(dp, dp, b->cpus_allowed); + cpumask_or(dp, dp, b->effective_cpus); if (dattr) update_domain_attr_tree(dattr + nslot, b); @@ -757,7 +791,7 @@ static void rebuild_sched_domains_locked(void) * passing doms with offlined cpu to partition_sched_domains(). * Anyways, hotplug work item will rebuild sched domains. 
*/ - if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask)) + if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) goto out; /* Generate domain masks and attrs */ @@ -781,45 +815,6 @@ void rebuild_sched_domains(void) mutex_unlock(&cpuset_mutex); } -/* - * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus - * @cs: the cpuset in interest - * - * A cpuset's effective cpumask is the cpumask of the nearest ancestor - * with non-empty cpus. We use effective cpumask whenever: - * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask - * if the cpuset they reside in has no cpus) - * - we want to retrieve task_cs(tsk)'s cpus_allowed. - * - * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an - * exception. See comments there. - */ -static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs) -{ - while (cpumask_empty(cs->cpus_allowed)) - cs = parent_cs(cs); - return cs; -} - -/* - * effective_nodemask_cpuset - return nearest ancestor with non-empty mems - * @cs: the cpuset in interest - * - * A cpuset's effective nodemask is the nodemask of the nearest ancestor - * with non-empty memss. We use effective nodemask whenever: - * - we update tasks' mems_allowed. (they take on the ancestor's nodemask - * if the cpuset they reside in has no mems) - * - we want to retrieve task_cs(tsk)'s mems_allowed. - * - * Called with cpuset_mutex held. - */ -static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) -{ - while (nodes_empty(cs->mems_allowed)) - cs = parent_cs(cs); - return cs; -} - /** * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed @@ -830,53 +825,80 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) */ static void update_tasks_cpumask(struct cpuset *cs) { - struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); struct css_task_iter it; struct task_struct *task; css_task_iter_start(&cs->css, &it); while ((task = css_task_iter_next(&it))) - set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed); + set_cpus_allowed_ptr(task, cs->effective_cpus); css_task_iter_end(&it); } /* - * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. - * @root_cs: the root cpuset of the hierarchy - * @update_root: update root cpuset or not? + * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree + * @cs: the cpuset to consider + * @new_cpus: temp variable for calculating new effective_cpus + * + * When congifured cpumask is changed, the effective cpumasks of this cpuset + * and all its descendants need to be updated. * - * This will update cpumasks of tasks in @root_cs and all other empty cpusets - * which take on cpumask of @root_cs. + * On legacy hierachy, effective_cpus will be the same with cpu_allowed. 
* * Called with cpuset_mutex held */ -static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root) +static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; + bool need_rebuild_sched_domains = false; rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { - if (cp == root_cs) { - if (!update_root) - continue; - } else { - /* skip the whole subtree if @cp have some CPU */ - if (!cpumask_empty(cp->cpus_allowed)) { - pos_css = css_rightmost_descendant(pos_css); - continue; - } + cpuset_for_each_descendant_pre(cp, pos_css, cs) { + struct cpuset *parent = parent_cs(cp); + + cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus); + + /* + * If it becomes empty, inherit the effective mask of the + * parent, which is guaranteed to have some CPUs. + */ + if (cpumask_empty(new_cpus)) + cpumask_copy(new_cpus, parent->effective_cpus); + + /* Skip the whole subtree if the cpumask remains the same. */ + if (cpumask_equal(new_cpus, cp->effective_cpus)) { + pos_css = css_rightmost_descendant(pos_css); + continue; } + if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); + mutex_lock(&callback_mutex); + cpumask_copy(cp->effective_cpus, new_cpus); + mutex_unlock(&callback_mutex); + + WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && + !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); + update_tasks_cpumask(cp); + /* + * If the effective cpumask of any non-empty cpuset is changed, + * we need to rebuild sched domains. + */ + if (!cpumask_empty(cp->cpus_allowed) && + is_sched_load_balance(cp)) + need_rebuild_sched_domains = true; + rcu_read_lock(); css_put(&cp->css); } rcu_read_unlock(); + + if (need_rebuild_sched_domains) + rebuild_sched_domains_locked(); } /** @@ -889,7 +911,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { int retval; - int is_load_balanced; /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ if (cs == &top_cpuset) @@ -908,7 +929,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) return retval; - if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) + if (!cpumask_subset(trialcs->cpus_allowed, + top_cpuset.cpus_allowed)) return -EINVAL; } @@ -920,16 +942,12 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) return retval; - is_load_balanced = is_sched_load_balance(trialcs); - mutex_lock(&callback_mutex); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); mutex_unlock(&callback_mutex); - update_tasks_cpumask_hier(cs, true); - - if (is_load_balanced) - rebuild_sched_domains_locked(); + /* use trialcs->cpus_allowed as a temp variable */ + update_cpumasks_hier(cs, trialcs->cpus_allowed); return 0; } @@ -951,15 +969,13 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to) { struct task_struct *tsk = current; - struct cpuset *mems_cs; tsk->mems_allowed = *to; do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); rcu_read_lock(); - mems_cs = effective_nodemask_cpuset(task_cs(tsk)); - guarantee_online_mems(mems_cs, &tsk->mems_allowed); + guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed); rcu_read_unlock(); } @@ -1028,13 +1044,12 @@ static void *cpuset_being_rebound; static void update_tasks_nodemask(struct cpuset *cs) { static nodemask_t newmems; /* protected by cpuset_mutex */ - struct cpuset *mems_cs = effective_nodemask_cpuset(cs); struct css_task_iter it; struct 
task_struct *task; cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ - guarantee_online_mems(mems_cs, &newmems); + guarantee_online_mems(cs, &newmems); /* * The mpol_rebind_mm() call takes mmap_sem, which we couldn't @@ -1077,36 +1092,52 @@ static void update_tasks_nodemask(struct cpuset *cs) } /* - * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. - * @cs: the root cpuset of the hierarchy - * @update_root: update the root cpuset or not? + * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree + * @cs: the cpuset to consider + * @new_mems: a temp variable for calculating new effective_mems * - * This will update nodemasks of tasks in @root_cs and all other empty cpusets - * which take on nodemask of @root_cs. + * When configured nodemask is changed, the effective nodemasks of this cpuset + * and all its descendants need to be updated. + * + * On legacy hiearchy, effective_mems will be the same with mems_allowed. * * Called with cpuset_mutex held */ -static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root) +static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { - if (cp == root_cs) { - if (!update_root) - continue; - } else { - /* skip the whole subtree if @cp have some CPU */ - if (!nodes_empty(cp->mems_allowed)) { - pos_css = css_rightmost_descendant(pos_css); - continue; - } + cpuset_for_each_descendant_pre(cp, pos_css, cs) { + struct cpuset *parent = parent_cs(cp); + + nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems); + + /* + * If it becomes empty, inherit the effective mask of the + * parent, which is guaranteed to have some MEMs. + */ + if (nodes_empty(*new_mems)) + *new_mems = parent->effective_mems; + + /* Skip the whole subtree if the nodemask remains the same. */ + if (nodes_equal(*new_mems, cp->effective_mems)) { + pos_css = css_rightmost_descendant(pos_css); + continue; } + if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); + mutex_lock(&callback_mutex); + cp->effective_mems = *new_mems; + mutex_unlock(&callback_mutex); + + WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && + !nodes_equal(cp->mems_allowed, cp->effective_mems)); + update_tasks_nodemask(cp); rcu_read_lock(); @@ -1156,8 +1187,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, goto done; if (!nodes_subset(trialcs->mems_allowed, - node_states[N_MEMORY])) { - retval = -EINVAL; + top_cpuset.mems_allowed)) { + retval = -EINVAL; goto done; } } @@ -1174,7 +1205,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, cs->mems_allowed = trialcs->mems_allowed; mutex_unlock(&callback_mutex); - update_tasks_nodemask_hier(cs, true); + /* use trialcs->mems_allowed as a temp variable */ + update_nodemasks_hier(cs, &cs->mems_allowed); done: return retval; } @@ -1389,12 +1421,9 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, mutex_lock(&cpuset_mutex); - /* - * We allow to move tasks into an empty cpuset if sane_behavior - * flag is set. 
- */ + /* allow moving tasks into an empty cpuset if on default hierarchy */ ret = -ENOSPC; - if (!cgroup_sane_behavior(css->cgroup) && + if (!cgroup_on_dfl(css->cgroup) && (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; @@ -1452,8 +1481,6 @@ static void cpuset_attach(struct cgroup_subsys_state *css, struct task_struct *leader = cgroup_taskset_first(tset); struct cpuset *cs = css_cs(css); struct cpuset *oldcs = cpuset_attach_old_cs; - struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); - struct cpuset *mems_cs = effective_nodemask_cpuset(cs); mutex_lock(&cpuset_mutex); @@ -1461,9 +1488,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css, if (cs == &top_cpuset) cpumask_copy(cpus_attach, cpu_possible_mask); else - guarantee_online_cpus(cpus_cs, cpus_attach); + guarantee_online_cpus(cs, cpus_attach); - guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); + guarantee_online_mems(cs, &cpuset_attach_nodemask_to); cgroup_taskset_for_each(task, tset) { /* @@ -1480,11 +1507,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css, * Change mm, possibly for multiple threads in a threadgroup. This is * expensive and may sleep. */ - cpuset_attach_nodemask_to = cs->mems_allowed; + cpuset_attach_nodemask_to = cs->effective_mems; mm = get_task_mm(leader); if (mm) { - struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs); - mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); /* @@ -1495,7 +1520,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, * mm from. */ if (is_memory_migrate(cs)) { - cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed, + cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, &cpuset_attach_nodemask_to); } mmput(mm); @@ -1516,6 +1541,8 @@ typedef enum { FILE_MEMORY_MIGRATE, FILE_CPULIST, FILE_MEMLIST, + FILE_EFFECTIVE_CPULIST, + FILE_EFFECTIVE_MEMLIST, FILE_CPU_EXCLUSIVE, FILE_MEM_EXCLUSIVE, FILE_MEM_HARDWALL, @@ -1694,6 +1721,12 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) case FILE_MEMLIST: s += nodelist_scnprintf(s, count, cs->mems_allowed); break; + case FILE_EFFECTIVE_CPULIST: + s += cpulist_scnprintf(s, count, cs->effective_cpus); + break; + case FILE_EFFECTIVE_MEMLIST: + s += nodelist_scnprintf(s, count, cs->effective_mems); + break; default: ret = -EINVAL; goto out_unlock; @@ -1779,6 +1812,18 @@ static struct cftype files[] = { }, { + .name = "effective_cpus", + .seq_show = cpuset_common_seq_show, + .private = FILE_EFFECTIVE_CPULIST, + }, + + { + .name = "effective_mems", + .seq_show = cpuset_common_seq_show, + .private = FILE_EFFECTIVE_MEMLIST, + }, + + { .name = "cpu_exclusive", .read_u64 = cpuset_read_u64, .write_u64 = cpuset_write_u64, @@ -1869,18 +1914,26 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) cs = kzalloc(sizeof(*cs), GFP_KERNEL); if (!cs) return ERR_PTR(-ENOMEM); - if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { - kfree(cs); - return ERR_PTR(-ENOMEM); - } + if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) + goto free_cs; + if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL)) + goto free_cpus; set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); + cpumask_clear(cs->effective_cpus); + nodes_clear(cs->effective_mems); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; return &cs->css; + +free_cpus: + free_cpumask_var(cs->cpus_allowed); +free_cs: + kfree(cs); + return ERR_PTR(-ENOMEM); } static int cpuset_css_online(struct cgroup_subsys_state *css) @@ -1903,6 +1956,13 @@ static int 
cpuset_css_online(struct cgroup_subsys_state *css) cpuset_inc(); + mutex_lock(&callback_mutex); + if (cgroup_on_dfl(cs->css.cgroup)) { + cpumask_copy(cs->effective_cpus, parent->effective_cpus); + cs->effective_mems = parent->effective_mems; + } + mutex_unlock(&callback_mutex); + if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) goto out_unlock; @@ -1962,20 +2022,40 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); + free_cpumask_var(cs->effective_cpus); free_cpumask_var(cs->cpus_allowed); kfree(cs); } +static void cpuset_bind(struct cgroup_subsys_state *root_css) +{ + mutex_lock(&cpuset_mutex); + mutex_lock(&callback_mutex); + + if (cgroup_on_dfl(root_css->cgroup)) { + cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); + top_cpuset.mems_allowed = node_possible_map; + } else { + cpumask_copy(top_cpuset.cpus_allowed, + top_cpuset.effective_cpus); + top_cpuset.mems_allowed = top_cpuset.effective_mems; + } + + mutex_unlock(&callback_mutex); + mutex_unlock(&cpuset_mutex); +} + struct cgroup_subsys cpuset_cgrp_subsys = { - .css_alloc = cpuset_css_alloc, - .css_online = cpuset_css_online, - .css_offline = cpuset_css_offline, - .css_free = cpuset_css_free, - .can_attach = cpuset_can_attach, - .cancel_attach = cpuset_cancel_attach, - .attach = cpuset_attach, - .base_cftypes = files, - .early_init = 1, + .css_alloc = cpuset_css_alloc, + .css_online = cpuset_css_online, + .css_offline = cpuset_css_offline, + .css_free = cpuset_css_free, + .can_attach = cpuset_can_attach, + .cancel_attach = cpuset_cancel_attach, + .attach = cpuset_attach, + .bind = cpuset_bind, + .legacy_cftypes = files, + .early_init = 1, }; /** @@ -1990,9 +2070,13 @@ int __init cpuset_init(void) if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)) BUG(); + if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)) + BUG(); cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); + cpumask_setall(top_cpuset.effective_cpus); + nodes_setall(top_cpuset.effective_mems); fmeter_init(&top_cpuset.fmeter); set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); @@ -2035,6 +2119,66 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) } } +static void +hotplug_update_tasks_legacy(struct cpuset *cs, + struct cpumask *new_cpus, nodemask_t *new_mems, + bool cpus_updated, bool mems_updated) +{ + bool is_empty; + + mutex_lock(&callback_mutex); + cpumask_copy(cs->cpus_allowed, new_cpus); + cpumask_copy(cs->effective_cpus, new_cpus); + cs->mems_allowed = *new_mems; + cs->effective_mems = *new_mems; + mutex_unlock(&callback_mutex); + + /* + * Don't call update_tasks_cpumask() if the cpuset becomes empty, + * as the tasks will be migratecd to an ancestor. + */ + if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) + update_tasks_cpumask(cs); + if (mems_updated && !nodes_empty(cs->mems_allowed)) + update_tasks_nodemask(cs); + + is_empty = cpumask_empty(cs->cpus_allowed) || + nodes_empty(cs->mems_allowed); + + mutex_unlock(&cpuset_mutex); + + /* + * Move tasks to the nearest ancestor with execution resources, + * This is full cgroup operation which will also call back into + * cpuset. Should be done outside any lock. 
+ */ + if (is_empty) + remove_tasks_in_empty_cpuset(cs); + + mutex_lock(&cpuset_mutex); +} + +static void +hotplug_update_tasks(struct cpuset *cs, + struct cpumask *new_cpus, nodemask_t *new_mems, + bool cpus_updated, bool mems_updated) +{ + if (cpumask_empty(new_cpus)) + cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus); + if (nodes_empty(*new_mems)) + *new_mems = parent_cs(cs)->effective_mems; + + mutex_lock(&callback_mutex); + cpumask_copy(cs->effective_cpus, new_cpus); + cs->effective_mems = *new_mems; + mutex_unlock(&callback_mutex); + + if (cpus_updated) + update_tasks_cpumask(cs); + if (mems_updated) + update_tasks_nodemask(cs); +} + /** * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug * @cs: cpuset in interest @@ -2045,11 +2189,10 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) */ static void cpuset_hotplug_update_tasks(struct cpuset *cs) { - static cpumask_t off_cpus; - static nodemask_t off_mems; - bool is_empty; - bool sane = cgroup_sane_behavior(cs->css.cgroup); - + static cpumask_t new_cpus; + static nodemask_t new_mems; + bool cpus_updated; + bool mems_updated; retry: wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); @@ -2064,51 +2207,20 @@ retry: goto retry; } - cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); - nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); - - mutex_lock(&callback_mutex); - cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); - mutex_unlock(&callback_mutex); - - /* - * If sane_behavior flag is set, we need to update tasks' cpumask - * for empty cpuset to take on ancestor's cpumask. Otherwise, don't - * call update_tasks_cpumask() if the cpuset becomes empty, as - * the tasks in it will be migrated to an ancestor. - */ - if ((sane && cpumask_empty(cs->cpus_allowed)) || - (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed))) - update_tasks_cpumask(cs); + cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus); + nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems); - mutex_lock(&callback_mutex); - nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); - mutex_unlock(&callback_mutex); - - /* - * If sane_behavior flag is set, we need to update tasks' nodemask - * for empty cpuset to take on ancestor's nodemask. Otherwise, don't - * call update_tasks_nodemask() if the cpuset becomes empty, as - * the tasks in it will be migratd to an ancestor. - */ - if ((sane && nodes_empty(cs->mems_allowed)) || - (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed))) - update_tasks_nodemask(cs); + cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); + mems_updated = !nodes_equal(new_mems, cs->effective_mems); - is_empty = cpumask_empty(cs->cpus_allowed) || - nodes_empty(cs->mems_allowed); + if (cgroup_on_dfl(cs->css.cgroup)) + hotplug_update_tasks(cs, &new_cpus, &new_mems, + cpus_updated, mems_updated); + else + hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems, + cpus_updated, mems_updated); mutex_unlock(&cpuset_mutex); - - /* - * If sane_behavior flag is set, we'll keep tasks in empty cpusets. - * - * Otherwise move tasks to the nearest ancestor with execution - * resources. This is full cgroup operation which will - * also call back into cpuset. Should be done outside any lock. 
- */ - if (!sane && is_empty) - remove_tasks_in_empty_cpuset(cs); } /** @@ -2132,6 +2244,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) static cpumask_t new_cpus; static nodemask_t new_mems; bool cpus_updated, mems_updated; + bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup); mutex_lock(&cpuset_mutex); @@ -2139,13 +2252,15 @@ static void cpuset_hotplug_workfn(struct work_struct *work) cpumask_copy(&new_cpus, cpu_active_mask); new_mems = node_states[N_MEMORY]; - cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); - mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); + cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus); + mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); /* synchronize cpus_allowed to cpu_active_mask */ if (cpus_updated) { mutex_lock(&callback_mutex); - cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); + if (!on_dfl) + cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); + cpumask_copy(top_cpuset.effective_cpus, &new_cpus); mutex_unlock(&callback_mutex); /* we don't mess with cpumasks of tasks in top_cpuset */ } @@ -2153,7 +2268,9 @@ static void cpuset_hotplug_workfn(struct work_struct *work) /* synchronize mems_allowed to N_MEMORY */ if (mems_updated) { mutex_lock(&callback_mutex); - top_cpuset.mems_allowed = new_mems; + if (!on_dfl) + top_cpuset.mems_allowed = new_mems; + top_cpuset.effective_mems = new_mems; mutex_unlock(&callback_mutex); update_tasks_nodemask(&top_cpuset); } @@ -2228,6 +2345,9 @@ void __init cpuset_init_smp(void) top_cpuset.mems_allowed = node_states[N_MEMORY]; top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; + cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); + top_cpuset.effective_mems = node_states[N_MEMORY]; + register_hotmemory_notifier(&cpuset_track_online_nodes_nb); } @@ -2244,23 +2364,17 @@ void __init cpuset_init_smp(void) void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) { - struct cpuset *cpus_cs; - mutex_lock(&callback_mutex); rcu_read_lock(); - cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); - guarantee_online_cpus(cpus_cs, pmask); + guarantee_online_cpus(task_cs(tsk), pmask); rcu_read_unlock(); mutex_unlock(&callback_mutex); } void cpuset_cpus_allowed_fallback(struct task_struct *tsk) { - struct cpuset *cpus_cs; - rcu_read_lock(); - cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); - do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed); + do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus); rcu_read_unlock(); /* @@ -2299,13 +2413,11 @@ void cpuset_init_current_mems_allowed(void) nodemask_t cpuset_mems_allowed(struct task_struct *tsk) { - struct cpuset *mems_cs; nodemask_t mask; mutex_lock(&callback_mutex); rcu_read_lock(); - mems_cs = effective_nodemask_cpuset(task_cs(tsk)); - guarantee_online_mems(mems_cs, &mask); + guarantee_online_mems(task_cs(tsk), &mask); rcu_read_unlock(); mutex_unlock(&callback_mutex); diff --git a/kernel/futex.c b/kernel/futex.c index b632b5f3f094..d3a9d946d0b7 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -792,94 +792,91 @@ void exit_pi_state_list(struct task_struct *curr) * [10] There is no transient state which leaves owner and user space * TID out of sync. */ -static int -lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, - union futex_key *key, struct futex_pi_state **ps) + +/* + * Validate that the existing waiter has a pi_state and sanity check + * the pi_state against the user space value. If correct, attach to + * it. 
+ */ +static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, + struct futex_pi_state **ps) { - struct futex_pi_state *pi_state = NULL; - struct futex_q *this, *next; - struct task_struct *p; pid_t pid = uval & FUTEX_TID_MASK; - plist_for_each_entry_safe(this, next, &hb->chain, list) { - if (match_futex(&this->key, key)) { - /* - * Sanity check the waiter before increasing - * the refcount and attaching to it. - */ - pi_state = this->pi_state; - /* - * Userspace might have messed up non-PI and - * PI futexes [3] - */ - if (unlikely(!pi_state)) - return -EINVAL; + /* + * Userspace might have messed up non-PI and PI futexes [3] + */ + if (unlikely(!pi_state)) + return -EINVAL; - WARN_ON(!atomic_read(&pi_state->refcount)); + WARN_ON(!atomic_read(&pi_state->refcount)); + /* + * Handle the owner died case: + */ + if (uval & FUTEX_OWNER_DIED) { + /* + * exit_pi_state_list sets owner to NULL and wakes the + * topmost waiter. The task which acquires the + * pi_state->rt_mutex will fixup owner. + */ + if (!pi_state->owner) { /* - * Handle the owner died case: + * No pi state owner, but the user space TID + * is not 0. Inconsistent state. [5] */ - if (uval & FUTEX_OWNER_DIED) { - /* - * exit_pi_state_list sets owner to NULL and - * wakes the topmost waiter. The task which - * acquires the pi_state->rt_mutex will fixup - * owner. - */ - if (!pi_state->owner) { - /* - * No pi state owner, but the user - * space TID is not 0. Inconsistent - * state. [5] - */ - if (pid) - return -EINVAL; - /* - * Take a ref on the state and - * return. [4] - */ - goto out_state; - } - - /* - * If TID is 0, then either the dying owner - * has not yet executed exit_pi_state_list() - * or some waiter acquired the rtmutex in the - * pi state, but did not yet fixup the TID in - * user space. - * - * Take a ref on the state and return. [6] - */ - if (!pid) - goto out_state; - } else { - /* - * If the owner died bit is not set, - * then the pi_state must have an - * owner. [7] - */ - if (!pi_state->owner) - return -EINVAL; - } - + if (pid) + return -EINVAL; /* - * Bail out if user space manipulated the - * futex value. If pi state exists then the - * owner TID must be the same as the user - * space TID. [9/10] + * Take a ref on the state and return success. [4] */ - if (pid != task_pid_vnr(pi_state->owner)) - return -EINVAL; - - out_state: - atomic_inc(&pi_state->refcount); - *ps = pi_state; - return 0; + goto out_state; } + + /* + * If TID is 0, then either the dying owner has not + * yet executed exit_pi_state_list() or some waiter + * acquired the rtmutex in the pi state, but did not + * yet fixup the TID in user space. + * + * Take a ref on the state and return success. [6] + */ + if (!pid) + goto out_state; + } else { + /* + * If the owner died bit is not set, then the pi_state + * must have an owner. [7] + */ + if (!pi_state->owner) + return -EINVAL; } /* + * Bail out if user space manipulated the futex value. If pi + * state exists then the owner TID must be the same as the + * user space TID. [9/10] + */ + if (pid != task_pid_vnr(pi_state->owner)) + return -EINVAL; +out_state: + atomic_inc(&pi_state->refcount); + *ps = pi_state; + return 0; +} + +/* + * Lookup the task for the TID provided from user space and attach to + * it after doing proper sanity checks. 
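Annotation: attach_to_pi_state() above (and attach_to_pi_owner() below) interpret the same user-space futex word: the owner TID in the low bits plus the FUTEX_WAITERS and FUTEX_OWNER_DIED flag bits. As a reference, a small Linux-only decoder using the uapi constants from <linux/futex.h>; the helper name is made up and this is not part of the patch.

    #include <stdio.h>
    #include <linux/futex.h>    /* FUTEX_TID_MASK, FUTEX_WAITERS, FUTEX_OWNER_DIED */

    /* Print how the PI futex helpers interpret a user-space futex value. */
    static void decode_futex_word(unsigned int uval)
    {
        printf("owner tid: %u, waiters: %d, owner died: %d\n",
               uval & FUTEX_TID_MASK,
               !!(uval & FUTEX_WAITERS),
               !!(uval & FUTEX_OWNER_DIED));
    }

    int main(void)
    {
        decode_futex_word(0);                        /* unlocked */
        decode_futex_word(1234 | FUTEX_WAITERS);     /* held by TID 1234, contended */
        return 0;
    }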
+ */ +static int attach_to_pi_owner(u32 uval, union futex_key *key, + struct futex_pi_state **ps) +{ + pid_t pid = uval & FUTEX_TID_MASK; + struct futex_pi_state *pi_state; + struct task_struct *p; + + /* * We are the first waiter - try to look up the real owner and attach * the new pi_state to it, but bail out when TID = 0 [1] */ @@ -920,7 +917,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, pi_state = alloc_pi_state(); /* - * Initialize the pi_mutex in locked state and make 'p' + * Initialize the pi_mutex in locked state and make @p * the owner of it: */ rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); @@ -940,6 +937,36 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, return 0; } +static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, + union futex_key *key, struct futex_pi_state **ps) +{ + struct futex_q *match = futex_top_waiter(hb, key); + + /* + * If there is a waiter on that futex, validate it and + * attach to the pi_state when the validation succeeds. + */ + if (match) + return attach_to_pi_state(uval, match->pi_state, ps); + + /* + * We are the first waiter - try to look up the owner based on + * @uval and attach to it. + */ + return attach_to_pi_owner(uval, key, ps); +} + +static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) +{ + u32 uninitialized_var(curval); + + if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) + return -EFAULT; + + /*If user space value changed, let the caller retry */ + return curval != uval ? -EAGAIN : 0; +} + /** * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex * @uaddr: the pi futex user address @@ -963,113 +990,69 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, struct futex_pi_state **ps, struct task_struct *task, int set_waiters) { - int lock_taken, ret, force_take = 0; - u32 uval, newval, curval, vpid = task_pid_vnr(task); - -retry: - ret = lock_taken = 0; + u32 uval, newval, vpid = task_pid_vnr(task); + struct futex_q *match; + int ret; /* - * To avoid races, we attempt to take the lock here again - * (by doing a 0 -> TID atomic cmpxchg), while holding all - * the locks. It will most likely not succeed. + * Read the user space value first so we can validate a few + * things before proceeding further. */ - newval = vpid; - if (set_waiters) - newval |= FUTEX_WAITERS; - - if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval))) + if (get_futex_value_locked(&uval, uaddr)) return -EFAULT; /* * Detect deadlocks. */ - if ((unlikely((curval & FUTEX_TID_MASK) == vpid))) + if ((unlikely((uval & FUTEX_TID_MASK) == vpid))) return -EDEADLK; /* - * Surprise - we got the lock, but we do not trust user space at all. - */ - if (unlikely(!curval)) { - /* - * We verify whether there is kernel state for this - * futex. If not, we can safely assume, that the 0 -> - * TID transition is correct. If state exists, we do - * not bother to fixup the user space state as it was - * corrupted already. - */ - return futex_top_waiter(hb, key) ? -EINVAL : 1; - } - - uval = curval; - - /* - * Set the FUTEX_WAITERS flag, so the owner will know it has someone - * to wake at the next unlock. + * Lookup existing state first. If it exists, try to attach to + * its pi_state. */ - newval = curval | FUTEX_WAITERS; + match = futex_top_waiter(hb, key); + if (match) + return attach_to_pi_state(uval, match->pi_state, ps); /* - * Should we force take the futex? See below. + * No waiter and user TID is 0. 
We are here because the + * waiters or the owner died bit is set or called from + * requeue_cmp_pi or for whatever reason something took the + * syscall. */ - if (unlikely(force_take)) { + if (!(uval & FUTEX_TID_MASK)) { /* - * Keep the OWNER_DIED and the WAITERS bit and set the - * new TID value. + * We take over the futex. No other waiters and the user space + * TID is 0. We preserve the owner died bit. */ - newval = (curval & ~FUTEX_TID_MASK) | vpid; - force_take = 0; - lock_taken = 1; - } + newval = uval & FUTEX_OWNER_DIED; + newval |= vpid; - if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) - return -EFAULT; - if (unlikely(curval != uval)) - goto retry; + /* The futex requeue_pi code can enforce the waiters bit */ + if (set_waiters) + newval |= FUTEX_WAITERS; + + ret = lock_pi_update_atomic(uaddr, uval, newval); + /* If the take over worked, return 1 */ + return ret < 0 ? ret : 1; + } /* - * We took the lock due to forced take over. + * First waiter. Set the waiters bit before attaching ourself to + * the owner. If owner tries to unlock, it will be forced into + * the kernel and blocked on hb->lock. */ - if (unlikely(lock_taken)) - return 1; - + newval = uval | FUTEX_WAITERS; + ret = lock_pi_update_atomic(uaddr, uval, newval); + if (ret) + return ret; /* - * We dont have the lock. Look up the PI state (or create it if - * we are the first waiter): + * If the update of the user space value succeeded, we try to + * attach to the owner. If that fails, no harm done, we only + * set the FUTEX_WAITERS bit in the user space variable. */ - ret = lookup_pi_state(uval, hb, key, ps); - - if (unlikely(ret)) { - switch (ret) { - case -ESRCH: - /* - * We failed to find an owner for this - * futex. So we have no pi_state to block - * on. This can happen in two cases: - * - * 1) The owner died - * 2) A stale FUTEX_WAITERS bit - * - * Re-read the futex value. - */ - if (get_futex_value_locked(&curval, uaddr)) - return -EFAULT; - - /* - * If the owner died or we have a stale - * WAITERS bit the owner TID in the user space - * futex is 0. - */ - if (!(curval & FUTEX_TID_MASK)) { - force_take = 1; - goto retry; - } - default: - break; - } - } - - return ret; + return attach_to_pi_owner(uval, key, ps); } /** @@ -1186,22 +1169,6 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) return 0; } -static int unlock_futex_pi(u32 __user *uaddr, u32 uval) -{ - u32 uninitialized_var(oldval); - - /* - * There is no waiter, so we unlock the futex. The owner died - * bit has not to be preserved here. We are the owner: - */ - if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0)) - return -EFAULT; - if (oldval != uval) - return -EAGAIN; - - return 0; -} - /* * Express the locking dependencies for lockdep: */ @@ -1659,7 +1626,12 @@ retry_private: goto retry; goto out; case -EAGAIN: - /* The owner was exiting, try again. */ + /* + * Two reasons for this: + * - Owner is exiting and we just wait for the + * exit to complete. + * - The user space value changed. + */ double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); put_futex_key(&key2); @@ -1718,7 +1690,7 @@ retry_private: this->pi_state = pi_state; ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, this->rt_waiter, - this->task, 1); + this->task); if (ret == 1) { /* We got the lock. */ requeue_pi_wake_futex(this, &key2, hb2); @@ -2316,8 +2288,10 @@ retry_private: goto uaddr_faulted; case -EAGAIN: /* - * Task is exiting and we just wait for the - * exit to complete. 
+ * Two reasons for this: + * - Task is exiting and we just wait for the + * exit to complete. + * - The user space value changed. */ queue_unlock(hb); put_futex_key(&q.key); @@ -2337,9 +2311,9 @@ retry_private: /* * Block on the PI mutex: */ - if (!trylock) - ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1); - else { + if (!trylock) { + ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to); + } else { ret = rt_mutex_trylock(&q.pi_state->pi_mutex); /* Fixup the trylock return value: */ ret = ret ? 0 : -EWOULDBLOCK; @@ -2401,10 +2375,10 @@ uaddr_faulted: */ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) { - struct futex_hash_bucket *hb; - struct futex_q *this, *next; + u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current); union futex_key key = FUTEX_KEY_INIT; - u32 uval, vpid = task_pid_vnr(current); + struct futex_hash_bucket *hb; + struct futex_q *match; int ret; retry: @@ -2417,57 +2391,47 @@ retry: return -EPERM; ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE); - if (unlikely(ret != 0)) - goto out; + if (ret) + return ret; hb = hash_futex(&key); spin_lock(&hb->lock); /* - * To avoid races, try to do the TID -> 0 atomic transition - * again. If it succeeds then we can return without waking - * anyone else up. We only try this if neither the waiters nor - * the owner died bit are set. - */ - if (!(uval & ~FUTEX_TID_MASK) && - cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) - goto pi_faulted; - /* - * Rare case: we managed to release the lock atomically, - * no need to wake anyone else up: - */ - if (unlikely(uval == vpid)) - goto out_unlock; - - /* - * Ok, other tasks may need to be woken up - check waiters - * and do the wakeup if necessary: + * Check waiters first. We do not trust user space values at + * all and we at least want to know if user space fiddled + * with the futex value instead of blindly unlocking. */ - plist_for_each_entry_safe(this, next, &hb->chain, list) { - if (!match_futex (&this->key, &key)) - continue; - ret = wake_futex_pi(uaddr, uval, this); + match = futex_top_waiter(hb, &key); + if (match) { + ret = wake_futex_pi(uaddr, uval, match); /* - * The atomic access to the futex value - * generated a pagefault, so retry the - * user-access and the wakeup: + * The atomic access to the futex value generated a + * pagefault, so retry the user-access and the wakeup: */ if (ret == -EFAULT) goto pi_faulted; goto out_unlock; } + /* - * No waiters - kernel unlocks the futex: + * We have no kernel internal state, i.e. no waiters in the + * kernel. Waiters which are about to queue themselves are stuck + * on hb->lock. So we can safely ignore them. We do neither + * preserve the WAITERS bit not the OWNER_DIED one. We are the + * owner. */ - ret = unlock_futex_pi(uaddr, uval); - if (ret == -EFAULT) + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) goto pi_faulted; + /* + * If uval has changed, let user space handle it. + */ + ret = (curval == uval) ? 
0 : -EAGAIN; + out_unlock: spin_unlock(&hb->lock); put_futex_key(&key); - -out: return ret; pi_faulted: @@ -2669,7 +2633,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, */ WARN_ON(!q.pi_state); pi_mutex = &q.pi_state->pi_mutex; - ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); + ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter); debug_rt_mutex_free_waiter(&rt_waiter); spin_lock(q.lock_ptr); diff --git a/kernel/kexec.c b/kernel/kexec.c index 369f41a94124..4b8f0c925884 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -33,6 +33,7 @@ #include <linux/swap.h> #include <linux/syscore_ops.h> #include <linux/compiler.h> +#include <linux/hugetlb.h> #include <asm/page.h> #include <asm/uaccess.h> @@ -1619,6 +1620,9 @@ static int __init crash_save_vmcoreinfo_init(void) #endif VMCOREINFO_NUMBER(PG_head_mask); VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); +#ifdef CONFIG_HUGETLBFS + VMCOREINFO_SYMBOL(free_huge_page); +#endif arch_crash_save_vmcoreinfo(); update_vmcoreinfo_note(); diff --git a/kernel/kthread.c b/kernel/kthread.c index c2390f41307b..ef483220e855 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -591,7 +591,7 @@ static void insert_kthread_work(struct kthread_worker *worker, list_add_tail(&work->node, pos); work->worker = worker; - if (likely(worker->task)) + if (!worker->current_work && likely(worker->task)) wake_up_process(worker->task); } diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index d24e4339b46d..88d0d4420ad2 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -384,7 +384,9 @@ static void print_lockdep_off(const char *bug_msg) { printk(KERN_DEBUG "%s\n", bug_msg); printk(KERN_DEBUG "turning off the locking correctness validator.\n"); +#ifdef CONFIG_LOCK_STAT printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n"); +#endif } static int save_trace(struct stack_trace *trace) diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c index be9ee1559fca..9887a905a762 100644 --- a/kernel/locking/mcs_spinlock.c +++ b/kernel/locking/mcs_spinlock.c @@ -1,6 +1,4 @@ - #include <linux/percpu.h> -#include <linux/mutex.h> #include <linux/sched.h> #include "mcs_spinlock.h" @@ -79,7 +77,7 @@ osq_wait_next(struct optimistic_spin_queue *lock, break; } - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); } return next; @@ -120,7 +118,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) if (need_resched()) goto unqueue; - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); } return true; @@ -146,7 +144,7 @@ unqueue: if (smp_load_acquire(&node->locked)) return true; - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); /* * Or we race against a concurrent unqueue()'s step-B, in which diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 74356dc0ce29..23e89c5930e9 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -27,7 +27,7 @@ struct mcs_spinlock { #define arch_mcs_spin_lock_contended(l) \ do { \ while (!(smp_load_acquire(l))) \ - arch_mutex_cpu_relax(); \ + cpu_relax_lowlatency(); \ } while (0) #endif @@ -104,7 +104,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) return; /* Wait until the next pointer is set */ while (!(next = ACCESS_ONCE(node->next))) - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); } /* Pass lock to next waiter. 
*/ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index acca2c1a3c5e..ae712b25e492 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -46,12 +46,6 @@ # include <asm/mutex.h> #endif -/* - * A negative mutex count indicates that waiters are sleeping waiting for the - * mutex. - */ -#define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) >= 0) - void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { @@ -152,7 +146,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) if (need_resched()) break; - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); } rcu_read_unlock(); @@ -388,12 +382,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, /* * Optimistic spinning. * - * We try to spin for acquisition when we find that there are no - * pending waiters and the lock owner is currently running on a - * (different) CPU. - * - * The rationale is that if the lock owner is running, it is likely to - * release the lock soon. + * We try to spin for acquisition when we find that the lock owner + * is currently running on a (different) CPU and while we don't + * need to reschedule. The rationale is that if the lock owner is + * running, it is likely to release the lock soon. * * Since this needs the lock owner, and this mutex implementation * doesn't track the owner atomically in the lock field, we need to @@ -440,7 +432,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if (owner && !mutex_spin_on_owner(lock, owner)) break; - if ((atomic_read(&lock->count) == 1) && + /* Try to acquire the mutex if it is unlocked. */ + if (!mutex_is_locked(lock) && (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { lock_acquired(&lock->dep_map, ip); if (use_ww_ctx) { @@ -471,7 +464,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * memory barriers as we'll eventually observe the right * values at the cost of a few extra spins. */ - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); } osq_unlock(&lock->osq); slowpath: @@ -485,8 +478,11 @@ slowpath: #endif spin_lock_mutex(&lock->wait_lock, flags); - /* once more, can we acquire the lock? */ - if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, 0) == 1)) + /* + * Once more, try to acquire the lock. Only try-lock the mutex if + * it is unlocked to reduce unnecessary xchg() operations. + */ + if (!mutex_is_locked(lock) && (atomic_xchg(&lock->count, 0) == 1)) goto skip_wait; debug_mutex_lock_common(lock, &waiter); @@ -506,9 +502,10 @@ slowpath: * it's unlocked. Later on, if we sleep, this is the * operation that gives us the lock. We xchg it to -1, so * that when we release the lock, we properly wake up the - * other waiters: + * other waiters. We only attempt the xchg if the count is + * non-negative in order to avoid unnecessary xchg operations: */ - if (MUTEX_SHOW_NO_WAITER(lock) && + if (atomic_read(&lock->count) >= 0 && (atomic_xchg(&lock->count, -1) == 1)) break; @@ -823,6 +820,10 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) unsigned long flags; int prev; + /* No need to trylock if the mutex is locked. 
*/ + if (mutex_is_locked(lock)) + return 0; + spin_lock_mutex(&lock->wait_lock, flags); prev = atomic_xchg(&lock->count, -1); diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index fb5b8ac411a5..f956ede7f90d 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -20,7 +20,6 @@ #include <linux/cpumask.h> #include <linux/percpu.h> #include <linux/hardirq.h> -#include <linux/mutex.h> #include <asm/qrwlock.h> /** @@ -35,7 +34,7 @@ static __always_inline void rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) { while ((cnts & _QW_WMASK) == _QW_LOCKED) { - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); cnts = smp_load_acquire((u32 *)&lock->cnts); } } @@ -75,7 +74,7 @@ void queue_read_lock_slowpath(struct qrwlock *lock) * to make sure that the write lock isn't taken. */ while (atomic_read(&lock->cnts) & _QW_WMASK) - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS; rspin_until_writer_unlock(lock, cnts); @@ -114,7 +113,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock) cnts | _QW_WAITING) == cnts)) break; - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); } /* When no more readers, set the locked flag */ @@ -125,7 +124,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock) _QW_LOCKED) == _QW_WAITING)) break; - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); } unlock: arch_spin_unlock(&lock->lock); diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 49b2ed3dced8..62b6cee8ea7f 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -66,12 +66,13 @@ void rt_mutex_debug_task_free(struct task_struct *task) * the deadlock. We print when we return. act_waiter can be NULL in * case of a remove waiter operation. 
*/ -void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, +void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk, + struct rt_mutex_waiter *act_waiter, struct rt_mutex *lock) { struct task_struct *task; - if (!debug_locks || detect || !act_waiter) + if (!debug_locks || chwalk == RT_MUTEX_FULL_CHAINWALK || !act_waiter) return; task = rt_mutex_owner(act_waiter->lock); diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index ab29b6a22669..d0519c3432b6 100644 --- a/kernel/locking/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h @@ -20,14 +20,15 @@ extern void debug_rt_mutex_unlock(struct rt_mutex *lock); extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner); extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); -extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter, +extern void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk, + struct rt_mutex_waiter *waiter, struct rt_mutex *lock); extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); # define debug_rt_mutex_reset_waiter(w) \ do { (w)->deadlock_lock = NULL; } while (0) -static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, - int detect) +static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, + enum rtmutex_chainwalk walk) { return (waiter != NULL); } diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index fc605941b9b8..a0ea2a141b3b 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -308,6 +308,32 @@ static void rt_mutex_adjust_prio(struct task_struct *task) } /* + * Deadlock detection is conditional: + * + * If CONFIG_DEBUG_RT_MUTEXES=n, deadlock detection is only conducted + * if the detect argument is == RT_MUTEX_FULL_CHAINWALK. + * + * If CONFIG_DEBUG_RT_MUTEXES=y, deadlock detection is always + * conducted independent of the detect argument. + * + * If the waiter argument is NULL this indicates the deboost path and + * deadlock detection is disabled independent of the detect argument + * and the config settings. + */ +static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter, + enum rtmutex_chainwalk chwalk) +{ + /* + * This is just a wrapper function for the following call, + * because debug_rt_mutex_detect_deadlock() smells like a magic + * debug feature and I wanted to keep the cond function in the + * main source file along with the comments instead of having + * two of the same in the headers. + */ + return debug_rt_mutex_detect_deadlock(waiter, chwalk); +} + +/* * Max number of times we'll walk the boosting chain: */ int max_lock_depth = 1024; @@ -337,21 +363,65 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) * @top_task: the current top waiter * * Returns 0 or -EDEADLK. + * + * Chain walk basics and protection scope + * + * [R] refcount on task + * [P] task->pi_lock held + * [L] rtmutex->wait_lock held + * + * Step Description Protected by + * function arguments: + * @task [R] + * @orig_lock if != NULL @top_task is blocked on it + * @next_lock Unprotected. Cannot be + * dereferenced. Only used for + * comparison. 
+ * @orig_waiter if != NULL @top_task is blocked on it + * @top_task current, or in case of proxy + * locking protected by calling + * code + * again: + * loop_sanity_check(); + * retry: + * [1] lock(task->pi_lock); [R] acquire [P] + * [2] waiter = task->pi_blocked_on; [P] + * [3] check_exit_conditions_1(); [P] + * [4] lock = waiter->lock; [P] + * [5] if (!try_lock(lock->wait_lock)) { [P] try to acquire [L] + * unlock(task->pi_lock); release [P] + * goto retry; + * } + * [6] check_exit_conditions_2(); [P] + [L] + * [7] requeue_lock_waiter(lock, waiter); [P] + [L] + * [8] unlock(task->pi_lock); release [P] + * put_task_struct(task); release [R] + * [9] check_exit_conditions_3(); [L] + * [10] task = owner(lock); [L] + * get_task_struct(task); [L] acquire [R] + * lock(task->pi_lock); [L] acquire [P] + * [11] requeue_pi_waiter(tsk, waiters(lock));[P] + [L] + * [12] check_exit_conditions_4(); [P] + [L] + * [13] unlock(task->pi_lock); release [P] + * unlock(lock->wait_lock); release [L] + * goto again; */ static int rt_mutex_adjust_prio_chain(struct task_struct *task, - int deadlock_detect, + enum rtmutex_chainwalk chwalk, struct rt_mutex *orig_lock, struct rt_mutex *next_lock, struct rt_mutex_waiter *orig_waiter, struct task_struct *top_task) { - struct rt_mutex *lock; struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; - int detect_deadlock, ret = 0, depth = 0; + struct rt_mutex_waiter *prerequeue_top_waiter; + int ret = 0, depth = 0; + struct rt_mutex *lock; + bool detect_deadlock; unsigned long flags; + bool requeue = true; - detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter, - deadlock_detect); + detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk); /* * The (de)boosting is a step by step approach with a lot of @@ -360,6 +430,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * carefully whether things change under us. */ again: + /* + * We limit the lock chain length for each invocation. + */ if (++depth > max_lock_depth) { static int prev_max; @@ -377,13 +450,28 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, return -EDEADLK; } + + /* + * We are fully preemptible here and only hold the refcount on + * @task. So everything can have changed under us since the + * caller or our own code below (goto retry/again) dropped all + * locks. + */ retry: /* - * Task can not go away as we did a get_task() before ! + * [1] Task cannot go away as we did a get_task() before ! */ raw_spin_lock_irqsave(&task->pi_lock, flags); + /* + * [2] Get the waiter on which @task is blocked on. + */ waiter = task->pi_blocked_on; + + /* + * [3] check_exit_conditions_1() protected by task->pi_lock. + */ + /* * Check whether the end of the boosting chain has been * reached or the state of the chain has changed while we @@ -421,20 +509,41 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, goto out_unlock_pi; /* * If deadlock detection is off, we stop here if we - * are not the top pi waiter of the task. + * are not the top pi waiter of the task. If deadlock + * detection is enabled we continue, but stop the + * requeueing in the chain walk. */ - if (!detect_deadlock && top_waiter != task_top_pi_waiter(task)) - goto out_unlock_pi; + if (top_waiter != task_top_pi_waiter(task)) { + if (!detect_deadlock) + goto out_unlock_pi; + else + requeue = false; + } } /* - * When deadlock detection is off then we check, if further - * priority adjustment is necessary. 
+ * If the waiter priority is the same as the task priority + * then there is no further priority adjustment necessary. If + * deadlock detection is off, we stop the chain walk. If its + * enabled we continue, but stop the requeueing in the chain + * walk. */ - if (!detect_deadlock && waiter->prio == task->prio) - goto out_unlock_pi; + if (waiter->prio == task->prio) { + if (!detect_deadlock) + goto out_unlock_pi; + else + requeue = false; + } + /* + * [4] Get the next lock + */ lock = waiter->lock; + /* + * [5] We need to trylock here as we are holding task->pi_lock, + * which is the reverse lock order versus the other rtmutex + * operations. + */ if (!raw_spin_trylock(&lock->wait_lock)) { raw_spin_unlock_irqrestore(&task->pi_lock, flags); cpu_relax(); @@ -442,79 +551,180 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, } /* + * [6] check_exit_conditions_2() protected by task->pi_lock and + * lock->wait_lock. + * * Deadlock detection. If the lock is the same as the original * lock which caused us to walk the lock chain or if the * current lock is owned by the task which initiated the chain * walk, we detected a deadlock. */ if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { - debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); + debug_rt_mutex_deadlock(chwalk, orig_waiter, lock); raw_spin_unlock(&lock->wait_lock); ret = -EDEADLK; goto out_unlock_pi; } - top_waiter = rt_mutex_top_waiter(lock); + /* + * If we just follow the lock chain for deadlock detection, no + * need to do all the requeue operations. To avoid a truckload + * of conditionals around the various places below, just do the + * minimum chain walk checks. + */ + if (!requeue) { + /* + * No requeue[7] here. Just release @task [8] + */ + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + put_task_struct(task); + + /* + * [9] check_exit_conditions_3 protected by lock->wait_lock. + * If there is no owner of the lock, end of chain. + */ + if (!rt_mutex_owner(lock)) { + raw_spin_unlock(&lock->wait_lock); + return 0; + } + + /* [10] Grab the next task, i.e. owner of @lock */ + task = rt_mutex_owner(lock); + get_task_struct(task); + raw_spin_lock_irqsave(&task->pi_lock, flags); + + /* + * No requeue [11] here. We just do deadlock detection. + * + * [12] Store whether owner is blocked + * itself. Decision is made after dropping the locks + */ + next_lock = task_blocked_on_lock(task); + /* + * Get the top waiter for the next iteration + */ + top_waiter = rt_mutex_top_waiter(lock); + + /* [13] Drop locks */ + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&lock->wait_lock); + + /* If owner is not blocked, end of chain. */ + if (!next_lock) + goto out_put_task; + goto again; + } - /* Requeue the waiter */ + /* + * Store the current top waiter before doing the requeue + * operation on @lock. We need it for the boost/deboost + * decision below. + */ + prerequeue_top_waiter = rt_mutex_top_waiter(lock); + + /* [7] Requeue the waiter in the lock waiter list. */ rt_mutex_dequeue(lock, waiter); waiter->prio = task->prio; rt_mutex_enqueue(lock, waiter); - /* Release the task */ + /* [8] Release the task */ raw_spin_unlock_irqrestore(&task->pi_lock, flags); + put_task_struct(task); + + /* + * [9] check_exit_conditions_3 protected by lock->wait_lock. + * + * We must abort the chain walk if there is no lock owner even + * in the dead lock detection case, as we have nothing to + * follow here. This is the end of the chain we are walking. 
+ */ if (!rt_mutex_owner(lock)) { /* - * If the requeue above changed the top waiter, then we need - * to wake the new top waiter up to try to get the lock. + * If the requeue [7] above changed the top waiter, + * then we need to wake the new top waiter up to try + * to get the lock. */ - - if (top_waiter != rt_mutex_top_waiter(lock)) + if (prerequeue_top_waiter != rt_mutex_top_waiter(lock)) wake_up_process(rt_mutex_top_waiter(lock)->task); raw_spin_unlock(&lock->wait_lock); - goto out_put_task; + return 0; } - put_task_struct(task); - /* Grab the next task */ + /* [10] Grab the next task, i.e. the owner of @lock */ task = rt_mutex_owner(lock); get_task_struct(task); raw_spin_lock_irqsave(&task->pi_lock, flags); + /* [11] requeue the pi waiters if necessary */ if (waiter == rt_mutex_top_waiter(lock)) { - /* Boost the owner */ - rt_mutex_dequeue_pi(task, top_waiter); + /* + * The waiter became the new top (highest priority) + * waiter on the lock. Replace the previous top waiter + * in the owner tasks pi waiters list with this waiter + * and adjust the priority of the owner. + */ + rt_mutex_dequeue_pi(task, prerequeue_top_waiter); rt_mutex_enqueue_pi(task, waiter); __rt_mutex_adjust_prio(task); - } else if (top_waiter == waiter) { - /* Deboost the owner */ + } else if (prerequeue_top_waiter == waiter) { + /* + * The waiter was the top waiter on the lock, but is + * no longer the top prority waiter. Replace waiter in + * the owner tasks pi waiters list with the new top + * (highest priority) waiter and adjust the priority + * of the owner. + * The new top waiter is stored in @waiter so that + * @waiter == @top_waiter evaluates to true below and + * we continue to deboost the rest of the chain. + */ rt_mutex_dequeue_pi(task, waiter); waiter = rt_mutex_top_waiter(lock); rt_mutex_enqueue_pi(task, waiter); __rt_mutex_adjust_prio(task); + } else { + /* + * Nothing changed. No need to do any priority + * adjustment. + */ } /* + * [12] check_exit_conditions_4() protected by task->pi_lock + * and lock->wait_lock. The actual decisions are made after we + * dropped the locks. + * * Check whether the task which owns the current lock is pi * blocked itself. If yes we store a pointer to the lock for * the lock chain change detection above. After we dropped * task->pi_lock next_lock cannot be dereferenced anymore. */ next_lock = task_blocked_on_lock(task); + /* + * Store the top waiter of @lock for the end of chain walk + * decision below. + */ + top_waiter = rt_mutex_top_waiter(lock); + /* [13] Drop the locks */ raw_spin_unlock_irqrestore(&task->pi_lock, flags); - - top_waiter = rt_mutex_top_waiter(lock); raw_spin_unlock(&lock->wait_lock); /* + * Make the actual exit decisions [12], based on the stored + * values. + * * We reached the end of the lock chain. Stop right here. No * point to go back just to figure that out. */ if (!next_lock) goto out_put_task; + /* + * If the current waiter is not the top waiter on the lock, + * then we can stop the chain walk here if we are not in full + * deadlock detection mode. + */ if (!detect_deadlock && waiter != top_waiter) goto out_put_task; @@ -533,76 +743,119 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * * Must be called with lock->wait_lock held. * - * @lock: the lock to be acquired. - * @task: the task which wants to acquire the lock - * @waiter: the waiter that is queued to the lock's wait list. (could be NULL) + * @lock: The lock to be acquired. 
+ * @task: The task which wants to acquire the lock + * @waiter: The waiter that is queued to the lock's wait list if the + * callsite called task_blocked_on_lock(), otherwise NULL */ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, - struct rt_mutex_waiter *waiter) + struct rt_mutex_waiter *waiter) { + unsigned long flags; + /* - * We have to be careful here if the atomic speedups are - * enabled, such that, when - * - no other waiter is on the lock - * - the lock has been released since we did the cmpxchg - * the lock can be released or taken while we are doing the - * checks and marking the lock with RT_MUTEX_HAS_WAITERS. + * Before testing whether we can acquire @lock, we set the + * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all + * other tasks which try to modify @lock into the slow path + * and they serialize on @lock->wait_lock. + * + * The RT_MUTEX_HAS_WAITERS bit can have a transitional state + * as explained at the top of this file if and only if: * - * The atomic acquire/release aware variant of - * mark_rt_mutex_waiters uses a cmpxchg loop. After setting - * the WAITERS bit, the atomic release / acquire can not - * happen anymore and lock->wait_lock protects us from the - * non-atomic case. + * - There is a lock owner. The caller must fixup the + * transient state if it does a trylock or leaves the lock + * function due to a signal or timeout. * - * Note, that this might set lock->owner = - * RT_MUTEX_HAS_WAITERS in the case the lock is not contended - * any more. This is fixed up when we take the ownership. - * This is the transitional state explained at the top of this file. + * - @task acquires the lock and there are no other + * waiters. This is undone in rt_mutex_set_owner(@task) at + * the end of this function. */ mark_rt_mutex_waiters(lock); + /* + * If @lock has an owner, give up. + */ if (rt_mutex_owner(lock)) return 0; /* - * It will get the lock because of one of these conditions: - * 1) there is no waiter - * 2) higher priority than waiters - * 3) it is top waiter + * If @waiter != NULL, @task has already enqueued the waiter + * into @lock waiter list. If @waiter == NULL then this is a + * trylock attempt. */ - if (rt_mutex_has_waiters(lock)) { - if (task->prio >= rt_mutex_top_waiter(lock)->prio) { - if (!waiter || waiter != rt_mutex_top_waiter(lock)) - return 0; - } - } - - if (waiter || rt_mutex_has_waiters(lock)) { - unsigned long flags; - struct rt_mutex_waiter *top; - - raw_spin_lock_irqsave(&task->pi_lock, flags); + if (waiter) { + /* + * If waiter is not the highest priority waiter of + * @lock, give up. + */ + if (waiter != rt_mutex_top_waiter(lock)) + return 0; - /* remove the queued waiter. */ - if (waiter) { - rt_mutex_dequeue(lock, waiter); - task->pi_blocked_on = NULL; - } + /* + * We can acquire the lock. Remove the waiter from the + * lock waiters list. + */ + rt_mutex_dequeue(lock, waiter); + } else { /* - * We have to enqueue the top waiter(if it exists) into - * task->pi_waiters list. + * If the lock has waiters already we check whether @task is + * eligible to take over the lock. + * + * If there are no other waiters, @task can acquire + * the lock. @task->pi_blocked_on is NULL, so it does + * not need to be dequeued. */ if (rt_mutex_has_waiters(lock)) { - top = rt_mutex_top_waiter(lock); - rt_mutex_enqueue_pi(task, top); + /* + * If @task->prio is greater than or equal to + * the top waiter priority (kernel view), + * @task lost. 
+ */ + if (task->prio >= rt_mutex_top_waiter(lock)->prio) + return 0; + + /* + * The current top waiter stays enqueued. We + * don't have to change anything in the lock + * waiters order. + */ + } else { + /* + * No waiters. Take the lock without the + * pi_lock dance.@task->pi_blocked_on is NULL + * and we have no waiters to enqueue in @task + * pi waiters list. + */ + goto takeit; } - raw_spin_unlock_irqrestore(&task->pi_lock, flags); } + /* + * Clear @task->pi_blocked_on. Requires protection by + * @task->pi_lock. Redundant operation for the @waiter == NULL + * case, but conditionals are more expensive than a redundant + * store. + */ + raw_spin_lock_irqsave(&task->pi_lock, flags); + task->pi_blocked_on = NULL; + /* + * Finish the lock acquisition. @task is the new owner. If + * other waiters exist we have to insert the highest priority + * waiter into @task->pi_waiters list. + */ + if (rt_mutex_has_waiters(lock)) + rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock)); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + +takeit: /* We got the lock. */ debug_rt_mutex_lock(lock); + /* + * This either preserves the RT_MUTEX_HAS_WAITERS bit if there + * are still waiters or clears it. + */ rt_mutex_set_owner(lock, task); rt_mutex_deadlock_account_lock(lock, task); @@ -620,7 +873,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct task_struct *task, - int detect_deadlock) + enum rtmutex_chainwalk chwalk) { struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex_waiter *top_waiter = waiter; @@ -666,7 +919,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, __rt_mutex_adjust_prio(owner); if (owner->pi_blocked_on) chain_walk = 1; - } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) { + } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) { chain_walk = 1; } @@ -691,7 +944,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, raw_spin_unlock(&lock->wait_lock); - res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, + res = rt_mutex_adjust_prio_chain(owner, chwalk, lock, next_lock, waiter, task); raw_spin_lock(&lock->wait_lock); @@ -753,9 +1006,9 @@ static void wakeup_next_waiter(struct rt_mutex *lock) static void remove_waiter(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) { - int first = (waiter == rt_mutex_top_waiter(lock)); + bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); - struct rt_mutex *next_lock = NULL; + struct rt_mutex *next_lock; unsigned long flags; raw_spin_lock_irqsave(¤t->pi_lock, flags); @@ -763,29 +1016,31 @@ static void remove_waiter(struct rt_mutex *lock, current->pi_blocked_on = NULL; raw_spin_unlock_irqrestore(¤t->pi_lock, flags); - if (!owner) + /* + * Only update priority if the waiter was the highest priority + * waiter of the lock and there is an owner to update. 
+ */ + if (!owner || !is_top_waiter) return; - if (first) { - - raw_spin_lock_irqsave(&owner->pi_lock, flags); + raw_spin_lock_irqsave(&owner->pi_lock, flags); - rt_mutex_dequeue_pi(owner, waiter); + rt_mutex_dequeue_pi(owner, waiter); - if (rt_mutex_has_waiters(lock)) { - struct rt_mutex_waiter *next; + if (rt_mutex_has_waiters(lock)) + rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock)); - next = rt_mutex_top_waiter(lock); - rt_mutex_enqueue_pi(owner, next); - } - __rt_mutex_adjust_prio(owner); + __rt_mutex_adjust_prio(owner); - /* Store the lock on which owner is blocked or NULL */ - next_lock = task_blocked_on_lock(owner); + /* Store the lock on which owner is blocked or NULL */ + next_lock = task_blocked_on_lock(owner); - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); - } + raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + /* + * Don't walk the chain, if the owner task is not blocked + * itself. + */ if (!next_lock) return; @@ -794,7 +1049,8 @@ static void remove_waiter(struct rt_mutex *lock, raw_spin_unlock(&lock->wait_lock); - rt_mutex_adjust_prio_chain(owner, 0, lock, next_lock, NULL, current); + rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock, + next_lock, NULL, current); raw_spin_lock(&lock->wait_lock); } @@ -824,7 +1080,8 @@ void rt_mutex_adjust_pi(struct task_struct *task) /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(task); - rt_mutex_adjust_prio_chain(task, 0, NULL, next_lock, NULL, task); + rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL, + next_lock, NULL, task); } /** @@ -902,7 +1159,7 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock, static int __sched rt_mutex_slowlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - int detect_deadlock) + enum rtmutex_chainwalk chwalk) { struct rt_mutex_waiter waiter; int ret = 0; @@ -928,7 +1185,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, timeout->task = NULL; } - ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock); + ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk); if (likely(!ret)) ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); @@ -937,7 +1194,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, if (unlikely(ret)) { remove_waiter(lock, &waiter); - rt_mutex_handle_deadlock(ret, detect_deadlock, &waiter); + rt_mutex_handle_deadlock(ret, chwalk, &waiter); } /* @@ -960,22 +1217,31 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, /* * Slow path try-lock function: */ -static inline int -rt_mutex_slowtrylock(struct rt_mutex *lock) +static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) { - int ret = 0; + int ret; + + /* + * If the lock already has an owner we fail to get the lock. + * This can be done without taking the @lock->wait_lock as + * it is only being read, and this is a trylock anyway. + */ + if (rt_mutex_owner(lock)) + return 0; + /* + * The mutex has currently no owner. Lock the wait lock and + * try to acquire the lock. + */ raw_spin_lock(&lock->wait_lock); - if (likely(rt_mutex_owner(lock) != current)) { + ret = try_to_take_rt_mutex(lock, current, NULL); - ret = try_to_take_rt_mutex(lock, current, NULL); - /* - * try_to_take_rt_mutex() sets the lock waiters - * bit unconditionally. Clean this up. - */ - fixup_rt_mutex_waiters(lock); - } + /* + * try_to_take_rt_mutex() sets the lock waiters bit + * unconditionally. Clean this up. 
+ */ + fixup_rt_mutex_waiters(lock); raw_spin_unlock(&lock->wait_lock); @@ -1053,30 +1319,31 @@ rt_mutex_slowunlock(struct rt_mutex *lock) */ static inline int rt_mutex_fastlock(struct rt_mutex *lock, int state, - int detect_deadlock, int (*slowfn)(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - int detect_deadlock)) + enum rtmutex_chainwalk chwalk)) { - if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { + if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { rt_mutex_deadlock_account_lock(lock, current); return 0; } else - return slowfn(lock, state, NULL, detect_deadlock); + return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); } static inline int rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, int detect_deadlock, + struct hrtimer_sleeper *timeout, + enum rtmutex_chainwalk chwalk, int (*slowfn)(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - int detect_deadlock)) + enum rtmutex_chainwalk chwalk)) { - if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { + if (chwalk == RT_MUTEX_MIN_CHAINWALK && + likely(rt_mutex_cmpxchg(lock, NULL, current))) { rt_mutex_deadlock_account_lock(lock, current); return 0; } else - return slowfn(lock, state, timeout, detect_deadlock); + return slowfn(lock, state, timeout, chwalk); } static inline int @@ -1109,54 +1376,61 @@ void __sched rt_mutex_lock(struct rt_mutex *lock) { might_sleep(); - rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock); + rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock); } EXPORT_SYMBOL_GPL(rt_mutex_lock); /** * rt_mutex_lock_interruptible - lock a rt_mutex interruptible * - * @lock: the rt_mutex to be locked - * @detect_deadlock: deadlock detection on/off + * @lock: the rt_mutex to be locked * * Returns: - * 0 on success - * -EINTR when interrupted by a signal - * -EDEADLK when the lock would deadlock (when deadlock detection is on) + * 0 on success + * -EINTR when interrupted by a signal */ -int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, - int detect_deadlock) +int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) { might_sleep(); - return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, - detect_deadlock, rt_mutex_slowlock); + return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock); } EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); +/* + * Futex variant with full deadlock detection. 
+ */ +int rt_mutex_timed_futex_lock(struct rt_mutex *lock, + struct hrtimer_sleeper *timeout) +{ + might_sleep(); + + return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, + RT_MUTEX_FULL_CHAINWALK, + rt_mutex_slowlock); +} + /** * rt_mutex_timed_lock - lock a rt_mutex interruptible * the timeout structure is provided * by the caller * - * @lock: the rt_mutex to be locked + * @lock: the rt_mutex to be locked * @timeout: timeout structure or NULL (no timeout) - * @detect_deadlock: deadlock detection on/off * * Returns: - * 0 on success - * -EINTR when interrupted by a signal + * 0 on success + * -EINTR when interrupted by a signal * -ETIMEDOUT when the timeout expired - * -EDEADLK when the lock would deadlock (when deadlock detection is on) */ int -rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, - int detect_deadlock) +rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout) { might_sleep(); return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, - detect_deadlock, rt_mutex_slowlock); + RT_MUTEX_MIN_CHAINWALK, + rt_mutex_slowlock); } EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); @@ -1262,7 +1536,6 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, * @lock: the rt_mutex to take * @waiter: the pre-initialized rt_mutex_waiter * @task: the task to prepare - * @detect_deadlock: perform deadlock detection (1) or not (0) * * Returns: * 0 - task blocked on lock @@ -1273,7 +1546,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, */ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, - struct task_struct *task, int detect_deadlock) + struct task_struct *task) { int ret; @@ -1285,7 +1558,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, } /* We enforce deadlock detection for futexes */ - ret = task_blocks_on_rt_mutex(lock, waiter, task, 1); + ret = task_blocks_on_rt_mutex(lock, waiter, task, + RT_MUTEX_FULL_CHAINWALK); if (ret && !rt_mutex_owner(lock)) { /* @@ -1331,22 +1605,20 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) * rt_mutex_finish_proxy_lock() - Complete lock acquisition * @lock: the rt_mutex we were woken on * @to: the timeout, null if none. hrtimer should already have - * been started. + * been started. * @waiter: the pre-initialized rt_mutex_waiter - * @detect_deadlock: perform deadlock detection (1) or not (0) * * Complete the lock acquisition started our behalf by another thread. 
* * Returns: * 0 - success - * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK + * <0 - error, one of -EINTR, -ETIMEDOUT * * Special API call for PI-futex requeue support */ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, struct hrtimer_sleeper *to, - struct rt_mutex_waiter *waiter, - int detect_deadlock) + struct rt_mutex_waiter *waiter) { int ret; diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h index f6a1f3c133b1..c4060584c407 100644 --- a/kernel/locking/rtmutex.h +++ b/kernel/locking/rtmutex.h @@ -22,10 +22,15 @@ #define debug_rt_mutex_init(m, n) do { } while (0) #define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) #define debug_rt_mutex_print_deadlock(w) do { } while (0) -#define debug_rt_mutex_detect_deadlock(w,d) (d) #define debug_rt_mutex_reset_waiter(w) do { } while (0) static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) { WARN(1, "rtmutex deadlock detected\n"); } + +static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *w, + enum rtmutex_chainwalk walk) +{ + return walk == RT_MUTEX_FULL_CHAINWALK; +} diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 7431a9c86f35..855212501407 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -102,6 +102,21 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) } /* + * Constants for rt mutex functions which have a selectable deadlock + * detection. + * + * RT_MUTEX_MIN_CHAINWALK: Stops the lock chain walk when there are + * no further PI adjustments to be made. + * + * RT_MUTEX_FULL_CHAINWALK: Invoke deadlock detection with a full + * walk of the lock chain. + */ +enum rtmutex_chainwalk { + RT_MUTEX_MIN_CHAINWALK, + RT_MUTEX_FULL_CHAINWALK, +}; + +/* * PI-futex support (proxy locking functions, etc.): */ extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); @@ -111,12 +126,11 @@ extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner); extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, - struct task_struct *task, - int detect_deadlock); + struct task_struct *task); extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, struct hrtimer_sleeper *to, - struct rt_mutex_waiter *waiter, - int detect_deadlock); + struct rt_mutex_waiter *waiter); +extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to); #ifdef CONFIG_DEBUG_RT_MUTEXES # include "rtmutex-debug.h" diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index a2391ac135c8..d6203faf2eb1 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -329,7 +329,7 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) if (need_resched()) break; - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); } rcu_read_unlock(); @@ -381,7 +381,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) * memory barriers as we'll eventually observe the right * values at the cost of a few extra spins. 
*/ - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); } osq_unlock(&sem->osq); done: diff --git a/kernel/module.c b/kernel/module.c index 81e727cf6df9..ae79ce615cb9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -60,7 +60,6 @@ #include <linux/jump_label.h> #include <linux/pfn.h> #include <linux/bsearch.h> -#include <linux/fips.h> #include <uapi/linux/module.h> #include "module-internal.h" @@ -2448,9 +2447,6 @@ static int module_sig_check(struct load_info *info) } /* Not having a signature is only an error if we're strict. */ - if (err < 0 && fips_enabled) - panic("Module verification failed with error %d in FIPS mode\n", - err); if (err == -ENOKEY && !sig_enforce) err = 0; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index fcc2611d3f14..a9dfa79b6bab 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -371,7 +371,6 @@ int hibernation_snapshot(int platform_mode) } suspend_console(); - ftrace_stop(); pm_restrict_gfp_mask(); error = dpm_suspend(PMSG_FREEZE); @@ -397,7 +396,6 @@ int hibernation_snapshot(int platform_mode) if (error || !in_suspend) pm_restore_gfp_mask(); - ftrace_start(); resume_console(); dpm_complete(msg); @@ -500,7 +498,6 @@ int hibernation_restore(int platform_mode) pm_prepare_console(); suspend_console(); - ftrace_stop(); pm_restrict_gfp_mask(); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { @@ -508,7 +505,6 @@ int hibernation_restore(int platform_mode) dpm_resume_end(PMSG_RECOVER); } pm_restore_gfp_mask(); - ftrace_start(); resume_console(); pm_restore_console(); return error; @@ -535,7 +531,6 @@ int hibernation_platform_enter(void) entering_platform_hibernation = true; suspend_console(); - ftrace_stop(); error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -579,7 +574,6 @@ int hibernation_platform_enter(void) Resume_devices: entering_platform_hibernation = false; dpm_resume_end(PMSG_RESTORE); - ftrace_start(); resume_console(); Close: diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ed35a4790afe..4b736b4dfa96 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -248,7 +248,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) goto Platform_wake; } - ftrace_stop(); error = disable_nonboot_cpus(); if (error || suspend_test(TEST_CPUS)) goto Enable_cpus; @@ -275,7 +274,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) Enable_cpus: enable_nonboot_cpus(); - ftrace_start(); Platform_wake: if (need_suspend_ops(state) && suspend_ops->wake) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index bfda2726ca45..ff1a6de62f17 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -99,6 +99,10 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) void kfree(const void *); +/* + * Reclaim the specified callback, either by invoking it (non-lazy case) + * or freeing it directly (lazy case). Return true if lazy, false otherwise. 
+ */ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) { unsigned long offset = (unsigned long)head->func; @@ -108,12 +112,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); kfree((void *)head - offset); rcu_lock_release(&rcu_callback_map); - return 1; + return true; } else { RCU_TRACE(trace_rcu_invoke_callback(rn, head)); head->func(head); rcu_lock_release(&rcu_callback_map); - return 0; + return false; } } diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7fa34f86e5ba..948a7693748e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -18,7 +18,7 @@ * Copyright (C) IBM Corporation, 2005, 2006 * * Authors: Paul E. McKenney <paulmck@us.ibm.com> - * Josh Triplett <josh@freedesktop.org> + * Josh Triplett <josh@joshtriplett.org> * * See also: Documentation/RCU/torture.txt */ @@ -51,7 +51,7 @@ #include <linux/torture.h> MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); torture_param(int, fqs_duration, 0, diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index c639556f3fa0..e037f3eb2f7b 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -298,9 +298,9 @@ int __srcu_read_lock(struct srcu_struct *sp) idx = ACCESS_ONCE(sp->completed) & 0x1; preempt_disable(); - ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; + __this_cpu_inc(sp->per_cpu_ref->c[idx]); smp_mb(); /* B */ /* Avoid leaking the critical section. */ - ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; + __this_cpu_inc(sp->per_cpu_ref->seq[idx]); preempt_enable(); return idx; } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 625d0b0cd75a..1b70cb6fbe3c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1013,10 +1013,7 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) } /* - * Dump stacks of all tasks running on stalled CPUs. This is a fallback - * for architectures that do not implement trigger_all_cpu_backtrace(). - * The NMI-triggered stack traces are more accurate because they are - * printed by the target CPU. + * Dump stacks of all tasks running on stalled CPUs. */ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) { @@ -1094,7 +1091,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) (long)rsp->gpnum, (long)rsp->completed, totqlen); if (ndetected == 0) pr_err("INFO: Stall ended before state dump start\n"); - else if (!trigger_all_cpu_backtrace()) + else rcu_dump_cpu_stacks(rsp); /* Complain about tasks blocking the grace period. */ @@ -1125,8 +1122,7 @@ static void print_cpu_stall(struct rcu_state *rsp) pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", jiffies - rsp->gp_start, (long)rsp->gpnum, (long)rsp->completed, totqlen); - if (!trigger_all_cpu_backtrace()) - dump_stack(); + rcu_dump_cpu_stacks(rsp); raw_spin_lock_irqsave(&rnp->lock, flags); if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall))) @@ -1305,10 +1301,16 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, * believe that a grace period is in progress, then we must wait * for the one following, which is in "c". Because our request * will be noticed at the end of the current grace period, we don't - * need to explicitly start one. + * need to explicitly start one. 
We only do the lockless check + * of rnp_root's fields if the current rcu_node structure thinks + * there is no grace period in flight, and because we hold rnp->lock, + * the only possible change is when rnp_root's two fields are + * equal, in which case rnp_root->gpnum might be concurrently + * incremented. But that is OK, as it will just result in our + * doing some extra useless work. */ if (rnp->gpnum != rnp->completed || - ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { + ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) { rnp->need_future_gp[c & 0x1]++; trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); goto out; @@ -1645,11 +1647,6 @@ static int rcu_gp_init(struct rcu_state *rsp) rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); raw_spin_unlock_irq(&rnp->lock); -#ifdef CONFIG_PROVE_RCU_DELAY - if ((prandom_u32() % (rcu_num_nodes + 1)) == 0 && - system_state == SYSTEM_RUNNING) - udelay(200); -#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ cond_resched(); } @@ -2347,7 +2344,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) } smp_mb(); /* List handling before counting for rcu_barrier(). */ rdp->qlen_lazy -= count_lazy; - ACCESS_ONCE(rdp->qlen) -= count; + ACCESS_ONCE(rdp->qlen) = rdp->qlen - count; rdp->n_cbs_invoked += count; /* Reinstate batch limit if we have worked down the excess. */ @@ -2485,14 +2482,14 @@ static void force_quiescent_state(struct rcu_state *rsp) struct rcu_node *rnp_old = NULL; /* Funnel through hierarchy to reduce memory contention. */ - rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; + rnp = __this_cpu_read(rsp->rda->mynode); for (; rnp != NULL; rnp = rnp->parent) { ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) || !raw_spin_trylock(&rnp->fqslock); if (rnp_old != NULL) raw_spin_unlock(&rnp_old->fqslock); if (ret) { - ACCESS_ONCE(rsp->n_force_qs_lh)++; + rsp->n_force_qs_lh++; return; } rnp_old = rnp; @@ -2504,7 +2501,7 @@ static void force_quiescent_state(struct rcu_state *rsp) smp_mb__after_unlock_lock(); raw_spin_unlock(&rnp_old->fqslock); if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { - ACCESS_ONCE(rsp->n_force_qs_lh)++; + rsp->n_force_qs_lh++; raw_spin_unlock_irqrestore(&rnp_old->lock, flags); return; /* Someone beat us to it. */ } @@ -2662,7 +2659,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), unsigned long flags; struct rcu_data *rdp; - WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ + WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */ if (debug_rcu_head_queue(head)) { /* Probable double call_rcu(), so leak the callback. */ ACCESS_ONCE(head->func) = rcu_leak_callback; @@ -2693,7 +2690,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), local_irq_restore(flags); return; } - ACCESS_ONCE(rdp->qlen)++; + ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1; if (lazy) rdp->qlen_lazy++; else @@ -3257,7 +3254,7 @@ static void _rcu_barrier(struct rcu_state *rsp) * ACCESS_ONCE() to prevent the compiler from speculating * the increment to precede the early-exit check. */ - ACCESS_ONCE(rsp->n_barrier_done)++; + ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1; WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done); smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ @@ -3307,7 +3304,7 @@ static void _rcu_barrier(struct rcu_state *rsp) /* Increment ->n_barrier_done to prevent duplicate work. */ smp_mb(); /* Keep increment after above mechanism. 
*/ - ACCESS_ONCE(rsp->n_barrier_done)++; + ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1; WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done); smp_mb(); /* Keep increment before caller's subsequent code. */ @@ -3564,14 +3561,16 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) static void __init rcu_init_one(struct rcu_state *rsp, struct rcu_data __percpu *rda) { - static char *buf[] = { "rcu_node_0", - "rcu_node_1", - "rcu_node_2", - "rcu_node_3" }; /* Match MAX_RCU_LVLS */ - static char *fqs[] = { "rcu_node_fqs_0", - "rcu_node_fqs_1", - "rcu_node_fqs_2", - "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ + static const char * const buf[] = { + "rcu_node_0", + "rcu_node_1", + "rcu_node_2", + "rcu_node_3" }; /* Match MAX_RCU_LVLS */ + static const char * const fqs[] = { + "rcu_node_fqs_0", + "rcu_node_fqs_1", + "rcu_node_fqs_2", + "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ static u8 fl_mask = 0x1; int cpustride = 1; int i; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 0f69a79c5b7d..71e64c718f75 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -172,6 +172,14 @@ struct rcu_node { /* queued on this rcu_node structure that */ /* are blocking the current grace period, */ /* there can be no such task. */ + struct completion boost_completion; + /* Used to ensure that the rt_mutex used */ + /* to carry out the boosting is fully */ + /* released with no future boostee accesses */ + /* before that rt_mutex is re-initialized. */ + struct rt_mutex boost_mtx; + /* Used only for the priority-boosting */ + /* side effect, not as a lock. */ unsigned long boost_time; /* When to start boosting (jiffies). */ struct task_struct *boost_kthread_task; @@ -334,11 +342,29 @@ struct rcu_data { struct rcu_head **nocb_tail; atomic_long_t nocb_q_count; /* # CBs waiting for kthread */ atomic_long_t nocb_q_count_lazy; /* (approximate). */ + struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ + struct rcu_head **nocb_follower_tail; + atomic_long_t nocb_follower_count; /* # CBs ready to invoke. */ + atomic_long_t nocb_follower_count_lazy; /* (approximate). */ int nocb_p_count; /* # CBs being invoked by kthread */ int nocb_p_count_lazy; /* (approximate). */ wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_kthread; bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ + + /* The following fields are used by the leader, hence own cacheline. */ + struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; + /* CBs waiting for GP. */ + struct rcu_head **nocb_gp_tail; + long nocb_gp_count; + long nocb_gp_count_lazy; + bool nocb_leader_wake; /* Is the nocb leader thread awake? */ + struct rcu_data *nocb_next_follower; + /* Next follower in wakeup chain. */ + + /* The following fields are used by the follower, hence new cachline. */ + struct rcu_data *nocb_leader ____cacheline_internodealigned_in_smp; + /* Leader CPU takes GP-end wakeups. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ /* 8) RCU CPU stall data. */ @@ -587,8 +613,14 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp); /* Sum up queue lengths for tracing. 
*/ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) { - *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count; - *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy; + *ql = atomic_long_read(&rdp->nocb_q_count) + + rdp->nocb_p_count + + atomic_long_read(&rdp->nocb_follower_count) + + rdp->nocb_p_count + rdp->nocb_gp_count; + *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + + rdp->nocb_p_count_lazy + + atomic_long_read(&rdp->nocb_follower_count_lazy) + + rdp->nocb_p_count_lazy + rdp->nocb_gp_count_lazy; } #else /* #ifdef CONFIG_RCU_NOCB_CPU */ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 02ac0fb186b8..00dc411e9676 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -33,6 +33,7 @@ #define RCU_KTHREAD_PRIO 1 #ifdef CONFIG_RCU_BOOST +#include "../locking/rtmutex_common.h" #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO #else #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO @@ -336,7 +337,7 @@ void rcu_read_unlock_special(struct task_struct *t) unsigned long flags; struct list_head *np; #ifdef CONFIG_RCU_BOOST - struct rt_mutex *rbmp = NULL; + bool drop_boost_mutex = false; #endif /* #ifdef CONFIG_RCU_BOOST */ struct rcu_node *rnp; int special; @@ -398,11 +399,8 @@ void rcu_read_unlock_special(struct task_struct *t) #ifdef CONFIG_RCU_BOOST if (&t->rcu_node_entry == rnp->boost_tasks) rnp->boost_tasks = np; - /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */ - if (t->rcu_boost_mutex) { - rbmp = t->rcu_boost_mutex; - t->rcu_boost_mutex = NULL; - } + /* Snapshot ->boost_mtx ownership with rcu_node lock held. */ + drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; #endif /* #ifdef CONFIG_RCU_BOOST */ /* @@ -427,8 +425,10 @@ void rcu_read_unlock_special(struct task_struct *t) #ifdef CONFIG_RCU_BOOST /* Unboost if we were boosted. */ - if (rbmp) - rt_mutex_unlock(rbmp); + if (drop_boost_mutex) { + rt_mutex_unlock(&rnp->boost_mtx); + complete(&rnp->boost_completion); + } #endif /* #ifdef CONFIG_RCU_BOOST */ /* @@ -988,6 +988,7 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) /* Because preemptible RCU does not exist, no quieting of tasks. */ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) + __releases(rnp->lock) { raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -1149,7 +1150,6 @@ static void rcu_wake_cond(struct task_struct *t, int status) static int rcu_boost(struct rcu_node *rnp) { unsigned long flags; - struct rt_mutex mtx; struct task_struct *t; struct list_head *tb; @@ -1200,11 +1200,15 @@ static int rcu_boost(struct rcu_node *rnp) * section. */ t = container_of(tb, struct task_struct, rcu_node_entry); - rt_mutex_init_proxy_locked(&mtx, t); - t->rcu_boost_mutex = &mtx; + rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); + init_completion(&rnp->boost_completion); raw_spin_unlock_irqrestore(&rnp->lock, flags); - rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ - rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ + /* Lock only for side effect: boosts task t's priority. */ + rt_mutex_lock(&rnp->boost_mtx); + rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ + + /* Wait for boostee to be done w/boost_mtx before reinitializing. 
*/ + wait_for_completion(&rnp->boost_completion); return ACCESS_ONCE(rnp->exp_tasks) != NULL || ACCESS_ONCE(rnp->boost_tasks) != NULL; @@ -1256,6 +1260,7 @@ static int rcu_boost_kthread(void *arg) * about it going away. */ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) + __releases(rnp->lock) { struct task_struct *t; @@ -1491,6 +1496,7 @@ static void rcu_prepare_kthreads(int cpu) #else /* #ifdef CONFIG_RCU_BOOST */ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) + __releases(rnp->lock) { raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -2060,6 +2066,22 @@ bool rcu_is_nocb_cpu(int cpu) #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ /* + * Kick the leader kthread for this NOCB group. + */ +static void wake_nocb_leader(struct rcu_data *rdp, bool force) +{ + struct rcu_data *rdp_leader = rdp->nocb_leader; + + if (!ACCESS_ONCE(rdp_leader->nocb_kthread)) + return; + if (!ACCESS_ONCE(rdp_leader->nocb_leader_wake) || force) { + /* Prior xchg orders against prior callback enqueue. */ + ACCESS_ONCE(rdp_leader->nocb_leader_wake) = true; + wake_up(&rdp_leader->nocb_wq); + } +} + +/* * Enqueue the specified string of rcu_head structures onto the specified * CPU's no-CBs lists. The CPU is specified by rdp, the head of the * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy @@ -2093,7 +2115,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, len = atomic_long_read(&rdp->nocb_q_count); if (old_rhpp == &rdp->nocb_head) { if (!irqs_disabled_flags(flags)) { - wake_up(&rdp->nocb_wq); /* ... if queue was empty ... */ + /* ... if queue was empty ... */ + wake_nocb_leader(rdp, false); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); } else { @@ -2103,7 +2126,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, } rdp->qlen_last_fqs_check = 0; } else if (len > rdp->qlen_last_fqs_check + qhimark) { - wake_up_process(t); /* ... or if many callbacks queued. */ + /* ... or if many callbacks queued. */ + wake_nocb_leader(rdp, true); rdp->qlen_last_fqs_check = LONG_MAX / 2; trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); } else { @@ -2213,13 +2237,150 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) } /* + * Leaders come here to wait for additional callbacks to show up. + * This function does not return until callbacks appear. + */ +static void nocb_leader_wait(struct rcu_data *my_rdp) +{ + bool firsttime = true; + bool gotcbs; + struct rcu_data *rdp; + struct rcu_head **tail; + +wait_again: + + /* Wait for callbacks to appear. */ + if (!rcu_nocb_poll) { + trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); + wait_event_interruptible(my_rdp->nocb_wq, + ACCESS_ONCE(my_rdp->nocb_leader_wake)); + /* Memory barrier handled by smp_mb() calls below and repoll. */ + } else if (firsttime) { + firsttime = false; /* Don't drown trace log with "Poll"! */ + trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll"); + } + + /* + * Each pass through the following loop checks a follower for CBs. + * We are our own first follower. Any CBs found are moved to + * nocb_gp_head, where they await a grace period. + */ + gotcbs = false; + for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { + rdp->nocb_gp_head = ACCESS_ONCE(rdp->nocb_head); + if (!rdp->nocb_gp_head) + continue; /* No CBs here, try next follower. */ + + /* Move callbacks to wait-for-GP list, which is empty. 
*/ + ACCESS_ONCE(rdp->nocb_head) = NULL; + rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); + rdp->nocb_gp_count = atomic_long_xchg(&rdp->nocb_q_count, 0); + rdp->nocb_gp_count_lazy = + atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); + gotcbs = true; + } + + /* + * If there were no callbacks, sleep a bit, rescan after a + * memory barrier, and go retry. + */ + if (unlikely(!gotcbs)) { + if (!rcu_nocb_poll) + trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, + "WokeEmpty"); + flush_signals(current); + schedule_timeout_interruptible(1); + + /* Rescan in case we were a victim of memory ordering. */ + my_rdp->nocb_leader_wake = false; + smp_mb(); /* Ensure _wake false before scan. */ + for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) + if (ACCESS_ONCE(rdp->nocb_head)) { + /* Found CB, so short-circuit next wait. */ + my_rdp->nocb_leader_wake = true; + break; + } + goto wait_again; + } + + /* Wait for one grace period. */ + rcu_nocb_wait_gp(my_rdp); + + /* + * We left ->nocb_leader_wake set to reduce cache thrashing. + * We clear it now, but recheck for new callbacks while + * traversing our follower list. + */ + my_rdp->nocb_leader_wake = false; + smp_mb(); /* Ensure _wake false before scan of ->nocb_head. */ + + /* Each pass through the following loop wakes a follower, if needed. */ + for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { + if (ACCESS_ONCE(rdp->nocb_head)) + my_rdp->nocb_leader_wake = true; /* No need to wait. */ + if (!rdp->nocb_gp_head) + continue; /* No CBs, so no need to wake follower. */ + + /* Append callbacks to follower's "done" list. */ + tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail); + *tail = rdp->nocb_gp_head; + atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count); + atomic_long_add(rdp->nocb_gp_count_lazy, + &rdp->nocb_follower_count_lazy); + if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { + /* + * List was empty, wake up the follower. + * Memory barriers supplied by atomic_long_add(). + */ + wake_up(&rdp->nocb_wq); + } + } + + /* If we (the leader) don't have CBs, go wait some more. */ + if (!my_rdp->nocb_follower_head) + goto wait_again; +} + +/* + * Followers come here to wait for additional callbacks to show up. + * This function does not return until callbacks appear. + */ +static void nocb_follower_wait(struct rcu_data *rdp) +{ + bool firsttime = true; + + for (;;) { + if (!rcu_nocb_poll) { + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + "FollowerSleep"); + wait_event_interruptible(rdp->nocb_wq, + ACCESS_ONCE(rdp->nocb_follower_head)); + } else if (firsttime) { + /* Don't drown trace log with "Poll"! */ + firsttime = false; + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll"); + } + if (smp_load_acquire(&rdp->nocb_follower_head)) { + /* ^^^ Ensure CB invocation follows _head test. */ + return; + } + if (!rcu_nocb_poll) + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + "WokeEmpty"); + flush_signals(current); + schedule_timeout_interruptible(1); + } +} + +/* * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes - * callbacks queued by the corresponding no-CBs CPU. + * callbacks queued by the corresponding no-CBs CPU, however, there is + * an optional leader-follower relationship so that the grace-period + * kthreads don't have to do quite so many wakeups. 
*/ static int rcu_nocb_kthread(void *arg) { int c, cl; - bool firsttime = 1; struct rcu_head *list; struct rcu_head *next; struct rcu_head **tail; @@ -2227,41 +2388,22 @@ static int rcu_nocb_kthread(void *arg) /* Each pass through this loop invokes one batch of callbacks */ for (;;) { - /* If not polling, wait for next batch of callbacks. */ - if (!rcu_nocb_poll) { - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("Sleep")); - wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); - /* Memory barrier provide by xchg() below. */ - } else if (firsttime) { - firsttime = 0; - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("Poll")); - } - list = ACCESS_ONCE(rdp->nocb_head); - if (!list) { - if (!rcu_nocb_poll) - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WokeEmpty")); - schedule_timeout_interruptible(1); - flush_signals(current); - continue; - } - firsttime = 1; - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WokeNonEmpty")); - - /* - * Extract queued callbacks, update counts, and wait - * for a grace period to elapse. - */ - ACCESS_ONCE(rdp->nocb_head) = NULL; - tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); - c = atomic_long_xchg(&rdp->nocb_q_count, 0); - cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); - ACCESS_ONCE(rdp->nocb_p_count) += c; - ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; - rcu_nocb_wait_gp(rdp); + /* Wait for callbacks. */ + if (rdp->nocb_leader == rdp) + nocb_leader_wait(rdp); + else + nocb_follower_wait(rdp); + + /* Pull the ready-to-invoke callbacks onto local list. */ + list = ACCESS_ONCE(rdp->nocb_follower_head); + BUG_ON(!list); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); + ACCESS_ONCE(rdp->nocb_follower_head) = NULL; + tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head); + c = atomic_long_xchg(&rdp->nocb_follower_count, 0); + cl = atomic_long_xchg(&rdp->nocb_follower_count_lazy, 0); + rdp->nocb_p_count += c; + rdp->nocb_p_count_lazy += cl; /* Each pass through the following loop invokes a callback. */ trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); @@ -2305,7 +2447,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) if (!rcu_nocb_need_deferred_wakeup(rdp)) return; ACCESS_ONCE(rdp->nocb_defer_wakeup) = false; - wake_up(&rdp->nocb_wq); + wake_nocb_leader(rdp, false); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty")); } @@ -2314,19 +2456,57 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { rdp->nocb_tail = &rdp->nocb_head; init_waitqueue_head(&rdp->nocb_wq); + rdp->nocb_follower_tail = &rdp->nocb_follower_head; } -/* Create a kthread for each RCU flavor for each no-CBs CPU. */ +/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */ +static int rcu_nocb_leader_stride = -1; +module_param(rcu_nocb_leader_stride, int, 0444); + +/* + * Create a kthread for each RCU flavor for each no-CBs CPU. + * Also initialize leader-follower relationships. + */ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) { int cpu; + int ls = rcu_nocb_leader_stride; + int nl = 0; /* Next leader. */ struct rcu_data *rdp; + struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. 
*/ + struct rcu_data *rdp_prev = NULL; struct task_struct *t; if (rcu_nocb_mask == NULL) return; +#if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) + if (tick_nohz_full_running) + cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); +#endif /* #if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) */ + if (ls == -1) { + ls = int_sqrt(nr_cpu_ids); + rcu_nocb_leader_stride = ls; + } + + /* + * Each pass through this loop sets up one rcu_data structure and + * spawns one rcu_nocb_kthread(). + */ for_each_cpu(cpu, rcu_nocb_mask) { rdp = per_cpu_ptr(rsp->rda, cpu); + if (rdp->cpu >= nl) { + /* New leader, set up for followers & next leader. */ + nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; + rdp->nocb_leader = rdp; + rdp_leader = rdp; + } else { + /* Another follower, link to previous leader. */ + rdp->nocb_leader = rdp_leader; + rdp_prev->nocb_next_follower = rdp; + } + rdp_prev = rdp; + + /* Spawn the kthread for this CPU. */ t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%c/%d", rsp->abbr, cpu); BUG_ON(IS_ERR(t)); @@ -2843,12 +3023,16 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp) */ static void rcu_bind_gp_kthread(void) { -#ifdef CONFIG_NO_HZ_FULL - int cpu = ACCESS_ONCE(tick_do_timer_cpu); + int __maybe_unused cpu; - if (cpu < 0 || cpu >= nr_cpu_ids) + if (!tick_nohz_full_enabled()) return; - if (raw_smp_processor_id() != cpu) +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE + cpu = tick_do_timer_cpu; + if (cpu >= 0 && cpu < nr_cpu_ids && raw_smp_processor_id() != cpu) set_cpus_allowed_ptr(current, cpumask_of(cpu)); -#endif /* #ifdef CONFIG_NO_HZ_FULL */ +#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + if (!is_housekeeping_cpu(raw_smp_processor_id())) + housekeeping_affine(current); +#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index bc7883570530..4056d7992a6c 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -90,9 +90,6 @@ void __rcu_read_unlock(void) } else { barrier(); /* critical section before exit code. */ t->rcu_read_lock_nesting = INT_MIN; -#ifdef CONFIG_PROVE_RCU_DELAY - udelay(10); /* Make preemption more probable. */ -#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ barrier(); /* assign before ->rcu_read_unlock_special load */ if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) rcu_read_unlock_special(t); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bc1638b33449..126f7e3f04e7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8083,7 +8083,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, - .base_cftypes = cpu_files, + .legacy_cftypes = cpu_files, .early_init = 1, }; diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 9cf350c94ec4..dd7cbb55bbf2 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -278,6 +278,6 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) struct cgroup_subsys cpuacct_cgrp_subsys = { .css_alloc = cpuacct_css_alloc, .css_free = cpuacct_css_free, - .base_cftypes = files, + .legacy_cftypes = files, .early_init = 1, }; diff --git a/kernel/signal.c b/kernel/signal.c index a4077e90f19f..40b76e351e64 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1263,6 +1263,10 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, struct sighand_struct *sighand; for (;;) { + /* + * Disable interrupts early to avoid deadlocks. + * See rcu_read_unlock() comment header for details. 
+ */ local_irq_save(*flags); rcu_read_lock(); sighand = rcu_dereference(tsk->sighand); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index ad362c260ef4..9c94c19f1305 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -146,7 +146,8 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev) { /* Nothing to do if we already reached the limit */ if (dev->min_delta_ns >= MIN_DELTA_LIMIT) { - printk(KERN_WARNING "CE: Reprogramming failure. Giving up\n"); + printk_deferred(KERN_WARNING + "CE: Reprogramming failure. Giving up\n"); dev->next_event.tv64 = KTIME_MAX; return -ETIME; } @@ -159,9 +160,10 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev) if (dev->min_delta_ns > MIN_DELTA_LIMIT) dev->min_delta_ns = MIN_DELTA_LIMIT; - printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", - dev->name ? dev->name : "?", - (unsigned long long) dev->min_delta_ns); + printk_deferred(KERN_WARNING + "CE: %s increased min_delta_ns to %llu nsec\n", + dev->name ? dev->name : "?", + (unsigned long long) dev->min_delta_ns); return 0; } diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 445106d2c729..01d2d15aa662 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -191,7 +191,8 @@ void __init sched_clock_postinit(void) static int sched_clock_suspend(void) { - sched_clock_poll(&sched_clock_timer); + update_sched_clock(); + hrtimer_cancel(&sched_clock_timer); cd.suspended = true; return 0; } @@ -199,6 +200,7 @@ static int sched_clock_suspend(void) static void sched_clock_resume(void) { cd.epoch_cyc = read_sched_clock(); + hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); cd.suspended = false; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6558b7ac112d..f784d83e29f1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -154,6 +154,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; +cpumask_var_t housekeeping_mask; bool tick_nohz_full_running; static bool can_stop_full_tick(void) @@ -281,6 +282,7 @@ static int __init tick_nohz_full_setup(char *str) int cpu; alloc_bootmem_cpumask_var(&tick_nohz_full_mask); + alloc_bootmem_cpumask_var(&housekeeping_mask); if (cpulist_parse(str, tick_nohz_full_mask) < 0) { pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); return 1; @@ -291,6 +293,8 @@ static int __init tick_nohz_full_setup(char *str) pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); cpumask_clear_cpu(cpu, tick_nohz_full_mask); } + cpumask_andnot(housekeeping_mask, + cpu_possible_mask, tick_nohz_full_mask); tick_nohz_full_running = true; return 1; @@ -332,9 +336,15 @@ static int tick_nohz_init_all(void) pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); return err; } + if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { + pr_err("NO_HZ: Can't allocate not-full dynticks cpumask\n"); + return err; + } err = 0; cpumask_setall(tick_nohz_full_mask); cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask); + cpumask_clear(housekeeping_mask); + cpumask_set_cpu(smp_processor_id(), housekeeping_mask); tick_nohz_full_running = true; #endif return err; diff --git a/kernel/torture.c b/kernel/torture.c index 40bb511cca48..d600af21f022 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -708,7 +708,7 @@ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m, int ret = 0; 
VERBOSE_TOROUT_STRING(m); - *tp = kthread_run(fn, arg, s); + *tp = kthread_run(fn, arg, "%s", s); if (IS_ERR(*tp)) { ret = PTR_ERR(*tp); VERBOSE_TOROUT_ERRSTRING(f); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d4409356f40d..a5da09c899dd 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -29,11 +29,6 @@ config HAVE_FUNCTION_GRAPH_FP_TEST help See Documentation/trace/ftrace-design.txt -config HAVE_FUNCTION_TRACE_MCOUNT_TEST - bool - help - See Documentation/trace/ftrace-design.txt - config HAVE_DYNAMIC_FTRACE bool help diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 2611613f14f1..67d6369ddf83 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -28,6 +28,7 @@ obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_TRACING) += trace_output.o +obj-$(CONFIG_TRACING) += trace_seq.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ac9d1dad630b..1654b12c891a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -80,9 +80,6 @@ static struct ftrace_ops ftrace_list_end __read_mostly = { int ftrace_enabled __read_mostly; static int last_ftrace_enabled; -/* Quick disabling of function tracer. */ -int function_trace_stop __read_mostly; - /* Current function tracing op */ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; /* What to set function_trace_op to */ @@ -1042,6 +1039,8 @@ static struct pid * const ftrace_swapper_pid = &init_struct_pid; #ifdef CONFIG_DYNAMIC_FTRACE +static struct ftrace_ops *removed_ops; + #ifndef CONFIG_FTRACE_MCOUNT_RECORD # error Dynamic ftrace depends on MCOUNT_RECORD #endif @@ -1304,25 +1303,15 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, struct ftrace_hash *new_hash; int size = src->count; int bits = 0; - int ret; int i; /* - * Remove the current set, update the hash and add - * them back. - */ - ftrace_hash_rec_disable(ops, enable); - - /* * If the new source is empty, just free dst and assign it * the empty_hash. */ if (!src->count) { - free_ftrace_hash_rcu(*dst); - rcu_assign_pointer(*dst, EMPTY_HASH); - /* still need to update the function records */ - ret = 0; - goto out; + new_hash = EMPTY_HASH; + goto update; } /* @@ -1335,10 +1324,9 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, if (bits > FTRACE_HASH_MAX_BITS) bits = FTRACE_HASH_MAX_BITS; - ret = -ENOMEM; new_hash = alloc_ftrace_hash(bits); if (!new_hash) - goto out; + return -ENOMEM; size = 1 << src->size_bits; for (i = 0; i < size; i++) { @@ -1349,20 +1337,20 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, } } +update: + /* + * Remove the current set, update the hash and add + * them back. + */ + ftrace_hash_rec_disable(ops, enable); + old_hash = *dst; rcu_assign_pointer(*dst, new_hash); free_ftrace_hash_rcu(old_hash); - ret = 0; - out: - /* - * Enable regardless of ret: - * On success, we enable the new hash. - * On failure, we re-enable the original hash. 
- */ ftrace_hash_rec_enable(ops, enable); - return ret; + return 0; } /* @@ -1492,6 +1480,53 @@ int ftrace_text_reserved(const void *start, const void *end) return (int)!!ret; } +/* Test if ops registered to this rec needs regs */ +static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) +{ + struct ftrace_ops *ops; + bool keep_regs = false; + + for (ops = ftrace_ops_list; + ops != &ftrace_list_end; ops = ops->next) { + /* pass rec in as regs to have non-NULL val */ + if (ftrace_ops_test(ops, rec->ip, rec)) { + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { + keep_regs = true; + break; + } + } + } + + return keep_regs; +} + +static void ftrace_remove_tramp(struct ftrace_ops *ops, + struct dyn_ftrace *rec) +{ + struct ftrace_func_entry *entry; + + entry = ftrace_lookup_ip(ops->tramp_hash, rec->ip); + if (!entry) + return; + + /* + * The tramp_hash entry will be removed at time + * of update. + */ + ops->nr_trampolines--; + rec->flags &= ~FTRACE_FL_TRAMP; +} + +static void ftrace_clear_tramps(struct dyn_ftrace *rec) +{ + struct ftrace_ops *op; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op->nr_trampolines) + ftrace_remove_tramp(op, rec); + } while_for_each_ftrace_op(op); +} + static void __ftrace_hash_rec_update(struct ftrace_ops *ops, int filter_hash, bool inc) @@ -1572,8 +1607,30 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (inc) { rec->flags++; - if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) + if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX)) return; + + /* + * If there's only a single callback registered to a + * function, and the ops has a trampoline registered + * for it, then we can call it directly. + */ + if (ftrace_rec_count(rec) == 1 && ops->trampoline) { + rec->flags |= FTRACE_FL_TRAMP; + ops->nr_trampolines++; + } else { + /* + * If we are adding another function callback + * to this function, and the previous had a + * trampoline used, then we need to go back to + * the default trampoline. + */ + rec->flags &= ~FTRACE_FL_TRAMP; + + /* remove trampolines from any ops for this rec */ + ftrace_clear_tramps(rec); + } + /* * If any ops wants regs saved for this function * then all ops will get saved regs. @@ -1581,9 +1638,30 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) rec->flags |= FTRACE_FL_REGS; } else { - if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) + if (FTRACE_WARN_ON(ftrace_rec_count(rec) == 0)) return; rec->flags--; + + if (ops->trampoline && !ftrace_rec_count(rec)) + ftrace_remove_tramp(ops, rec); + + /* + * If the rec had REGS enabled and the ops that is + * being removed had REGS set, then see if there is + * still any ops for this record that wants regs. + * If not, we can stop recording them. + */ + if (ftrace_rec_count(rec) > 0 && + rec->flags & FTRACE_FL_REGS && + ops->flags & FTRACE_OPS_FL_SAVE_REGS) { + if (!test_rec_ops_needs_regs(rec)) + rec->flags &= ~FTRACE_FL_REGS; + } + + /* + * flags will be cleared in ftrace_check_record() + * if rec count is zero. + */ } count++; /* Shortcut, if we handled all records, we are done. */ @@ -1668,17 +1746,23 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) * If we are disabling calls, then disable all records that * are enabled. */ - if (enable && (rec->flags & ~FTRACE_FL_MASK)) + if (enable && ftrace_rec_count(rec)) flag = FTRACE_FL_ENABLED; /* - * If enabling and the REGS flag does not match the REGS_EN, then - * do not ignore this record. 
Set flags to fail the compare against - * ENABLED. + * If enabling and the REGS flag does not match the REGS_EN, or + * the TRAMP flag doesn't match the TRAMP_EN, then do not ignore + * this record. Set flags to fail the compare against ENABLED. */ - if (flag && - (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN))) - flag |= FTRACE_FL_REGS; + if (flag) { + if (!(rec->flags & FTRACE_FL_REGS) != + !(rec->flags & FTRACE_FL_REGS_EN)) + flag |= FTRACE_FL_REGS; + + if (!(rec->flags & FTRACE_FL_TRAMP) != + !(rec->flags & FTRACE_FL_TRAMP_EN)) + flag |= FTRACE_FL_TRAMP; + } /* If the state of this record hasn't changed, then do nothing */ if ((rec->flags & FTRACE_FL_ENABLED) == flag) @@ -1696,6 +1780,12 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) else rec->flags &= ~FTRACE_FL_REGS_EN; } + if (flag & FTRACE_FL_TRAMP) { + if (rec->flags & FTRACE_FL_TRAMP) + rec->flags |= FTRACE_FL_TRAMP_EN; + else + rec->flags &= ~FTRACE_FL_TRAMP_EN; + } } /* @@ -1704,7 +1794,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) * Otherwise, * return UPDATE_MODIFY_CALL to tell the caller to convert * from the save regs, to a non-save regs function or - * vice versa. + * vice versa, or from a trampoline call. */ if (flag & FTRACE_FL_ENABLED) return FTRACE_UPDATE_MAKE_CALL; @@ -1714,7 +1804,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) if (update) { /* If there's no more users, clear all flags */ - if (!(rec->flags & ~FTRACE_FL_MASK)) + if (!ftrace_rec_count(rec)) rec->flags = 0; else /* Just disable the record (keep REGS state) */ @@ -1751,6 +1841,43 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable) return ftrace_check_record(rec, enable, 0); } +static struct ftrace_ops * +ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) +{ + struct ftrace_ops *op; + + /* Removed ops need to be tested first */ + if (removed_ops && removed_ops->tramp_hash) { + if (ftrace_lookup_ip(removed_ops->tramp_hash, rec->ip)) + return removed_ops; + } + + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (!op->tramp_hash) + continue; + + if (ftrace_lookup_ip(op->tramp_hash, rec->ip)) + return op; + + } while_for_each_ftrace_op(op); + + return NULL; +} + +static struct ftrace_ops * +ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) +{ + struct ftrace_ops *op; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + /* pass rec in as regs to have non-NULL val */ + if (ftrace_ops_test(op, rec->ip, rec)) + return op; + } while_for_each_ftrace_op(op); + + return NULL; +} + /** * ftrace_get_addr_new - Get the call address to set to * @rec: The ftrace record descriptor @@ -1763,6 +1890,20 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable) */ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) { + struct ftrace_ops *ops; + + /* Trampolines take precedence over regs */ + if (rec->flags & FTRACE_FL_TRAMP) { + ops = ftrace_find_tramp_ops_new(rec); + if (FTRACE_WARN_ON(!ops || !ops->trampoline)) { + pr_warning("Bad trampoline accounting at: %p (%pS)\n", + (void *)rec->ip, (void *)rec->ip); + /* Ftrace is shutting down, return anything */ + return (unsigned long)FTRACE_ADDR; + } + return ops->trampoline; + } + if (rec->flags & FTRACE_FL_REGS) return (unsigned long)FTRACE_REGS_ADDR; else @@ -1781,6 +1922,20 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) */ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) { + struct ftrace_ops *ops; + + /* Trampolines take precedence over regs */ + if 
(rec->flags & FTRACE_FL_TRAMP_EN) { + ops = ftrace_find_tramp_ops_curr(rec); + if (FTRACE_WARN_ON(!ops)) { + pr_warning("Bad trampoline accounting at: %p (%pS)\n", + (void *)rec->ip, (void *)rec->ip); + /* Ftrace is shutting down, return anything */ + return (unsigned long)FTRACE_ADDR; + } + return ops->trampoline; + } + if (rec->flags & FTRACE_FL_REGS_EN) return (unsigned long)FTRACE_REGS_ADDR; else @@ -2023,6 +2178,89 @@ void __weak arch_ftrace_update_code(int command) ftrace_run_stop_machine(command); } +static int ftrace_save_ops_tramp_hash(struct ftrace_ops *ops) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec; + int size, bits; + int ret; + + size = ops->nr_trampolines; + bits = 0; + /* + * Make the hash size about 1/2 the # found + */ + for (size /= 2; size; size >>= 1) + bits++; + + ops->tramp_hash = alloc_ftrace_hash(bits); + /* + * TODO: a failed allocation is going to screw up + * the accounting of what needs to be modified + * and not. For now, we kill ftrace if we fail + * to allocate here. But there are ways around this, + * but that will take a little more work. + */ + if (!ops->tramp_hash) + return -ENOMEM; + + do_for_each_ftrace_rec(pg, rec) { + if (ftrace_rec_count(rec) == 1 && + ftrace_ops_test(ops, rec->ip, rec)) { + + /* + * If another ops adds to a rec, the rec will + * lose its trampoline and never get it back + * until all ops are off of it. + */ + if (!(rec->flags & FTRACE_FL_TRAMP)) + continue; + + /* This record had better have a trampoline */ + if (FTRACE_WARN_ON(!(rec->flags & FTRACE_FL_TRAMP_EN))) + return -1; + + ret = add_hash_entry(ops->tramp_hash, rec->ip); + if (ret < 0) + return ret; + } + } while_for_each_ftrace_rec(); + + /* The number of recs in the hash must match nr_trampolines */ + FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines); + + return 0; +} + +static int ftrace_save_tramp_hashes(void) +{ + struct ftrace_ops *op; + int ret; + + /* + * Now that any trampoline is being used, we need to save the + * hashes for the ops that have them. This allows the mapping + * back from the record to the ops that has the trampoline to + * know what code is being replaced. Modifying code must always + * verify what it is changing. + */ + do_for_each_ftrace_op(op, ftrace_ops_list) { + + /* The tramp_hash is recreated each time. */ + free_ftrace_hash(op->tramp_hash); + op->tramp_hash = NULL; + + if (op->nr_trampolines) { + ret = ftrace_save_ops_tramp_hash(op); + if (ret) + return ret; + } + + } while_for_each_ftrace_op(op); + + return 0; +} + static void ftrace_run_update_code(int command) { int ret; @@ -2031,11 +2269,6 @@ static void ftrace_run_update_code(int command) FTRACE_WARN_ON(ret); if (ret) return; - /* - * Do not call function tracer while we update the code. - * We are in stop machine. - */ - function_trace_stop++; /* * By default we use stop_machine() to modify the code. 
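For readers of the ftrace.c hunks above and below: dyn_ftrace::flags does double duty, with the top few bits carrying per-record state (ENABLED, REGS, TRAMP and their *_EN shadows) and everything below them counting how many ftrace_ops reference the record. That is why the code can simply do rec->flags++ / rec->flags-- and compare ftrace_rec_count(rec) against FTRACE_REF_MAX. What follows is a minimal sketch of that encoding, not part of the patch; the bit positions and names are illustrative only, and the real FTRACE_FL_* constants and ftrace_rec_count() are defined in include/linux/ftrace.h.

/*
 * Simplified model of the record flag word: state bits on top,
 * reference count underneath.  Bit positions are illustrative.
 */
#define REC_FL_ENABLED  (1UL << 31)     /* record is currently patched in */
#define REC_FL_REGS     (1UL << 30)     /* some ops wants pt_regs saved */
#define REC_FL_TRAMP    (1UL << 29)     /* record may call an ops trampoline */
#define REC_FL_STATE    (REC_FL_ENABLED | REC_FL_REGS | REC_FL_TRAMP)
#define REC_REF_MAX     ((1UL << 29) - 1)       /* all bits below the state bits */

struct rec {
        unsigned long flags;
};

/* Analogue of ftrace_rec_count(): mask off the state bits, keep the count. */
static inline unsigned long rec_count(struct rec *r)
{
        return r->flags & ~REC_FL_STATE;
}

static void rec_get(struct rec *r)
{
        if (rec_count(r) < REC_REF_MAX)
                r->flags++;             /* the count occupies the low bits */
}

static void rec_put(struct rec *r)
{
        if (rec_count(r))
                r->flags--;
        if (!rec_count(r))
                r->flags = 0;           /* last user gone: clear state too */
}

Seen through that model, the new FTRACE_FL_TRAMP / FTRACE_FL_TRAMP_EN handling in this patch is just two more state bits layered on the same counter.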
@@ -2045,15 +2278,15 @@ static void ftrace_run_update_code(int command) */ arch_ftrace_update_code(command); - function_trace_stop--; - ret = ftrace_arch_code_modify_post_process(); FTRACE_WARN_ON(ret); + + ret = ftrace_save_tramp_hashes(); + FTRACE_WARN_ON(ret); } static ftrace_func_t saved_ftrace_func; static int ftrace_start_up; -static int global_start_up; static void control_ops_free(struct ftrace_ops *ops) { @@ -2117,8 +2350,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) ftrace_hash_rec_disable(ops, 1); - if (!global_start_up) - ops->flags &= ~FTRACE_OPS_FL_ENABLED; + ops->flags &= ~FTRACE_OPS_FL_ENABLED; command |= FTRACE_UPDATE_CALLS; @@ -2139,8 +2371,16 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) return 0; } + /* + * If the ops uses a trampoline, then it needs to be + * tested first on update. + */ + removed_ops = ops; + ftrace_run_update_code(command); + removed_ops = NULL; + /* * Dynamic ops may be freed, we must make sure that all * callers are done before leaving this function. @@ -2398,7 +2638,8 @@ ftrace_allocate_pages(unsigned long num_to_init) return start_pg; free_pages: - while (start_pg) { + pg = start_pg; + while (pg) { order = get_count_order(pg->size / ENTRIES_PER_PAGE); free_pages((unsigned long)pg->records, order); start_pg = pg->next; @@ -2595,8 +2836,10 @@ static void *t_start(struct seq_file *m, loff_t *pos) * off, we can short cut and just print out that all * functions are enabled. */ - if (iter->flags & FTRACE_ITER_FILTER && - ftrace_hash_empty(ops->filter_hash)) { + if ((iter->flags & FTRACE_ITER_FILTER && + ftrace_hash_empty(ops->filter_hash)) || + (iter->flags & FTRACE_ITER_NOTRACE && + ftrace_hash_empty(ops->notrace_hash))) { if (*pos > 0) return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; @@ -2641,7 +2884,10 @@ static int t_show(struct seq_file *m, void *v) return t_hash_show(m, iter); if (iter->flags & FTRACE_ITER_PRINTALL) { - seq_printf(m, "#### all functions enabled ####\n"); + if (iter->flags & FTRACE_ITER_NOTRACE) + seq_printf(m, "#### no functions disabled ####\n"); + else + seq_printf(m, "#### all functions enabled ####\n"); return 0; } @@ -2651,10 +2897,22 @@ static int t_show(struct seq_file *m, void *v) return 0; seq_printf(m, "%ps", (void *)rec->ip); - if (iter->flags & FTRACE_ITER_ENABLED) + if (iter->flags & FTRACE_ITER_ENABLED) { seq_printf(m, " (%ld)%s", - rec->flags & ~FTRACE_FL_MASK, - rec->flags & FTRACE_FL_REGS ? " R" : ""); + ftrace_rec_count(rec), + rec->flags & FTRACE_FL_REGS ? " R" : " "); + if (rec->flags & FTRACE_FL_TRAMP_EN) { + struct ftrace_ops *ops; + + ops = ftrace_find_tramp_ops_curr(rec); + if (ops && ops->trampoline) + seq_printf(m, "\ttramp: %pS", + (void *)ops->trampoline); + else + seq_printf(m, "\ttramp: ERROR!"); + } + } + seq_printf(m, "\n"); return 0; @@ -2702,13 +2960,6 @@ ftrace_enabled_open(struct inode *inode, struct file *file) return iter ? 
0 : -ENOMEM; } -static void ftrace_filter_reset(struct ftrace_hash *hash) -{ - mutex_lock(&ftrace_lock); - ftrace_hash_clear(hash); - mutex_unlock(&ftrace_lock); -} - /** * ftrace_regex_open - initialize function tracer filter files * @ops: The ftrace_ops that hold the hash filters @@ -2758,7 +3009,13 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, hash = ops->filter_hash; if (file->f_mode & FMODE_WRITE) { - iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash); + const int size_bits = FTRACE_HASH_DEFAULT_BITS; + + if (file->f_flags & O_TRUNC) + iter->hash = alloc_ftrace_hash(size_bits); + else + iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash); + if (!iter->hash) { trace_parser_put(&iter->parser); kfree(iter); @@ -2767,10 +3024,6 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, } } - if ((file->f_mode & FMODE_WRITE) && - (file->f_flags & O_TRUNC)) - ftrace_filter_reset(iter->hash); - if (file->f_mode & FMODE_READ) { iter->pg = ftrace_pages_start; @@ -3471,14 +3724,16 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, else orig_hash = &ops->notrace_hash; - hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); + if (reset) + hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); + else + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); + if (!hash) { ret = -ENOMEM; goto out_regex_unlock; } - if (reset) - ftrace_filter_reset(hash); if (buf && !ftrace_match_records(hash, buf, len)) { ret = -EINVAL; goto out_regex_unlock; @@ -3630,6 +3885,7 @@ __setup("ftrace_filter=", set_ftrace_filter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; +static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); static int __init set_graph_function(char *str) @@ -3639,16 +3895,29 @@ static int __init set_graph_function(char *str) } __setup("ftrace_graph_filter=", set_graph_function); -static void __init set_ftrace_early_graph(char *buf) +static int __init set_graph_notrace_function(char *str) +{ + strlcpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_graph_notrace=", set_graph_notrace_function); + +static void __init set_ftrace_early_graph(char *buf, int enable) { int ret; char *func; + unsigned long *table = ftrace_graph_funcs; + int *count = &ftrace_graph_count; + + if (!enable) { + table = ftrace_graph_notrace_funcs; + count = &ftrace_graph_notrace_count; + } while (buf) { func = strsep(&buf, ","); /* we allow only one expression at a time */ - ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, - FTRACE_GRAPH_MAX_FUNCS, func); + ret = ftrace_set_func(table, count, FTRACE_GRAPH_MAX_FUNCS, func); if (ret) printk(KERN_DEBUG "ftrace: function %s not " "traceable\n", func); @@ -3677,7 +3946,9 @@ static void __init set_ftrace_early_filters(void) ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0); #ifdef CONFIG_FUNCTION_GRAPH_TRACER if (ftrace_graph_buf[0]) - set_ftrace_early_graph(ftrace_graph_buf); + set_ftrace_early_graph(ftrace_graph_buf, 1); + if (ftrace_graph_notrace_buf[0]) + set_ftrace_early_graph(ftrace_graph_notrace_buf, 0); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ } @@ -3819,7 +4090,12 @@ static int g_show(struct seq_file *m, void *v) return 0; if (ptr == (unsigned long *)1) { - seq_printf(m, "#### all functions enabled ####\n"); + struct ftrace_graph_data *fgd = m->private; + + if (fgd->table == 
ftrace_graph_funcs) + seq_printf(m, "#### all functions enabled ####\n"); + else + seq_printf(m, "#### no functions disabled ####\n"); return 0; } @@ -4447,9 +4723,6 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op; int bit; - if (function_trace_stop) - return; - bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); if (bit < 0) return; @@ -4461,9 +4734,8 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, preempt_disable_notrace(); do_for_each_ftrace_op(op, ftrace_ops_list) { if (ftrace_ops_test(op, ip, regs)) { - if (WARN_ON(!op->func)) { - function_trace_stop = 1; - printk("op=%p %pS\n", op, op); + if (FTRACE_WARN_ON(!op->func)) { + pr_warn("op=%p %pS\n", op, op); goto out; } op->func(ip, parent_ip, op, regs); @@ -5084,6 +5356,12 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, /* Function graph doesn't use the .func field of global_ops */ global_ops.flags |= FTRACE_OPS_FL_STUB; +#ifdef CONFIG_DYNAMIC_FTRACE + /* Optimize function graph calling (if implemented by arch) */ + if (FTRACE_GRAPH_TRAMP_ADDR != 0) + global_ops.trampoline = FTRACE_GRAPH_TRAMP_ADDR; +#endif + ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); out: @@ -5104,6 +5382,10 @@ void unregister_ftrace_graph(void) __ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); global_ops.flags &= ~FTRACE_OPS_FL_STUB; +#ifdef CONFIG_DYNAMIC_FTRACE + if (FTRACE_GRAPH_TRAMP_ADDR != 0) + global_ops.trampoline = 0; +#endif unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); @@ -5183,9 +5465,4 @@ void ftrace_graph_exit_task(struct task_struct *t) kfree(ret_stack); } - -void ftrace_graph_stop(void) -{ - ftrace_stop(); -} #endif diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ff7027199a9a..925f629658d6 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1689,22 +1689,14 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, if (!cpu_buffer->nr_pages_to_update) continue; - /* The update must run on the CPU that is being updated. */ - preempt_disable(); - if (cpu == smp_processor_id() || !cpu_online(cpu)) { + /* Can't run something on an offline CPU. */ + if (!cpu_online(cpu)) { rb_update_pages(cpu_buffer); cpu_buffer->nr_pages_to_update = 0; } else { - /* - * Can not disable preemption for schedule_work_on() - * on PREEMPT_RT. - */ - preempt_enable(); schedule_work_on(cpu, &cpu_buffer->update_pages_work); - preempt_disable(); } - preempt_enable(); } /* wait for all the updates to complete */ @@ -1742,22 +1734,14 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, get_online_cpus(); - preempt_disable(); - /* The update must run on the CPU that is being updated. */ - if (cpu_id == smp_processor_id() || !cpu_online(cpu_id)) + /* Can't run something on an offline CPU. */ + if (!cpu_online(cpu_id)) rb_update_pages(cpu_buffer); else { - /* - * Can not disable preemption for schedule_work_on() - * on PREEMPT_RT. 
- */ - preempt_enable(); schedule_work_on(cpu_id, &cpu_buffer->update_pages_work); wait_for_completion(&cpu_buffer->update_done); - preempt_disable(); } - preempt_enable(); cpu_buffer->nr_pages_to_update = 0; put_online_cpus(); @@ -3775,7 +3759,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) if (rb_per_cpu_empty(cpu_buffer)) return NULL; - if (iter->head >= local_read(&iter->head_page->page->commit)) { + if (iter->head >= rb_page_size(iter->head_page)) { rb_inc_iter(iter); goto again; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 291397e66669..8bb80fe08767 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -937,30 +937,6 @@ out: return ret; } -ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) -{ - int len; - int ret; - - if (!cnt) - return 0; - - if (s->len <= s->readpos) - return -EBUSY; - - len = s->len - s->readpos; - if (cnt > len) - cnt = len; - ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); - if (ret == cnt) - return -EFAULT; - - cnt -= ret; - - s->readpos += cnt; - return cnt; -} - static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) { int len; @@ -3699,6 +3675,7 @@ static const char readme_msg[] = #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER " set_graph_function\t- Trace the nested calls of a function (function_graph)\n" + " set_graph_notrace\t- Do not trace the nested calls of a function (function_graph)\n" " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n" #endif #ifdef CONFIG_TRACER_SNAPSHOT @@ -4238,10 +4215,9 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, } static ssize_t -tracing_max_lat_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) +tracing_nsecs_read(unsigned long *ptr, char __user *ubuf, + size_t cnt, loff_t *ppos) { - unsigned long *ptr = filp->private_data; char buf[64]; int r; @@ -4253,10 +4229,9 @@ tracing_max_lat_read(struct file *filp, char __user *ubuf, } static ssize_t -tracing_max_lat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf, + size_t cnt, loff_t *ppos) { - unsigned long *ptr = filp->private_data; unsigned long val; int ret; @@ -4269,6 +4244,52 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, return cnt; } +static ssize_t +tracing_thresh_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return tracing_nsecs_read(&tracing_thresh, ubuf, cnt, ppos); +} + +static ssize_t +tracing_thresh_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + int ret; + + mutex_lock(&trace_types_lock); + ret = tracing_nsecs_write(&tracing_thresh, ubuf, cnt, ppos); + if (ret < 0) + goto out; + + if (tr->current_trace->update_thresh) { + ret = tr->current_trace->update_thresh(tr); + if (ret < 0) + goto out; + } + + ret = cnt; +out: + mutex_unlock(&trace_types_lock); + + return ret; +} + +static ssize_t +tracing_max_lat_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return tracing_nsecs_read(filp->private_data, ubuf, cnt, ppos); +} + +static ssize_t +tracing_max_lat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return tracing_nsecs_write(filp->private_data, ubuf, cnt, ppos); +} + static int tracing_open_pipe(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; @@ -5170,6 +5191,13 @@ static int 
snapshot_raw_open(struct inode *inode, struct file *filp) #endif /* CONFIG_TRACER_SNAPSHOT */ +static const struct file_operations tracing_thresh_fops = { + .open = tracing_open_generic, + .read = tracing_thresh_read, + .write = tracing_thresh_write, + .llseek = generic_file_llseek, +}; + static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, @@ -6107,10 +6135,8 @@ destroy_trace_option_files(struct trace_option_dentry *topts) if (!topts) return; - for (cnt = 0; topts[cnt].opt; cnt++) { - if (topts[cnt].entry) - debugfs_remove(topts[cnt].entry); - } + for (cnt = 0; topts[cnt].opt; cnt++) + debugfs_remove(topts[cnt].entry); kfree(topts); } @@ -6533,7 +6559,7 @@ static __init int tracer_init_debugfs(void) init_tracer_debugfs(&global_trace, d_tracer); trace_create_file("tracing_thresh", 0644, d_tracer, - &tracing_thresh, &tracing_max_lat_fops); + &global_trace, &tracing_thresh_fops); trace_create_file("README", 0444, d_tracer, NULL, &tracing_readme_fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9258f5a815db..385391fb1d3b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -339,6 +339,7 @@ struct tracer_flags { * @reset: called when one switches to another tracer * @start: called when tracing is unpaused (echo 1 > tracing_enabled) * @stop: called when tracing is paused (echo 0 > tracing_enabled) + * @update_thresh: called when tracing_thresh is updated * @open: called when the trace file is opened * @pipe_open: called when the trace_pipe file is opened * @close: called when the trace file is released @@ -357,6 +358,7 @@ struct tracer { void (*reset)(struct trace_array *tr); void (*start)(struct trace_array *tr); void (*stop)(struct trace_array *tr); + int (*update_thresh)(struct trace_array *tr); void (*open)(struct trace_iterator *iter); void (*pipe_open)(struct trace_iterator *iter); void (*close)(struct trace_iterator *iter); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 2de53628689f..ef06ce7e9cf8 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -8,6 +8,8 @@ * */ +#define pr_fmt(fmt) fmt + #include <linux/workqueue.h> #include <linux/spinlock.h> #include <linux/kthread.h> @@ -1491,7 +1493,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, dir->entry = debugfs_create_dir(name, parent); if (!dir->entry) { - pr_warning("Failed to create system directory %s\n", name); + pr_warn("Failed to create system directory %s\n", name); __put_system(system); goto out_free; } @@ -1507,7 +1509,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, if (!entry) { kfree(system->filter); system->filter = NULL; - pr_warning("Could not create debugfs '%s/filter' entry\n", name); + pr_warn("Could not create debugfs '%s/filter' entry\n", name); } trace_create_file("enable", 0644, dir->entry, dir, @@ -1522,8 +1524,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, out_fail: /* Only print this message if failed on memory allocation */ if (!dir || !system) - pr_warning("No memory to create event subsystem %s\n", - name); + pr_warn("No memory to create event subsystem %s\n", name); return NULL; } @@ -1551,8 +1552,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) name = ftrace_event_name(call); file->dir = debugfs_create_dir(name, d_events); if (!file->dir) { - pr_warning("Could not create debugfs '%s' directory\n", - name); + pr_warn("Could not create debugfs '%s' directory\n", name); return 
-1; } @@ -1575,8 +1575,8 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) if (list_empty(head)) { ret = call->class->define_fields(call); if (ret < 0) { - pr_warning("Could not initialize trace point" - " events/%s\n", name); + pr_warn("Could not initialize trace point events/%s\n", + name); return -1; } } @@ -1621,7 +1621,6 @@ static void event_remove(struct ftrace_event_call *call) if (file->event_call != call) continue; ftrace_event_enable_disable(file, 0); - destroy_preds(file); /* * The do_for_each_event_file() is * a double loop. After finding the call for this @@ -1649,8 +1648,7 @@ static int event_init(struct ftrace_event_call *call) if (call->class->raw_init) { ret = call->class->raw_init(call); if (ret < 0 && ret != -ENOSYS) - pr_warn("Could not initialize trace events/%s\n", - name); + pr_warn("Could not initialize trace events/%s\n", name); } return ret; @@ -1749,7 +1747,8 @@ static void __trace_remove_event_call(struct ftrace_event_call *call) { event_remove(call); trace_destroy_fields(call); - destroy_call_preds(call); + free_event_filter(call->filter); + call->filter = NULL; } static int probe_remove_event_call(struct ftrace_event_call *call) @@ -1895,8 +1894,8 @@ __trace_add_event_dirs(struct trace_array *tr) list_for_each_entry(call, &ftrace_events, list) { ret = __trace_add_new_event(call, tr); if (ret < 0) - pr_warning("Could not create directory for event %s\n", - ftrace_event_name(call)); + pr_warn("Could not create directory for event %s\n", + ftrace_event_name(call)); } } @@ -2208,8 +2207,8 @@ __trace_early_add_event_dirs(struct trace_array *tr) list_for_each_entry(file, &tr->events, list) { ret = event_create_dir(tr->event_dir, file); if (ret < 0) - pr_warning("Could not create directory for event %s\n", - ftrace_event_name(file->event_call)); + pr_warn("Could not create directory for event %s\n", + ftrace_event_name(file->event_call)); } } @@ -2232,8 +2231,8 @@ __trace_early_add_events(struct trace_array *tr) ret = __trace_early_add_new_event(call, tr); if (ret < 0) - pr_warning("Could not create early event %s\n", - ftrace_event_name(call)); + pr_warn("Could not create early event %s\n", + ftrace_event_name(call)); } } @@ -2280,13 +2279,13 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) entry = debugfs_create_file("set_event", 0644, parent, tr, &ftrace_set_event_fops); if (!entry) { - pr_warning("Could not create debugfs 'set_event' entry\n"); + pr_warn("Could not create debugfs 'set_event' entry\n"); return -ENOMEM; } d_events = debugfs_create_dir("events", parent); if (!d_events) { - pr_warning("Could not create debugfs 'events' directory\n"); + pr_warn("Could not create debugfs 'events' directory\n"); return -ENOMEM; } @@ -2462,11 +2461,10 @@ static __init int event_trace_init(void) entry = debugfs_create_file("available_events", 0444, d_tracer, tr, &ftrace_avail_fops); if (!entry) - pr_warning("Could not create debugfs " - "'available_events' entry\n"); + pr_warn("Could not create debugfs 'available_events' entry\n"); if (trace_define_common_fields()) - pr_warning("tracing: Failed to allocate common fields"); + pr_warn("tracing: Failed to allocate common fields"); ret = early_event_add_tracer(d_tracer, tr); if (ret) @@ -2475,7 +2473,7 @@ static __init int event_trace_init(void) #ifdef CONFIG_MODULES ret = register_module_notifier(&trace_module_nb); if (ret) - pr_warning("Failed to register trace events module notifier\n"); + pr_warn("Failed to register trace events module notifier\n"); #endif return 0; } 
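The pr_warning() to pr_warn() conversions in this file are purely cosmetic: both are thin printk() wrappers that run their format string through pr_fmt(), and the "#define pr_fmt(fmt) fmt" added at the top of trace_events.c matches printk.h's default, so the messages stay unprefixed and byte-for-byte identical. The sketch below is illustrative only and not part of this patch; the file, function, and prefix string are made up to show how the macro pair composes.

/*
 * Hypothetical driver file, for illustration of pr_fmt()/pr_warn() only.
 * pr_fmt() must be defined before the printk headers are pulled in.
 */
#define pr_fmt(fmt) "widget: " fmt      /* made-up prefix for the example */

#include <linux/kernel.h>
#include <linux/printk.h>

static int widget_check(int err)
{
        if (err)
                /* Expands to printk(KERN_WARNING "widget: " "probe failed: %d\n", err) */
                pr_warn("probe failed: %d\n", err);
        return err;
}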
@@ -2579,7 +2577,7 @@ static __init void event_trace_self_tests(void) * it and the self test should not be on. */ if (file->flags & FTRACE_EVENT_FL_ENABLED) { - pr_warning("Enabled event during self test!\n"); + pr_warn("Enabled event during self test!\n"); WARN_ON_ONCE(1); continue; } @@ -2607,8 +2605,8 @@ static __init void event_trace_self_tests(void) ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1); if (WARN_ON_ONCE(ret)) { - pr_warning("error enabling system %s\n", - system->name); + pr_warn("error enabling system %s\n", + system->name); continue; } @@ -2616,8 +2614,8 @@ static __init void event_trace_self_tests(void) ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0); if (WARN_ON_ONCE(ret)) { - pr_warning("error disabling system %s\n", - system->name); + pr_warn("error disabling system %s\n", + system->name); continue; } @@ -2631,7 +2629,7 @@ static __init void event_trace_self_tests(void) ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1); if (WARN_ON_ONCE(ret)) { - pr_warning("error enabling all events\n"); + pr_warn("error enabling all events\n"); return; } @@ -2640,7 +2638,7 @@ static __init void event_trace_self_tests(void) /* reset sysname */ ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); if (WARN_ON_ONCE(ret)) { - pr_warning("error disabling all events\n"); + pr_warn("error disabling all events\n"); return; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 8a8631926a07..7a8c1528e141 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -774,17 +774,12 @@ static void __free_preds(struct event_filter *filter) filter->n_preds = 0; } -static void call_filter_disable(struct ftrace_event_call *call) -{ - call->flags &= ~TRACE_EVENT_FL_FILTERED; -} - static void filter_disable(struct ftrace_event_file *file) { struct ftrace_event_call *call = file->event_call; if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - call_filter_disable(call); + call->flags &= ~TRACE_EVENT_FL_FILTERED; else file->flags &= ~FTRACE_EVENT_FL_FILTERED; } @@ -804,32 +799,6 @@ void free_event_filter(struct event_filter *filter) __free_filter(filter); } -void destroy_call_preds(struct ftrace_event_call *call) -{ - __free_filter(call->filter); - call->filter = NULL; -} - -static void destroy_file_preds(struct ftrace_event_file *file) -{ - __free_filter(file->filter); - file->filter = NULL; -} - -/* - * Called when destroying the ftrace_event_file. - * The file is being freed, so we do not need to worry about - * the file being currently used. This is for module code removing - * the tracepoints from within it. 
- */ -void destroy_preds(struct ftrace_event_file *file) -{ - if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - destroy_call_preds(file->event_call); - else - destroy_file_preds(file); -} - static struct event_filter *__alloc_filter(void) { struct event_filter *filter; @@ -873,17 +842,14 @@ static inline void __remove_filter(struct ftrace_event_file *file) remove_filter_string(file->filter); } -static void filter_free_subsystem_preds(struct event_subsystem *system, +static void filter_free_subsystem_preds(struct ftrace_subsystem_dir *dir, struct trace_array *tr) { struct ftrace_event_file *file; - struct ftrace_event_call *call; list_for_each_entry(file, &tr->events, list) { - call = file->event_call; - if (strcmp(call->class->system, system->name) != 0) + if (file->system != dir) continue; - __remove_filter(file); } } @@ -901,15 +867,13 @@ static inline void __free_subsystem_filter(struct ftrace_event_file *file) } } -static void filter_free_subsystem_filters(struct event_subsystem *system, +static void filter_free_subsystem_filters(struct ftrace_subsystem_dir *dir, struct trace_array *tr) { struct ftrace_event_file *file; - struct ftrace_event_call *call; list_for_each_entry(file, &tr->events, list) { - call = file->event_call; - if (strcmp(call->class->system, system->name) != 0) + if (file->system != dir) continue; __free_subsystem_filter(file); } @@ -1582,7 +1546,6 @@ static int fold_pred_tree(struct event_filter *filter, static int replace_preds(struct ftrace_event_call *call, struct event_filter *filter, struct filter_parse_state *ps, - char *filter_string, bool dry_run) { char *operand1 = NULL, *operand2 = NULL; @@ -1755,13 +1718,12 @@ struct filter_list { struct event_filter *filter; }; -static int replace_system_preds(struct event_subsystem *system, +static int replace_system_preds(struct ftrace_subsystem_dir *dir, struct trace_array *tr, struct filter_parse_state *ps, char *filter_string) { struct ftrace_event_file *file; - struct ftrace_event_call *call; struct filter_list *filter_item; struct filter_list *tmp; LIST_HEAD(filter_list); @@ -1769,15 +1731,14 @@ static int replace_system_preds(struct event_subsystem *system, int err; list_for_each_entry(file, &tr->events, list) { - call = file->event_call; - if (strcmp(call->class->system, system->name) != 0) + if (file->system != dir) continue; /* * Try to see if the filter can be applied * (filter arg is ignored on dry_run) */ - err = replace_preds(call, NULL, ps, filter_string, true); + err = replace_preds(file->event_call, NULL, ps, true); if (err) event_set_no_set_filter_flag(file); else @@ -1787,9 +1748,7 @@ static int replace_system_preds(struct event_subsystem *system, list_for_each_entry(file, &tr->events, list) { struct event_filter *filter; - call = file->event_call; - - if (strcmp(call->class->system, system->name) != 0) + if (file->system != dir) continue; if (event_no_set_filter_flag(file)) @@ -1811,7 +1770,7 @@ static int replace_system_preds(struct event_subsystem *system, if (err) goto fail_mem; - err = replace_preds(call, filter, ps, filter_string, false); + err = replace_preds(file->event_call, filter, ps, false); if (err) { filter_disable(file); parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); @@ -1933,7 +1892,7 @@ static int create_filter(struct ftrace_event_call *call, err = create_filter_start(filter_str, set_str, &ps, &filter); if (!err) { - err = replace_preds(call, filter, ps, filter_str, false); + err = replace_preds(call, filter, ps, false); if (err && set_str) append_filter_err(ps, filter); 
} @@ -1959,7 +1918,7 @@ int create_event_filter(struct ftrace_event_call *call, * Identical to create_filter() except that it creates a subsystem filter * and always remembers @filter_str. */ -static int create_system_filter(struct event_subsystem *system, +static int create_system_filter(struct ftrace_subsystem_dir *dir, struct trace_array *tr, char *filter_str, struct event_filter **filterp) { @@ -1969,7 +1928,7 @@ static int create_system_filter(struct event_subsystem *system, err = create_filter_start(filter_str, true, &ps, &filter); if (!err) { - err = replace_system_preds(system, tr, ps, filter_str); + err = replace_system_preds(dir, tr, ps, filter_str); if (!err) { /* System filters just show a default message */ kfree(filter->filter_string); @@ -2053,18 +2012,18 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, } if (!strcmp(strstrip(filter_string), "0")) { - filter_free_subsystem_preds(system, tr); + filter_free_subsystem_preds(dir, tr); remove_filter_string(system->filter); filter = system->filter; system->filter = NULL; /* Ensure all filters are no longer used */ synchronize_sched(); - filter_free_subsystem_filters(system, tr); + filter_free_subsystem_filters(dir, tr); __free_filter(filter); goto out_unlock; } - err = create_system_filter(system, tr, filter_string, &filter); + err = create_system_filter(dir, tr, filter_string, &filter); if (filter) { /* * No event actually uses the system filter diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4de3e57f723c..f0a0c982cde3 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -15,6 +15,33 @@ #include "trace.h" #include "trace_output.h" +static bool kill_ftrace_graph; + +/** + * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called + * + * ftrace_graph_stop() is called when a severe error is detected in + * the function graph tracing. This function is called by the critical + * paths of function graph to keep those paths from doing any more harm. + */ +bool ftrace_graph_is_dead(void) +{ + return kill_ftrace_graph; +} + +/** + * ftrace_graph_stop - set to permanently disable function graph tracincg + * + * In case of an error int function graph tracing, this is called + * to try to keep function graph tracing from causing any more harm. + * Usually this is pretty severe and this is called to try to at least + * get a warning out to the user. 
+ */ +void ftrace_graph_stop(void) +{ + kill_ftrace_graph = true; +} + /* When set, irq functions will be ignored */ static int ftrace_graph_skip_irqs; @@ -92,6 +119,9 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, unsigned long long calltime; int index; + if (unlikely(ftrace_graph_is_dead())) + return -EBUSY; + if (!current->ret_stack) return -EBUSY; @@ -323,7 +353,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) return ret; } -int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) +static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) { if (tracing_thresh) return 1; @@ -412,7 +442,7 @@ void set_graph_array(struct trace_array *tr) smp_mb(); } -void trace_graph_thresh_return(struct ftrace_graph_ret *trace) +static void trace_graph_thresh_return(struct ftrace_graph_ret *trace) { if (tracing_thresh && (trace->rettime - trace->calltime < tracing_thresh)) @@ -445,6 +475,12 @@ static void graph_trace_reset(struct trace_array *tr) unregister_ftrace_graph(); } +static int graph_trace_update_thresh(struct trace_array *tr) +{ + graph_trace_reset(tr); + return graph_trace_init(tr); +} + static int max_bytes_for_cpu; static enum print_line_t @@ -1399,7 +1435,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags) seq_printf(s, " | | | |\n"); } -void print_graph_headers(struct seq_file *s) +static void print_graph_headers(struct seq_file *s) { print_graph_headers_flags(s, tracer_flags.val); } @@ -1495,6 +1531,7 @@ static struct trace_event graph_trace_ret_event = { static struct tracer graph_trace __tracer_data = { .name = "function_graph", + .update_thresh = graph_trace_update_thresh, .open = graph_trace_open, .pipe_open = graph_trace_open, .close = graph_trace_close, diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index f3dad80c20b2..c6977d5a9b12 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -20,23 +20,6 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; -int trace_print_seq(struct seq_file *m, struct trace_seq *s) -{ - int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; - int ret; - - ret = seq_write(m, s->buffer, len); - - /* - * Only reset this buffer if we successfully wrote to the - * seq_file buffer. - */ - if (!ret) - trace_seq_init(s); - - return ret; -} - enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -85,257 +68,6 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) return TRACE_TYPE_HANDLED; } -/** - * trace_seq_printf - sequence printing of trace information - * @s: trace sequence descriptor - * @fmt: printf format string - * - * It returns 0 if the trace oversizes the buffer's free - * space, 1 otherwise. - * - * The tracer may use either sequence operations or its own - * copy to user routines. To simplify formating of a trace - * trace_seq_printf is used to store strings into a special - * buffer (@s). Then the output may be either used by - * the sequencer or pulled into another buffer. - */ -int -trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 
-{ - int len = (PAGE_SIZE - 1) - s->len; - va_list ap; - int ret; - - if (s->full || !len) - return 0; - - va_start(ap, fmt); - ret = vsnprintf(s->buffer + s->len, len, fmt, ap); - va_end(ap); - - /* If we can't write it all, don't bother writing anything */ - if (ret >= len) { - s->full = 1; - return 0; - } - - s->len += ret; - - return 1; -} -EXPORT_SYMBOL_GPL(trace_seq_printf); - -/** - * trace_seq_bitmask - put a list of longs as a bitmask print output - * @s: trace sequence descriptor - * @maskp: points to an array of unsigned longs that represent a bitmask - * @nmaskbits: The number of bits that are valid in @maskp - * - * It returns 0 if the trace oversizes the buffer's free - * space, 1 otherwise. - * - * Writes a ASCII representation of a bitmask string into @s. - */ -int -trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, - int nmaskbits) -{ - int len = (PAGE_SIZE - 1) - s->len; - int ret; - - if (s->full || !len) - return 0; - - ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits); - s->len += ret; - - return 1; -} -EXPORT_SYMBOL_GPL(trace_seq_bitmask); - -/** - * trace_seq_vprintf - sequence printing of trace information - * @s: trace sequence descriptor - * @fmt: printf format string - * - * The tracer may use either sequence operations or its own - * copy to user routines. To simplify formating of a trace - * trace_seq_printf is used to store strings into a special - * buffer (@s). Then the output may be either used by - * the sequencer or pulled into another buffer. - */ -int -trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) -{ - int len = (PAGE_SIZE - 1) - s->len; - int ret; - - if (s->full || !len) - return 0; - - ret = vsnprintf(s->buffer + s->len, len, fmt, args); - - /* If we can't write it all, don't bother writing anything */ - if (ret >= len) { - s->full = 1; - return 0; - } - - s->len += ret; - - return len; -} -EXPORT_SYMBOL_GPL(trace_seq_vprintf); - -int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) -{ - int len = (PAGE_SIZE - 1) - s->len; - int ret; - - if (s->full || !len) - return 0; - - ret = bstr_printf(s->buffer + s->len, len, fmt, binary); - - /* If we can't write it all, don't bother writing anything */ - if (ret >= len) { - s->full = 1; - return 0; - } - - s->len += ret; - - return len; -} - -/** - * trace_seq_puts - trace sequence printing of simple string - * @s: trace sequence descriptor - * @str: simple string to record - * - * The tracer may use either the sequence operations or its own - * copy to user routines. This function records a simple string - * into a special buffer (@s) for later retrieval by a sequencer - * or other mechanism. 
- */ -int trace_seq_puts(struct trace_seq *s, const char *str) -{ - int len = strlen(str); - - if (s->full) - return 0; - - if (len > ((PAGE_SIZE - 1) - s->len)) { - s->full = 1; - return 0; - } - - memcpy(s->buffer + s->len, str, len); - s->len += len; - - return len; -} - -int trace_seq_putc(struct trace_seq *s, unsigned char c) -{ - if (s->full) - return 0; - - if (s->len >= (PAGE_SIZE - 1)) { - s->full = 1; - return 0; - } - - s->buffer[s->len++] = c; - - return 1; -} -EXPORT_SYMBOL(trace_seq_putc); - -int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) -{ - if (s->full) - return 0; - - if (len > ((PAGE_SIZE - 1) - s->len)) { - s->full = 1; - return 0; - } - - memcpy(s->buffer + s->len, mem, len); - s->len += len; - - return len; -} - -int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len) -{ - unsigned char hex[HEX_CHARS]; - const unsigned char *data = mem; - int i, j; - - if (s->full) - return 0; - -#ifdef __BIG_ENDIAN - for (i = 0, j = 0; i < len; i++) { -#else - for (i = len-1, j = 0; i >= 0; i--) { -#endif - hex[j++] = hex_asc_hi(data[i]); - hex[j++] = hex_asc_lo(data[i]); - } - hex[j++] = ' '; - - return trace_seq_putmem(s, hex, j); -} - -void *trace_seq_reserve(struct trace_seq *s, size_t len) -{ - void *ret; - - if (s->full) - return NULL; - - if (len > ((PAGE_SIZE - 1) - s->len)) { - s->full = 1; - return NULL; - } - - ret = s->buffer + s->len; - s->len += len; - - return ret; -} - -int trace_seq_path(struct trace_seq *s, const struct path *path) -{ - unsigned char *p; - - if (s->full) - return 0; - - if (s->len >= (PAGE_SIZE - 1)) { - s->full = 1; - return 0; - } - - p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); - if (!IS_ERR(p)) { - p = mangle_path(s->buffer + s->len, p, "\n"); - if (p) { - s->len = p - s->buffer; - return 1; - } - } else { - s->buffer[s->len++] = '?'; - return 1; - } - - s->full = 1; - return 0; -} - const char * ftrace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long flags, @@ -343,7 +75,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, { unsigned long mask; const char *str; - const char *ret = p->buffer + p->len; + const char *ret = trace_seq_buffer_ptr(p); int i, first = 1; for (i = 0; flag_array[i].name && flags; i++) { @@ -379,7 +111,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, const struct trace_print_flags *symbol_array) { int i; - const char *ret = p->buffer + p->len; + const char *ret = trace_seq_buffer_ptr(p); for (i = 0; symbol_array[i].name; i++) { @@ -390,7 +122,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, break; } - if (ret == (const char *)(p->buffer + p->len)) + if (ret == (const char *)(trace_seq_buffer_ptr(p))) trace_seq_printf(p, "0x%lx", val); trace_seq_putc(p, 0); @@ -405,7 +137,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, const struct trace_print_flags_u64 *symbol_array) { int i; - const char *ret = p->buffer + p->len; + const char *ret = trace_seq_buffer_ptr(p); for (i = 0; symbol_array[i].name; i++) { @@ -416,7 +148,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, break; } - if (ret == (const char *)(p->buffer + p->len)) + if (ret == (const char *)(trace_seq_buffer_ptr(p))) trace_seq_printf(p, "0x%llx", val); trace_seq_putc(p, 0); @@ -430,7 +162,7 @@ const char * ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, unsigned int bitmask_size) { - const char *ret = p->buffer + p->len; + const char *ret = 
trace_seq_buffer_ptr(p); trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8); trace_seq_putc(p, 0); @@ -443,7 +175,7 @@ const char * ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) { int i; - const char *ret = p->buffer + p->len; + const char *ret = trace_seq_buffer_ptr(p); for (i = 0; i < buf_len; i++) trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]); diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 127a9d8c8357..80b25b585a70 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -35,9 +35,6 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); extern int __unregister_ftrace_event(struct trace_event *event); extern struct rw_semaphore trace_event_sem; -#define MAX_MEMHEX_BYTES 8 -#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) - #define SEQ_PUT_FIELD_RET(s, x) \ do { \ if (!trace_seq_putmem(s, &(x), sizeof(x))) \ @@ -46,7 +43,6 @@ do { \ #define SEQ_PUT_HEX_FIELD_RET(s, x) \ do { \ - BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \ if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ return TRACE_TYPE_PARTIAL_LINE; \ } while (0) diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c new file mode 100644 index 000000000000..1f24ed99dca2 --- /dev/null +++ b/kernel/trace/trace_seq.c @@ -0,0 +1,428 @@ +/* + * trace_seq.c + * + * Copyright (C) 2008-2014 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> + * + * The trace_seq is a handy tool that allows you to pass a descriptor around + * to a buffer that other functions can write to. It is similar to the + * seq_file functionality but has some differences. + * + * To use it, the trace_seq must be initialized with trace_seq_init(). + * This will set up the counters within the descriptor. You can call + * trace_seq_init() more than once to reset the trace_seq to start + * from scratch. + * + * The buffer size is currently PAGE_SIZE, although it may become dynamic + * in the future. + * + * A write to the buffer will either succed or fail. That is, unlike + * sprintf() there will not be a partial write (well it may write into + * the buffer but it wont update the pointers). This allows users to + * try to write something into the trace_seq buffer and if it fails + * they can flush it and try again. + * + */ +#include <linux/uaccess.h> +#include <linux/seq_file.h> +#include <linux/trace_seq.h> + +/* How much buffer is left on the trace_seq? */ +#define TRACE_SEQ_BUF_LEFT(s) ((PAGE_SIZE - 1) - (s)->len) + +/* How much buffer is written? */ +#define TRACE_SEQ_BUF_USED(s) min((s)->len, (unsigned int)(PAGE_SIZE - 1)) + +/** + * trace_print_seq - move the contents of trace_seq into a seq_file + * @m: the seq_file descriptor that is the destination + * @s: the trace_seq descriptor that is the source. + * + * Returns 0 on success and non zero on error. If it succeeds to + * write to the seq_file it will reset the trace_seq, otherwise + * it does not modify the trace_seq to let the caller try again. + */ +int trace_print_seq(struct seq_file *m, struct trace_seq *s) +{ + unsigned int len = TRACE_SEQ_BUF_USED(s); + int ret; + + ret = seq_write(m, s->buffer, len); + + /* + * Only reset this buffer if we successfully wrote to the + * seq_file buffer. This lets the caller try again or + * do something else with the contents. 
+ */ + if (!ret) + trace_seq_init(s); + + return ret; +} + +/** + * trace_seq_printf - sequence printing of trace information + * @s: trace sequence descriptor + * @fmt: printf format string + * + * The tracer may use either sequence operations or its own + * copy to user routines. To simplify formating of a trace + * trace_seq_printf() is used to store strings into a special + * buffer (@s). Then the output may be either used by + * the sequencer or pulled into another buffer. + * + * Returns 1 if we successfully written all the contents to + * the buffer. + * Returns 0 if we the length to write is bigger than the + * reserved buffer space. In this case, nothing gets written. + */ +int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) +{ + unsigned int len = TRACE_SEQ_BUF_LEFT(s); + va_list ap; + int ret; + + if (s->full || !len) + return 0; + + va_start(ap, fmt); + ret = vsnprintf(s->buffer + s->len, len, fmt, ap); + va_end(ap); + + /* If we can't write it all, don't bother writing anything */ + if (ret >= len) { + s->full = 1; + return 0; + } + + s->len += ret; + + return 1; +} +EXPORT_SYMBOL_GPL(trace_seq_printf); + +/** + * trace_seq_bitmask - write a bitmask array in its ASCII representation + * @s: trace sequence descriptor + * @maskp: points to an array of unsigned longs that represent a bitmask + * @nmaskbits: The number of bits that are valid in @maskp + * + * Writes a ASCII representation of a bitmask string into @s. + * + * Returns 1 if we successfully written all the contents to + * the buffer. + * Returns 0 if we the length to write is bigger than the + * reserved buffer space. In this case, nothing gets written. + */ +int trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, + int nmaskbits) +{ + unsigned int len = TRACE_SEQ_BUF_LEFT(s); + int ret; + + if (s->full || !len) + return 0; + + ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits); + s->len += ret; + + return 1; +} +EXPORT_SYMBOL_GPL(trace_seq_bitmask); + +/** + * trace_seq_vprintf - sequence printing of trace information + * @s: trace sequence descriptor + * @fmt: printf format string + * + * The tracer may use either sequence operations or its own + * copy to user routines. To simplify formating of a trace + * trace_seq_printf is used to store strings into a special + * buffer (@s). Then the output may be either used by + * the sequencer or pulled into another buffer. + * + * Returns how much it wrote to the buffer. + */ +int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) +{ + unsigned int len = TRACE_SEQ_BUF_LEFT(s); + int ret; + + if (s->full || !len) + return 0; + + ret = vsnprintf(s->buffer + s->len, len, fmt, args); + + /* If we can't write it all, don't bother writing anything */ + if (ret >= len) { + s->full = 1; + return 0; + } + + s->len += ret; + + return len; +} +EXPORT_SYMBOL_GPL(trace_seq_vprintf); + +/** + * trace_seq_bprintf - Write the printf string from binary arguments + * @s: trace sequence descriptor + * @fmt: The format string for the @binary arguments + * @binary: The binary arguments for @fmt. + * + * When recording in a fast path, a printf may be recorded with just + * saving the format and the arguments as they were passed to the + * function, instead of wasting cycles converting the arguments into + * ASCII characters. Instead, the arguments are saved in a 32 bit + * word array that is defined by the format string constraints. 
+ * + * This function will take the format and the binary array and finish + * the conversion into the ASCII string within the buffer. + * + * Returns how much it wrote to the buffer. + */ +int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) +{ + unsigned int len = TRACE_SEQ_BUF_LEFT(s); + int ret; + + if (s->full || !len) + return 0; + + ret = bstr_printf(s->buffer + s->len, len, fmt, binary); + + /* If we can't write it all, don't bother writing anything */ + if (ret >= len) { + s->full = 1; + return 0; + } + + s->len += ret; + + return len; +} +EXPORT_SYMBOL_GPL(trace_seq_bprintf); + +/** + * trace_seq_puts - trace sequence printing of simple string + * @s: trace sequence descriptor + * @str: simple string to record + * + * The tracer may use either the sequence operations or its own + * copy to user routines. This function records a simple string + * into a special buffer (@s) for later retrieval by a sequencer + * or other mechanism. + * + * Returns how much it wrote to the buffer. + */ +int trace_seq_puts(struct trace_seq *s, const char *str) +{ + unsigned int len = strlen(str); + + if (s->full) + return 0; + + if (len > TRACE_SEQ_BUF_LEFT(s)) { + s->full = 1; + return 0; + } + + memcpy(s->buffer + s->len, str, len); + s->len += len; + + return len; +} +EXPORT_SYMBOL_GPL(trace_seq_puts); + +/** + * trace_seq_putc - trace sequence printing of simple character + * @s: trace sequence descriptor + * @c: simple character to record + * + * The tracer may use either the sequence operations or its own + * copy to user routines. This function records a simple charater + * into a special buffer (@s) for later retrieval by a sequencer + * or other mechanism. + * + * Returns how much it wrote to the buffer. + */ +int trace_seq_putc(struct trace_seq *s, unsigned char c) +{ + if (s->full) + return 0; + + if (TRACE_SEQ_BUF_LEFT(s) < 1) { + s->full = 1; + return 0; + } + + s->buffer[s->len++] = c; + + return 1; +} +EXPORT_SYMBOL_GPL(trace_seq_putc); + +/** + * trace_seq_putmem - write raw data into the trace_seq buffer + * @s: trace sequence descriptor + * @mem: The raw memory to copy into the buffer + * @len: The length of the raw memory to copy (in bytes) + * + * There may be cases where raw memory needs to be written into the + * buffer and a strcpy() would not work. Using this function allows + * for such cases. + * + * Returns how much it wrote to the buffer. + */ +int trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len) +{ + if (s->full) + return 0; + + if (len > TRACE_SEQ_BUF_LEFT(s)) { + s->full = 1; + return 0; + } + + memcpy(s->buffer + s->len, mem, len); + s->len += len; + + return len; +} +EXPORT_SYMBOL_GPL(trace_seq_putmem); + +#define MAX_MEMHEX_BYTES 8U +#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) + +/** + * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex + * @s: trace sequence descriptor + * @mem: The raw memory to write its hex ASCII representation of + * @len: The length of the raw memory to copy (in bytes) + * + * This is similar to trace_seq_putmem() except instead of just copying the + * raw memory into the buffer it writes its ASCII representation of it + * in hex characters. + * + * Returns how much it wrote to the buffer. 
+ */ +int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, + unsigned int len) +{ + unsigned char hex[HEX_CHARS]; + const unsigned char *data = mem; + unsigned int start_len; + int i, j; + int cnt = 0; + + if (s->full) + return 0; + + while (len) { + start_len = min(len, HEX_CHARS - 1); +#ifdef __BIG_ENDIAN + for (i = 0, j = 0; i < start_len; i++) { +#else + for (i = start_len-1, j = 0; i >= 0; i--) { +#endif + hex[j++] = hex_asc_hi(data[i]); + hex[j++] = hex_asc_lo(data[i]); + } + if (WARN_ON_ONCE(j == 0 || j/2 > len)) + break; + + /* j increments twice per loop */ + len -= j / 2; + hex[j++] = ' '; + + cnt += trace_seq_putmem(s, hex, j); + } + return cnt; +} +EXPORT_SYMBOL_GPL(trace_seq_putmem_hex); + +/** + * trace_seq_path - copy a path into the sequence buffer + * @s: trace sequence descriptor + * @path: path to write into the sequence buffer. + * + * Write a path name into the sequence buffer. + * + * Returns 1 if we successfully written all the contents to + * the buffer. + * Returns 0 if we the length to write is bigger than the + * reserved buffer space. In this case, nothing gets written. + */ +int trace_seq_path(struct trace_seq *s, const struct path *path) +{ + unsigned char *p; + + if (s->full) + return 0; + + if (TRACE_SEQ_BUF_LEFT(s) < 1) { + s->full = 1; + return 0; + } + + p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); + if (!IS_ERR(p)) { + p = mangle_path(s->buffer + s->len, p, "\n"); + if (p) { + s->len = p - s->buffer; + return 1; + } + } else { + s->buffer[s->len++] = '?'; + return 1; + } + + s->full = 1; + return 0; +} +EXPORT_SYMBOL_GPL(trace_seq_path); + +/** + * trace_seq_to_user - copy the squence buffer to user space + * @s: trace sequence descriptor + * @ubuf: The userspace memory location to copy to + * @cnt: The amount to copy + * + * Copies the sequence buffer into the userspace memory pointed to + * by @ubuf. It starts from the last read position (@s->readpos) + * and writes up to @cnt characters or till it reaches the end of + * the content in the buffer (@s->len), which ever comes first. + * + * On success, it returns a positive number of the number of bytes + * it copied. + * + * On failure it returns -EBUSY if all of the content in the + * sequence has been already read, which includes nothing in the + * sequenc (@s->len == @s->readpos). + * + * Returns -EFAULT if the copy to userspace fails. 
+ */ +int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) +{ + int len; + int ret; + + if (!cnt) + return 0; + + if (s->len <= s->readpos) + return -EBUSY; + + len = s->len - s->readpos; + if (cnt > len) + cnt = len; + ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); + if (ret == cnt) + return -EFAULT; + + cnt -= ret; + + s->readpos += cnt; + return cnt; +} +EXPORT_SYMBOL_GPL(trace_seq_to_user); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 3c9b97e6b1f4..33ff6a24b802 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -265,7 +265,6 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) if (is_ret) tu->consumer.ret_handler = uretprobe_dispatcher; init_trace_uprobe_filter(&tu->filter); - tu->tp.call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER; return tu; error: @@ -1292,7 +1291,7 @@ static int register_uprobe_event(struct trace_uprobe *tu) kfree(call->print_fmt); return -ENODEV; } - call->flags = 0; + call->class->reg = trace_uprobe_register; call->data = tu; ret = trace_add_event_call(call); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 35974ac69600..5dbe22aa3efd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -265,7 +265,6 @@ struct workqueue_struct { static struct kmem_cache *pwq_cache; -static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */ static cpumask_var_t *wq_numa_possible_cpumask; /* possible CPUs of each node */ @@ -758,13 +757,6 @@ static bool too_many_workers(struct worker_pool *pool) int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; - /* - * nr_idle and idle_list may disagree if idle rebinding is in - * progress. Never return %true if idle_list is empty. - */ - if (list_empty(&pool->idle_list)) - return false; - return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; } @@ -850,7 +842,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) pool = worker->pool; /* this can only happen on the local cpu */ - if (WARN_ON_ONCE(cpu != raw_smp_processor_id())) + if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu)) return NULL; /* @@ -874,35 +866,22 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) * worker_set_flags - set worker flags and adjust nr_running accordingly * @worker: self * @flags: flags to set - * @wakeup: wakeup an idle worker if necessary * - * Set @flags in @worker->flags and adjust nr_running accordingly. If - * nr_running becomes zero and @wakeup is %true, an idle worker is - * woken up. + * Set @flags in @worker->flags and adjust nr_running accordingly. * * CONTEXT: * spin_lock_irq(pool->lock) */ -static inline void worker_set_flags(struct worker *worker, unsigned int flags, - bool wakeup) +static inline void worker_set_flags(struct worker *worker, unsigned int flags) { struct worker_pool *pool = worker->pool; WARN_ON_ONCE(worker->task != current); - /* - * If transitioning into NOT_RUNNING, adjust nr_running and - * wake up an idle worker as necessary if requested by - * @wakeup. - */ + /* If transitioning into NOT_RUNNING, adjust nr_running. 
*/ if ((flags & WORKER_NOT_RUNNING) && !(worker->flags & WORKER_NOT_RUNNING)) { - if (wakeup) { - if (atomic_dec_and_test(&pool->nr_running) && - !list_empty(&pool->worklist)) - wake_up_worker(pool); - } else - atomic_dec(&pool->nr_running); + atomic_dec(&pool->nr_running); } worker->flags |= flags; @@ -1232,7 +1211,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, pwq_activate_delayed_work(work); list_del_init(&work->entry); - pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work)); + pwq_dec_nr_in_flight(pwq, get_work_color(work)); /* work->data points to pwq iff queued, point to pool */ set_work_pool_and_keep_pending(work, pool->id); @@ -1560,7 +1539,7 @@ static void worker_enter_idle(struct worker *worker) (worker->hentry.next || worker->hentry.pprev))) return; - /* can't use worker_set_flags(), also called from start_worker() */ + /* can't use worker_set_flags(), also called from create_worker() */ worker->flags |= WORKER_IDLE; pool->nr_idle++; worker->last_active = jiffies; @@ -1602,11 +1581,11 @@ static void worker_leave_idle(struct worker *worker) list_del_init(&worker->entry); } -static struct worker *alloc_worker(void) +static struct worker *alloc_worker(int node) { struct worker *worker; - worker = kzalloc(sizeof(*worker), GFP_KERNEL); + worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node); if (worker) { INIT_LIST_HEAD(&worker->entry); INIT_LIST_HEAD(&worker->scheduled); @@ -1670,6 +1649,9 @@ static void worker_detach_from_pool(struct worker *worker, detach_completion = pool->detach_completion; mutex_unlock(&pool->attach_mutex); + /* clear leftover flags without pool->lock after it is detached */ + worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND); + if (detach_completion) complete(detach_completion); } @@ -1678,8 +1660,7 @@ static void worker_detach_from_pool(struct worker *worker, * create_worker - create a new workqueue worker * @pool: pool the new worker will belong to * - * Create a new worker which is attached to @pool. The new worker must be - * started by start_worker(). + * Create and start a new worker which is attached to @pool. * * CONTEXT: * Might sleep. Does GFP_KERNEL allocations. @@ -1698,7 +1679,7 @@ static struct worker *create_worker(struct worker_pool *pool) if (id < 0) goto fail; - worker = alloc_worker(); + worker = alloc_worker(pool->node); if (!worker) goto fail; @@ -1724,6 +1705,13 @@ static struct worker *create_worker(struct worker_pool *pool) /* successful, attach the worker to the pool */ worker_attach_to_pool(worker, pool); + /* start the newly created worker */ + spin_lock_irq(&pool->lock); + worker->pool->nr_workers++; + worker_enter_idle(worker); + wake_up_process(worker->task); + spin_unlock_irq(&pool->lock); + return worker; fail: @@ -1734,44 +1722,6 @@ fail: } /** - * start_worker - start a newly created worker - * @worker: worker to start - * - * Make the pool aware of @worker and start it. - * - * CONTEXT: - * spin_lock_irq(pool->lock). - */ -static void start_worker(struct worker *worker) -{ - worker->pool->nr_workers++; - worker_enter_idle(worker); - wake_up_process(worker->task); -} - -/** - * create_and_start_worker - create and start a worker for a pool - * @pool: the target pool - * - * Grab the managership of @pool and create and start a new worker for it. - * - * Return: 0 on success. A negative error code otherwise. 
- */ -static int create_and_start_worker(struct worker_pool *pool) -{ - struct worker *worker; - - worker = create_worker(pool); - if (worker) { - spin_lock_irq(&pool->lock); - start_worker(worker); - spin_unlock_irq(&pool->lock); - } - - return worker ? 0 : -ENOMEM; -} - -/** * destroy_worker - destroy a workqueue worker * @worker: worker to be destroyed * @@ -1909,23 +1859,10 @@ restart: mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); while (true) { - struct worker *worker; - - worker = create_worker(pool); - if (worker) { - del_timer_sync(&pool->mayday_timer); - spin_lock_irq(&pool->lock); - start_worker(worker); - if (WARN_ON_ONCE(need_to_create_worker(pool))) - goto restart; - return true; - } - - if (!need_to_create_worker(pool)) + if (create_worker(pool) || !need_to_create_worker(pool)) break; - __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(CREATE_COOLDOWN); + schedule_timeout_interruptible(CREATE_COOLDOWN); if (!need_to_create_worker(pool)) break; @@ -1933,6 +1870,11 @@ restart: del_timer_sync(&pool->mayday_timer); spin_lock_irq(&pool->lock); + /* + * This is necessary even after a new worker was just successfully + * created as @pool->lock was dropped and the new worker might have + * already become busy. + */ if (need_to_create_worker(pool)) goto restart; return true; @@ -2020,13 +1962,8 @@ __acquires(&pool->lock) lockdep_copy_map(&lockdep_map, &work->lockdep_map); #endif - /* - * Ensure we're on the correct CPU. DISASSOCIATED test is - * necessary to avoid spurious warnings from rescuers servicing the - * unbound or a disassociated pool. - */ - WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) && - !(pool->flags & POOL_DISASSOCIATED) && + /* ensure we're on the correct CPU */ + WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) && raw_smp_processor_id() != pool->cpu); /* @@ -2052,17 +1989,22 @@ __acquires(&pool->lock) list_del_init(&work->entry); /* - * CPU intensive works don't participate in concurrency - * management. They're the scheduler's responsibility. + * CPU intensive works don't participate in concurrency management. + * They're the scheduler's responsibility. This takes @worker out + * of concurrency management and the next code block will chain + * execution of the pending work items. */ if (unlikely(cpu_intensive)) - worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); + worker_set_flags(worker, WORKER_CPU_INTENSIVE); /* - * Unbound pool isn't concurrency managed and work items should be - * executed ASAP. Wake up another worker if necessary. + * Wake up another worker if necessary. The condition is always + * false for normal per-cpu workers since nr_running would always + * be >= 1 at this point. This is used to chain execution of the + * pending work items for WORKER_NOT_RUNNING workers such as the + * UNBOUND and CPU_INTENSIVE ones. */ - if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) + if (need_more_worker(pool)) wake_up_worker(pool); /* @@ -2218,7 +2160,7 @@ recheck: } } while (keep_working(pool)); - worker_set_flags(worker, WORKER_PREP, false); + worker_set_flags(worker, WORKER_PREP); sleep: /* * pool->lock is held and there's no work to process and no need to @@ -2311,29 +2253,27 @@ repeat: move_linked_works(work, scheduled, &n); process_scheduled_works(rescuer); - spin_unlock_irq(&pool->lock); - - worker_detach_from_pool(rescuer, pool); - - spin_lock_irq(&pool->lock); /* * Put the reference grabbed by send_mayday(). @pool won't - * go away while we're holding its lock. 
+ * go away while we're still attached to it. */ put_pwq(pwq); /* - * Leave this pool. If keep_working() is %true, notify a + * Leave this pool. If need_more_worker() is %true, notify a * regular worker; otherwise, we end up with 0 concurrency * and stalling the execution. */ - if (keep_working(pool)) + if (need_more_worker(pool)) wake_up_worker(pool); rescuer->pool = NULL; - spin_unlock(&pool->lock); - spin_lock(&wq_mayday_lock); + spin_unlock_irq(&pool->lock); + + worker_detach_from_pool(rescuer, pool); + + spin_lock_irq(&wq_mayday_lock); } spin_unlock_irq(&wq_mayday_lock); @@ -3458,7 +3398,7 @@ static void put_unbound_pool(struct worker_pool *pool) return; /* sanity checks */ - if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) || + if (WARN_ON(!(pool->cpu < 0)) || WARN_ON(!list_empty(&pool->worklist))) return; @@ -3524,7 +3464,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { if (wqattrs_equal(pool->attrs, attrs)) { pool->refcnt++; - goto out_unlock; + return pool; } } @@ -3557,12 +3497,12 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) goto fail; /* create and start the initial worker */ - if (create_and_start_worker(pool) < 0) + if (!create_worker(pool)) goto fail; /* install */ hash_add(unbound_pool_hash, &pool->hash_node, hash); -out_unlock: + return pool; fail: if (pool) @@ -3591,11 +3531,6 @@ static void pwq_unbound_release_workfn(struct work_struct *work) if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) return; - /* - * Unlink @pwq. Synchronization against wq->mutex isn't strictly - * necessary on release but do it anyway. It's easier to verify - * and consistent with the linking path. - */ mutex_lock(&wq->mutex); list_del_rcu(&pwq->pwqs_node); is_last = list_empty(&wq->pwqs); @@ -3692,10 +3627,7 @@ static void link_pwq(struct pool_workqueue *pwq) if (!list_empty(&pwq->pwqs_node)) return; - /* - * Set the matching work_color. This is synchronized with - * wq->mutex to avoid confusing flush_workqueue(). 
- */ + /* set the matching work_color */ pwq->work_color = wq->work_color; /* sync max_active to the current setting */ @@ -3832,7 +3764,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) return -EINVAL; - pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL); + pwq_tbl = kzalloc(nr_node_ids * sizeof(pwq_tbl[0]), GFP_KERNEL); new_attrs = alloc_workqueue_attrs(GFP_KERNEL); tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL); if (!pwq_tbl || !new_attrs || !tmp_attrs) @@ -4080,7 +4012,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, /* allocate wq and format name */ if (flags & WQ_UNBOUND) - tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]); + tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]); wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL); if (!wq) @@ -4122,7 +4054,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, if (flags & WQ_MEM_RECLAIM) { struct worker *rescuer; - rescuer = alloc_worker(); + rescuer = alloc_worker(NUMA_NO_NODE); if (!rescuer) goto err_destroy; @@ -4470,8 +4402,6 @@ static void wq_unbind_fn(struct work_struct *work) struct worker *worker; for_each_cpu_worker_pool(pool, cpu) { - WARN_ON_ONCE(cpu != smp_processor_id()); - mutex_lock(&pool->attach_mutex); spin_lock_irq(&pool->lock); @@ -4543,6 +4473,7 @@ static void rebind_workers(struct worker_pool *pool) pool->attrs->cpumask) < 0); spin_lock_irq(&pool->lock); + pool->flags &= ~POOL_DISASSOCIATED; for_each_pool_worker(worker, pool) { unsigned int worker_flags = worker->flags; @@ -4632,7 +4563,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb, for_each_cpu_worker_pool(pool, cpu) { if (pool->nr_workers) continue; - if (create_and_start_worker(pool) < 0) + if (!create_worker(pool)) return NOTIFY_BAD; } break; @@ -4644,15 +4575,10 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb, for_each_pool(pool, pi) { mutex_lock(&pool->attach_mutex); - if (pool->cpu == cpu) { - spin_lock_irq(&pool->lock); - pool->flags &= ~POOL_DISASSOCIATED; - spin_unlock_irq(&pool->lock); - + if (pool->cpu == cpu) rebind_workers(pool); - } else if (pool->cpu < 0) { + else if (pool->cpu < 0) restore_unbound_workers_cpumask(pool, cpu); - } mutex_unlock(&pool->attach_mutex); } @@ -4856,10 +4782,6 @@ static void __init wq_numa_init(void) cpumask_var_t *tbl; int node, cpu; - /* determine NUMA pwq table len - highest node id + 1 */ - for_each_node(node) - wq_numa_tbl_len = max(wq_numa_tbl_len, node + 1); - if (num_possible_nodes() <= 1) return; @@ -4876,7 +4798,7 @@ static void __init wq_numa_init(void) * available. Build one from cpu_to_node() which should have been * fully initialized by now. */ - tbl = kzalloc(wq_numa_tbl_len * sizeof(tbl[0]), GFP_KERNEL); + tbl = kzalloc(nr_node_ids * sizeof(tbl[0]), GFP_KERNEL); BUG_ON(!tbl); for_each_node(node) @@ -4936,7 +4858,7 @@ static int __init init_workqueues(void) for_each_cpu_worker_pool(pool, cpu) { pool->flags &= ~POOL_DISASSOCIATED; - BUG_ON(create_and_start_worker(pool) < 0); + BUG_ON(!create_worker(pool)); } } |