diff options
Diffstat (limited to 'kernel')
34 files changed, 1177 insertions, 947 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index 22bb4f24f071..8d528f9930da 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1883,6 +1883,23 @@ out_null: audit_log_format(ab, " exe=(null)"); } +struct tty_struct *audit_get_tty(struct task_struct *tsk) +{ + struct tty_struct *tty = NULL; + unsigned long flags; + + spin_lock_irqsave(&tsk->sighand->siglock, flags); + if (tsk->signal) + tty = tty_kref_get(tsk->signal->tty); + spin_unlock_irqrestore(&tsk->sighand->siglock, flags); + return tty; +} + +void audit_put_tty(struct tty_struct *tty) +{ + tty_kref_put(tty); +} + void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) { const struct cred *cred; diff --git a/kernel/audit.h b/kernel/audit.h index cbbe6bb6496e..a492f4c4e710 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -23,6 +23,7 @@ #include <linux/audit.h> #include <linux/skbuff.h> #include <uapi/linux/mqueue.h> +#include <linux/tty.h> /* AUDIT_NAMES is the number of slots we reserve in the audit_context * for saving names from getname(). If we get more names we will allocate @@ -262,6 +263,9 @@ extern struct audit_entry *audit_dupe_rule(struct audit_krule *old); extern void audit_log_d_path_exe(struct audit_buffer *ab, struct mm_struct *mm); +extern struct tty_struct *audit_get_tty(struct task_struct *tsk); +extern void audit_put_tty(struct tty_struct *tty); + /* audit watch functions */ #ifdef CONFIG_AUDIT_WATCH extern void audit_put_watch(struct audit_watch *watch); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 62ab53d7619c..2672d105cffc 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -63,7 +63,6 @@ #include <asm/unistd.h> #include <linux/security.h> #include <linux/list.h> -#include <linux/tty.h> #include <linux/binfmts.h> #include <linux/highmem.h> #include <linux/syscalls.h> @@ -1985,14 +1984,15 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, if (!audit_enabled) return; + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); + if (!ab) + return; + uid = from_kuid(&init_user_ns, task_uid(current)); oldloginuid = from_kuid(&init_user_ns, koldloginuid); loginuid = from_kuid(&init_user_ns, kloginuid), tty = audit_get_tty(current); - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); - if (!ab) - return; audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid); audit_log_task_context(ab); audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 668e07903c8f..eec9f90ba030 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -126,31 +126,6 @@ * are set to NOT_INIT to indicate that they are no longer readable. */ -/* types of values stored in eBPF registers */ -enum bpf_reg_type { - NOT_INIT = 0, /* nothing was written into register */ - UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */ - PTR_TO_CTX, /* reg points to bpf_context */ - CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ - PTR_TO_MAP_VALUE, /* reg points to map element value */ - PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ - FRAME_PTR, /* reg == frame_pointer */ - PTR_TO_STACK, /* reg == frame_pointer + imm */ - CONST_IMM, /* constant integer value */ - - /* PTR_TO_PACKET represents: - * skb->data - * skb->data + imm - * skb->data + (u16) var - * skb->data + (u16) var + imm - * if (range > 0) then [ptr, ptr + range - off) is safe to access - * if (id > 0) means that some 'var' was added - * if (off > 0) menas that 'imm' was added - */ - PTR_TO_PACKET, - PTR_TO_PACKET_END, /* skb->data + headlen */ -}; - struct reg_state { enum bpf_reg_type type; union { @@ -695,10 +670,10 @@ static int check_packet_access(struct verifier_env *env, u32 regno, int off, /* check access to 'struct bpf_context' fields */ static int check_ctx_access(struct verifier_env *env, int off, int size, - enum bpf_access_type t) + enum bpf_access_type t, enum bpf_reg_type *reg_type) { if (env->prog->aux->ops->is_valid_access && - env->prog->aux->ops->is_valid_access(off, size, t)) { + env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) { /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) env->prog->aux->max_ctx_offset = off + size; @@ -798,21 +773,19 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, mark_reg_unknown_value(state->regs, value_regno); } else if (reg->type == PTR_TO_CTX) { + enum bpf_reg_type reg_type = UNKNOWN_VALUE; + if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose("R%d leaks addr into ctx\n", value_regno); return -EACCES; } - err = check_ctx_access(env, off, size, t); + err = check_ctx_access(env, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { mark_reg_unknown_value(state->regs, value_regno); - if (off == offsetof(struct __sk_buff, data) && - env->allow_ptr_leaks) + if (env->allow_ptr_leaks) /* note that reg.[id|off|range] == 0 */ - state->regs[value_regno].type = PTR_TO_PACKET; - else if (off == offsetof(struct __sk_buff, data_end) && - env->allow_ptr_leaks) - state->regs[value_regno].type = PTR_TO_PACKET_END; + state->regs[value_regno].type = reg_type; } } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 86cb5c6e8932..75c0ff00aca6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -837,6 +837,8 @@ static void put_css_set_locked(struct css_set *cset) static void put_css_set(struct css_set *cset) { + unsigned long flags; + /* * Ensure that the refcount doesn't hit zero while any readers * can see it. Similar to atomic_dec_and_lock(), but for an @@ -845,9 +847,9 @@ static void put_css_set(struct css_set *cset) if (atomic_add_unless(&cset->refcount, -1, 1)) return; - spin_lock_bh(&css_set_lock); + spin_lock_irqsave(&css_set_lock, flags); put_css_set_locked(cset); - spin_unlock_bh(&css_set_lock); + spin_unlock_irqrestore(&css_set_lock, flags); } /* @@ -1070,11 +1072,11 @@ static struct css_set *find_css_set(struct css_set *old_cset, /* First see if we already have a cgroup group that matches * the desired set */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); cset = find_existing_css_set(old_cset, cgrp, template); if (cset) get_css_set(cset); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); if (cset) return cset; @@ -1102,7 +1104,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, * find_existing_css_set() */ memcpy(cset->subsys, template, sizeof(cset->subsys)); - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); /* Add reference counts and links from the new css_set. */ list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; @@ -1128,7 +1130,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, css_get(css); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); return cset; } @@ -1192,7 +1194,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) * Release all the links from cset_links to this hierarchy's * root cgroup */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { list_del(&link->cset_link); @@ -1200,7 +1202,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) kfree(link); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); if (!list_empty(&root->root_list)) { list_del(&root->root_list); @@ -1600,11 +1602,11 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) ss->root = dst_root; css->cgroup = dcgrp; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); hash_for_each(css_set_table, i, cset, hlist) list_move_tail(&cset->e_cset_node[ss->id], &dcgrp->e_csets[ss->id]); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; @@ -1640,10 +1642,10 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, if (!buf) return -ENOMEM; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot); len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); if (len >= PATH_MAX) len = -ERANGE; @@ -1897,7 +1899,7 @@ static void cgroup_enable_task_cg_lists(void) { struct task_struct *p, *g; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); if (use_task_css_set_links) goto out_unlock; @@ -1922,8 +1924,12 @@ static void cgroup_enable_task_cg_lists(void) * entry won't be deleted though the process has exited. * Do it while holding siglock so that we don't end up * racing against cgroup_exit(). + * + * Interrupts were already disabled while acquiring + * the css_set_lock, so we do not need to disable it + * again when acquiring the sighand->siglock here. */ - spin_lock_irq(&p->sighand->siglock); + spin_lock(&p->sighand->siglock); if (!(p->flags & PF_EXITING)) { struct css_set *cset = task_css_set(p); @@ -1932,11 +1938,11 @@ static void cgroup_enable_task_cg_lists(void) list_add_tail(&p->cg_list, &cset->tasks); get_css_set(cset); } - spin_unlock_irq(&p->sighand->siglock); + spin_unlock(&p->sighand->siglock); } while_each_thread(g, p); read_unlock(&tasklist_lock); out_unlock: - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); } static void init_cgroup_housekeeping(struct cgroup *cgrp) @@ -2043,13 +2049,13 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) * Link the root cgroup in this hierarchy into all the css_set * objects. */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); hash_for_each(css_set_table, i, cset, hlist) { link_css_set(&tmp_links, cset, root_cgrp); if (css_set_populated(cset)) cgroup_update_populated(root_cgrp, true); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); BUG_ON(!list_empty(&root_cgrp->self.children)); BUG_ON(atomic_read(&root->nr_cgrps) != 1); @@ -2256,11 +2262,11 @@ out_mount: struct cgroup *cgrp; mutex_lock(&cgroup_mutex); - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); cgrp = cset_cgroup_from_root(ns->root_cset, root); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb); @@ -2337,11 +2343,11 @@ char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, char *ret; mutex_lock(&cgroup_mutex); - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); return ret; @@ -2369,7 +2375,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) char *path = NULL; mutex_lock(&cgroup_mutex); - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); @@ -2382,7 +2388,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) path = buf; } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); return path; } @@ -2557,7 +2563,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, * the new cgroup. There are no failure cases after here, so this * is the commit point. */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry(cset, &tset->src_csets, mg_node) { list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) { struct css_set *from_cset = task_css_set(task); @@ -2568,7 +2574,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, put_css_set_locked(from_cset); } } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); /* * Migration is committed, all target tasks are now on dst_csets. @@ -2597,13 +2603,13 @@ out_cancel_attach: } } while_each_subsys_mask(); out_release_tset: - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_splice_init(&tset->dst_csets, &tset->src_csets); list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) { list_splice_tail_init(&cset->mg_tasks, &cset->tasks); list_del_init(&cset->mg_node); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); return ret; } @@ -2634,7 +2640,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) lockdep_assert_held(&cgroup_mutex); - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; @@ -2642,7 +2648,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) list_del_init(&cset->mg_preload_node); put_css_set_locked(cset); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); } /** @@ -2783,7 +2789,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, * already PF_EXITING could be freed from underneath us unless we * take an rcu_read_lock. */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); rcu_read_lock(); task = leader; do { @@ -2792,7 +2798,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, break; } while_each_thread(leader, task); rcu_read_unlock(); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); return cgroup_taskset_migrate(&tset, root); } @@ -2816,7 +2822,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, return -EBUSY; /* look up all src csets */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); rcu_read_lock(); task = leader; do { @@ -2826,7 +2832,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, break; } while_each_thread(leader, task); rcu_read_unlock(); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); /* prepare dst csets and commit */ ret = cgroup_migrate_prepare_dst(&preloaded_csets); @@ -2859,9 +2865,9 @@ static int cgroup_procs_write_permission(struct task_struct *task, struct cgroup *cgrp; struct inode *inode; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); while (!cgroup_is_descendant(dst_cgrp, cgrp)) cgrp = cgroup_parent(cgrp); @@ -2962,9 +2968,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) if (root == &cgrp_dfl_root) continue; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); from_cgrp = task_cgroup_from_root(from, root); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); retval = cgroup_attach_task(from_cgrp, tsk, false); if (retval) @@ -3080,7 +3086,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) percpu_down_write(&cgroup_threadgroup_rwsem); /* look up all csses currently attached to @cgrp's subtree */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { struct cgrp_cset_link *link; @@ -3088,14 +3094,14 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) cgroup_migrate_add_src(link->cset, dsct, &preloaded_csets); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(&preloaded_csets); if (ret) goto out_finish; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { struct task_struct *task, *ntask; @@ -3107,7 +3113,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) cgroup_taskset_add(task, &tset); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); ret = cgroup_taskset_migrate(&tset, cgrp->root); out_finish: @@ -3908,10 +3914,10 @@ static int cgroup_task_count(const struct cgroup *cgrp) int count = 0; struct cgrp_cset_link *link; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) count += atomic_read(&link->cset->refcount); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); return count; } @@ -4249,7 +4255,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, memset(it, 0, sizeof(*it)); - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); it->ss = css->ss; @@ -4262,7 +4268,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, css_task_iter_advance_css_set(it); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); } /** @@ -4280,7 +4286,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) it->cur_task = NULL; } - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); if (it->task_pos) { it->cur_task = list_entry(it->task_pos, struct task_struct, @@ -4289,7 +4295,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) css_task_iter_advance(it); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); return it->cur_task; } @@ -4303,10 +4309,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) void css_task_iter_end(struct css_task_iter *it) { if (it->cur_cset) { - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_del(&it->iters_node); put_css_set_locked(it->cur_cset); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); } if (it->cur_task) @@ -4338,10 +4344,10 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) mutex_lock(&cgroup_mutex); /* all tasks in @from are being moved, all csets are source */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry(link, &from->cset_links, cset_link) cgroup_migrate_add_src(link->cset, to, &preloaded_csets); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); ret = cgroup_migrate_prepare_dst(&preloaded_csets); if (ret) @@ -5063,6 +5069,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, memset(css, 0, sizeof(*css)); css->cgroup = cgrp; css->ss = ss; + css->id = -1; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); css->serial_nr = css_serial_nr_next++; @@ -5150,7 +5157,7 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL); if (err < 0) - goto err_free_percpu_ref; + goto err_free_css; css->id = err; /* @css is ready to be brought online now, make it visible */ @@ -5174,9 +5181,6 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, err_list_del: list_del_rcu(&css->sibling); - cgroup_idr_remove(&ss->css_idr, css->id); -err_free_percpu_ref: - percpu_ref_exit(&css->refcnt); err_free_css: call_rcu(&css->rcu_head, css_free_rcu_fn); return ERR_PTR(err); @@ -5451,10 +5455,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ cgrp->self.flags &= ~CSS_ONLINE; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) link->cset->dead = true; - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) @@ -5725,7 +5729,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, goto out; mutex_lock(&cgroup_mutex); - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); for_each_root(root) { struct cgroup_subsys *ss; @@ -5778,7 +5782,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, retval = 0; out_unlock: - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); kfree(buf); out: @@ -5923,13 +5927,13 @@ void cgroup_post_fork(struct task_struct *child) if (use_task_css_set_links) { struct css_set *cset; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); cset = task_css_set(current); if (list_empty(&child->cg_list)) { get_css_set(cset); css_set_move_task(child, NULL, cset, false); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); } /* @@ -5974,9 +5978,9 @@ void cgroup_exit(struct task_struct *tsk) cset = task_css_set(tsk); if (!list_empty(&tsk->cg_list)) { - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); css_set_move_task(tsk, cset, NULL, false); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); } else { get_css_set(cset); } @@ -6044,9 +6048,9 @@ static void cgroup_release_agent(struct work_struct *work) if (!pathbuf || !agentbuf) goto out; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); if (!path) goto out; @@ -6306,12 +6310,12 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, return ERR_PTR(-EPERM); mutex_lock(&cgroup_mutex); - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); cset = task_css_set(current); get_css_set(cset); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); new_ns = alloc_cgroup_ns(); @@ -6435,7 +6439,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) if (!name_buf) return -ENOMEM; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); rcu_read_lock(); cset = rcu_dereference(current->cgroups); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { @@ -6446,7 +6450,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) c->root->hierarchy_id, name_buf); } rcu_read_unlock(); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); kfree(name_buf); return 0; } @@ -6457,7 +6461,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) struct cgroup_subsys_state *css = seq_css(seq); struct cgrp_cset_link *link; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { struct css_set *cset = link->cset; struct task_struct *task; @@ -6480,7 +6484,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) overflow: seq_puts(seq, " ...\n"); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); return 0; } diff --git a/kernel/cpu.c b/kernel/cpu.c index d948e44c471e..7b61887f7ccd 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1201,6 +1201,8 @@ static struct cpuhp_step cpuhp_bp_states[] = { .teardown = takedown_cpu, .cant_stop = true, }, +#else + [CPUHP_BRINGUP_CPU] = { }, #endif }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 9c51ec3f0f44..43d43a2d5811 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1678,12 +1678,33 @@ static bool is_orphaned_event(struct perf_event *event) return event->state == PERF_EVENT_STATE_DEAD; } -static inline int pmu_filter_match(struct perf_event *event) +static inline int __pmu_filter_match(struct perf_event *event) { struct pmu *pmu = event->pmu; return pmu->filter_match ? pmu->filter_match(event) : 1; } +/* + * Check whether we should attempt to schedule an event group based on + * PMU-specific filtering. An event group can consist of HW and SW events, + * potentially with a SW leader, so we must check all the filters, to + * determine whether a group is schedulable: + */ +static inline int pmu_filter_match(struct perf_event *event) +{ + struct perf_event *child; + + if (!__pmu_filter_match(event)) + return 0; + + list_for_each_entry(child, &event->sibling_list, group_entry) { + if (!__pmu_filter_match(child)) + return 0; + } + + return 1; +} + static inline int event_filter_match(struct perf_event *event) { @@ -7529,7 +7550,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event) prog = event->tp_event->prog; if (prog) { event->tp_event->prog = NULL; - bpf_prog_put(prog); + bpf_prog_put_rcu(prog); } } diff --git a/kernel/fork.c b/kernel/fork.c index 5c2c355aa97f..4a7ec0c6c88c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -148,18 +148,18 @@ static inline void free_task_struct(struct task_struct *tsk) } #endif -void __weak arch_release_thread_info(struct thread_info *ti) +void __weak arch_release_thread_stack(unsigned long *stack) { } -#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR +#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR /* * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a * kmemcache based allocator. */ # if THREAD_SIZE >= PAGE_SIZE -static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, @@ -172,33 +172,33 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, return page ? page_address(page) : NULL; } -static inline void free_thread_info(struct thread_info *ti) +static inline void free_thread_stack(unsigned long *stack) { - struct page *page = virt_to_page(ti); + struct page *page = virt_to_page(stack); memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, -(1 << THREAD_SIZE_ORDER)); __free_kmem_pages(page, THREAD_SIZE_ORDER); } # else -static struct kmem_cache *thread_info_cache; +static struct kmem_cache *thread_stack_cache; -static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { - return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node); + return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); } -static void free_thread_info(struct thread_info *ti) +static void free_thread_stack(unsigned long *stack) { - kmem_cache_free(thread_info_cache, ti); + kmem_cache_free(thread_stack_cache, stack); } -void thread_info_cache_init(void) +void thread_stack_cache_init(void) { - thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE, + thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE, THREAD_SIZE, 0, NULL); - BUG_ON(thread_info_cache == NULL); + BUG_ON(thread_stack_cache == NULL); } # endif #endif @@ -221,9 +221,9 @@ struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -static void account_kernel_stack(struct thread_info *ti, int account) +static void account_kernel_stack(unsigned long *stack, int account) { - struct zone *zone = page_zone(virt_to_page(ti)); + struct zone *zone = page_zone(virt_to_page(stack)); mod_zone_page_state(zone, NR_KERNEL_STACK, account); } @@ -231,8 +231,8 @@ static void account_kernel_stack(struct thread_info *ti, int account) void free_task(struct task_struct *tsk) { account_kernel_stack(tsk->stack, -1); - arch_release_thread_info(tsk->stack); - free_thread_info(tsk->stack); + arch_release_thread_stack(tsk->stack); + free_thread_stack(tsk->stack); rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); put_seccomp_filter(tsk); @@ -343,7 +343,7 @@ void set_task_stack_end_magic(struct task_struct *tsk) static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; - struct thread_info *ti; + unsigned long *stack; int err; if (node == NUMA_NO_NODE) @@ -352,15 +352,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (!tsk) return NULL; - ti = alloc_thread_info_node(tsk, node); - if (!ti) + stack = alloc_thread_stack_node(tsk, node); + if (!stack) goto free_tsk; err = arch_dup_task_struct(tsk, orig); if (err) - goto free_ti; + goto free_stack; - tsk->stack = ti; + tsk->stack = stack; #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under @@ -392,14 +392,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->task_frag.page = NULL; tsk->wake_q.next = NULL; - account_kernel_stack(ti, 1); + account_kernel_stack(stack, 1); kcov_task_init(tsk); return tsk; -free_ti: - free_thread_info(ti); +free_stack: + free_thread_stack(stack); free_tsk: free_task_struct(tsk); return NULL; diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index e25e92fb44fa..6a5c239c7669 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -18,7 +18,7 @@ #include <linux/vmalloc.h> #include "gcov.h" -#if __GNUC__ == 5 && __GNUC_MINOR__ >= 1 +#if (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1) #define GCOV_COUNTERS 10 #elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9 #define GCOV_COUNTERS 9 diff --git a/kernel/jump_label.c b/kernel/jump_label.c index ac4ab953b49c..0dbea887d625 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -58,13 +58,36 @@ static void jump_label_update(struct static_key *key); void static_key_slow_inc(struct static_key *key) { + int v, v1; + STATIC_KEY_CHECK_USE(); - if (atomic_inc_not_zero(&key->enabled)) - return; + + /* + * Careful if we get concurrent static_key_slow_inc() calls; + * later calls must wait for the first one to _finish_ the + * jump_label_update() process. At the same time, however, + * the jump_label_update() call below wants to see + * static_key_enabled(&key) for jumps to be updated properly. + * + * So give a special meaning to negative key->enabled: it sends + * static_key_slow_inc() down the slow path, and it is non-zero + * so it counts as "enabled" in jump_label_update(). Note that + * atomic_inc_unless_negative() checks >= 0, so roll our own. + */ + for (v = atomic_read(&key->enabled); v > 0; v = v1) { + v1 = atomic_cmpxchg(&key->enabled, v, v + 1); + if (likely(v1 == v)) + return; + } jump_label_lock(); - if (atomic_inc_return(&key->enabled) == 1) + if (atomic_read(&key->enabled) == 0) { + atomic_set(&key->enabled, -1); jump_label_update(key); + atomic_set(&key->enabled, 1); + } else { + atomic_inc(&key->enabled); + } jump_label_unlock(); } EXPORT_SYMBOL_GPL(static_key_slow_inc); @@ -72,6 +95,13 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc); static void __static_key_slow_dec(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { + /* + * The negative count check is valid even when a negative + * key->enabled is in use by static_key_slow_inc(); a + * __static_key_slow_dec() before the first static_key_slow_inc() + * returns is unbalanced, because all other static_key_slow_inc() + * instances block while the update is in progress. + */ if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { WARN(atomic_read(&key->enabled) < 0, "jump label: negative count!\n"); diff --git a/kernel/kcov.c b/kernel/kcov.c index a02f2dddd1d7..8d44b3fea9d0 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -264,7 +264,12 @@ static const struct file_operations kcov_fops = { static int __init kcov_init(void) { - if (!debugfs_create_file("kcov", 0600, NULL, NULL, &kcov_fops)) { + /* + * The kcov debugfs file won't ever get removed and thus, + * there is no need to protect it against removal races. The + * use of debugfs_create_file_unsafe() is actually safe here. + */ + if (!debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops)) { pr_err("failed to create kcov in debugfs\n"); return -ENOMEM; } diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 3ef3736002d8..9c951fade415 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -49,21 +49,21 @@ void debug_mutex_free_waiter(struct mutex_waiter *waiter) } void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti) + struct task_struct *task) { SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); /* Mark the current thread as blocked on the lock: */ - ti->task->blocked_on = waiter; + task->blocked_on = waiter; } void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti) + struct task_struct *task) { DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); - DEBUG_LOCKS_WARN_ON(waiter->task != ti->task); - DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter); - ti->task->blocked_on = NULL; + DEBUG_LOCKS_WARN_ON(waiter->task != task); + DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter); + task->blocked_on = NULL; list_del_init(&waiter->list); waiter->task = NULL; diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index 372e6530180d..57a871ae3c81 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h @@ -20,9 +20,9 @@ extern void debug_mutex_wake_waiter(struct mutex *lock, extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); extern void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti); + struct task_struct *task); extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti); + struct task_struct *task); extern void debug_mutex_unlock(struct mutex *lock); extern void debug_mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 79d2d765a75f..a70b90db3909 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -537,7 +537,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto skip_wait; debug_mutex_lock_common(lock, &waiter); - debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); + debug_mutex_add_waiter(lock, &waiter, task); /* add waiting tasks to the end of the waitqueue (FIFO): */ list_add_tail(&waiter.list, &lock->wait_list); @@ -584,7 +584,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } __set_task_state(task, TASK_RUNNING); - mutex_remove_waiter(lock, &waiter, current_thread_info()); + mutex_remove_waiter(lock, &waiter, task); /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); @@ -605,7 +605,7 @@ skip_wait: return 0; err: - mutex_remove_waiter(lock, &waiter, task_thread_info(task)); + mutex_remove_waiter(lock, &waiter, task); spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, 1, ip); diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 12f96199441c..6cd6b8e9efd7 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -13,7 +13,7 @@ do { spin_lock(lock); (void)(flags); } while (0) #define spin_unlock_mutex(lock, flags) \ do { spin_unlock(lock); (void)(flags); } while (0) -#define mutex_remove_waiter(lock, waiter, ti) \ +#define mutex_remove_waiter(lock, waiter, task) \ __list_del((waiter)->list.prev, (waiter)->list.next) #ifdef CONFIG_MUTEX_SPIN_ON_OWNER diff --git a/kernel/power/process.c b/kernel/power/process.c index df058bed53ce..0c2ee9761d57 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -146,6 +146,18 @@ int freeze_processes(void) if (!error && !oom_killer_disable()) error = -EBUSY; + /* + * There is a hard to fix race between oom_reaper kernel thread + * and oom_killer_disable. oom_reaper calls exit_oom_victim + * before the victim reaches exit_mm so try to freeze all the tasks + * again and catch such a left over task. + */ + if (!error) { + pr_info("Double checking all user space processes after OOM killer disable... "); + error = try_to_freeze_tasks(true); + pr_cont("\n"); + } + if (error) thaw_processes(); return error; diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 3cee0d8393ed..d38ab08a3fe7 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -58,7 +58,7 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); #define VERBOSE_PERFOUT_ERRSTRING(s) \ do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) -torture_param(bool, gp_exp, true, "Use expedited GP wait primitives"); +torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); @@ -96,12 +96,7 @@ static int rcu_perf_writer_state; #define MAX_MEAS 10000 #define MIN_MEAS 100 -#if defined(MODULE) || defined(CONFIG_RCU_PERF_TEST_RUNNABLE) -#define RCUPERF_RUNNABLE_INIT 1 -#else -#define RCUPERF_RUNNABLE_INIT 0 -#endif -static int perf_runnable = RCUPERF_RUNNABLE_INIT; +static int perf_runnable = IS_ENABLED(MODULE); module_param(perf_runnable, int, 0444); MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot"); @@ -363,8 +358,6 @@ rcu_perf_writer(void *arg) u64 *wdpp = writer_durations[me]; VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); - WARN_ON(rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp); - WARN_ON(rcu_gp_is_normal() && gp_exp); WARN_ON(!wdpp); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); sp.sched_priority = 1; @@ -631,12 +624,24 @@ rcu_perf_init(void) firsterr = -ENOMEM; goto unwind; } + if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) { + VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); + firsterr = -EINVAL; + goto unwind; + } + if (rcu_gp_is_normal() && gp_exp) { + VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); + firsterr = -EINVAL; + goto unwind; + } for (i = 0; i < nrealwriters; i++) { writer_durations[i] = kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), GFP_KERNEL); - if (!writer_durations[i]) + if (!writer_durations[i]) { + firsterr = -ENOMEM; goto unwind; + } firsterr = torture_create_kthread(rcu_perf_writer, (void *)i, writer_tasks[i]); if (firsterr) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 084a28a732eb..971e2b138063 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -182,12 +182,7 @@ static const char *rcu_torture_writer_state_getname(void) return rcu_torture_writer_state_names[i]; } -#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) -#define RCUTORTURE_RUNNABLE_INIT 1 -#else -#define RCUTORTURE_RUNNABLE_INIT 0 -#endif -static int torture_runnable = RCUTORTURE_RUNNABLE_INIT; +static int torture_runnable = IS_ENABLED(MODULE); module_param(torture_runnable, int, 0444); MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot"); @@ -1476,7 +1471,7 @@ static int rcu_torture_barrier_cbs(void *arg) break; /* * The above smp_load_acquire() ensures barrier_phase load - * is ordered before the folloiwng ->call(). + * is ordered before the following ->call(). */ local_irq_disable(); /* Just to test no-irq call_rcu(). */ cur_ops->call(&rcu, rcu_torture_barrier_cbf); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b7326893221f..f433959e9322 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -125,12 +125,14 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; /* Number of rcu_nodes at specified level. */ static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ +/* panic() on RCU Stall sysctl. */ +int sysctl_panic_on_rcu_stall __read_mostly; /* * The rcu_scheduler_active variable transitions from zero to one just * before the first task is spawned. So when this variable is zero, RCU * can assume that there is but one task, allowing RCU to (for example) - * optimize synchronize_sched() to a simple barrier(). When this variable + * optimize synchronize_rcu() to a simple barrier(). When this variable * is one, RCU must actually do all the hard work required to detect real * grace periods. This variable is also used to suppress boot-time false * positives from lockdep-RCU error checking. @@ -159,6 +161,7 @@ static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, bool wake); +static void sync_sched_exp_online_cleanup(int cpu); /* rcuc/rcub kthread realtime priority */ #ifdef CONFIG_RCU_KTHREAD_PRIO @@ -1284,9 +1287,9 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask != 0) { - for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) - if (rnp->qsmask & (1UL << cpu)) - dump_cpu_task(rnp->grplo + cpu); + for_each_leaf_node_possible_cpu(rnp, cpu) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) + dump_cpu_task(cpu); } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -1311,6 +1314,12 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp) } } +static inline void panic_on_rcu_stall(void) +{ + if (sysctl_panic_on_rcu_stall) + panic("RCU Stall\n"); +} + static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) { int cpu; @@ -1351,10 +1360,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) raw_spin_lock_irqsave_rcu_node(rnp, flags); ndetected += rcu_print_task_stall(rnp); if (rnp->qsmask != 0) { - for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) - if (rnp->qsmask & (1UL << cpu)) { - print_cpu_stall_info(rsp, - rnp->grplo + cpu); + for_each_leaf_node_possible_cpu(rnp, cpu) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { + print_cpu_stall_info(rsp, cpu); ndetected++; } } @@ -1390,6 +1398,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) rcu_check_gp_kthread_starvation(rsp); + panic_on_rcu_stall(); + force_quiescent_state(rsp); /* Kick them all. */ } @@ -1430,6 +1440,8 @@ static void print_cpu_stall(struct rcu_state *rsp) jiffies + 3 * rcu_jiffies_till_stall_check() + 3); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + panic_on_rcu_stall(); + /* * Attempt to revive the RCU machinery by forcing a context switch. * @@ -1989,8 +2001,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) * of the tree within the rsp->node[] array. Note that other CPUs * will access only the leaves of the hierarchy, thus seeing that no * grace period is in progress, at least until the corresponding - * leaf node has been initialized. In addition, we have excluded - * CPU-hotplug operations. + * leaf node has been initialized. * * The grace period cannot complete until the initialization * process finishes, because this kthread handles both. @@ -2872,7 +2883,6 @@ static void force_qs_rnp(struct rcu_state *rsp, unsigned long *maxj), bool *isidle, unsigned long *maxj) { - unsigned long bit; int cpu; unsigned long flags; unsigned long mask; @@ -2907,9 +2917,8 @@ static void force_qs_rnp(struct rcu_state *rsp, continue; } } - cpu = rnp->grplo; - bit = 1; - for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { + for_each_leaf_node_possible_cpu(rnp, cpu) { + unsigned long bit = leaf_node_cpu_bit(rnp, cpu); if ((rnp->qsmask & bit) != 0) { if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) mask |= bit; @@ -3448,548 +3457,6 @@ static bool rcu_seq_done(unsigned long *sp, unsigned long s) return ULONG_CMP_GE(READ_ONCE(*sp), s); } -/* Wrapper functions for expedited grace periods. */ -static void rcu_exp_gp_seq_start(struct rcu_state *rsp) -{ - rcu_seq_start(&rsp->expedited_sequence); -} -static void rcu_exp_gp_seq_end(struct rcu_state *rsp) -{ - rcu_seq_end(&rsp->expedited_sequence); - smp_mb(); /* Ensure that consecutive grace periods serialize. */ -} -static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) -{ - unsigned long s; - - smp_mb(); /* Caller's modifications seen first by other CPUs. */ - s = rcu_seq_snap(&rsp->expedited_sequence); - trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); - return s; -} -static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) -{ - return rcu_seq_done(&rsp->expedited_sequence, s); -} - -/* - * Reset the ->expmaskinit values in the rcu_node tree to reflect any - * recent CPU-online activity. Note that these masks are not cleared - * when CPUs go offline, so they reflect the union of all CPUs that have - * ever been online. This means that this function normally takes its - * no-work-to-do fastpath. - */ -static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) -{ - bool done; - unsigned long flags; - unsigned long mask; - unsigned long oldmask; - int ncpus = READ_ONCE(rsp->ncpus); - struct rcu_node *rnp; - struct rcu_node *rnp_up; - - /* If no new CPUs onlined since last time, nothing to do. */ - if (likely(ncpus == rsp->ncpus_snap)) - return; - rsp->ncpus_snap = ncpus; - - /* - * Each pass through the following loop propagates newly onlined - * CPUs for the current rcu_node structure up the rcu_node tree. - */ - rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (rnp->expmaskinit == rnp->expmaskinitnext) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - continue; /* No new CPUs, nothing to do. */ - } - - /* Update this node's mask, track old value for propagation. */ - oldmask = rnp->expmaskinit; - rnp->expmaskinit = rnp->expmaskinitnext; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - - /* If was already nonzero, nothing to propagate. */ - if (oldmask) - continue; - - /* Propagate the new CPU up the tree. */ - mask = rnp->grpmask; - rnp_up = rnp->parent; - done = false; - while (rnp_up) { - raw_spin_lock_irqsave_rcu_node(rnp_up, flags); - if (rnp_up->expmaskinit) - done = true; - rnp_up->expmaskinit |= mask; - raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags); - if (done) - break; - mask = rnp_up->grpmask; - rnp_up = rnp_up->parent; - } - } -} - -/* - * Reset the ->expmask values in the rcu_node tree in preparation for - * a new expedited grace period. - */ -static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) -{ - unsigned long flags; - struct rcu_node *rnp; - - sync_exp_reset_tree_hotplug(rsp); - rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - WARN_ON_ONCE(rnp->expmask); - rnp->expmask = rnp->expmaskinit; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } -} - -/* - * Return non-zero if there is no RCU expedited grace period in progress - * for the specified rcu_node structure, in other words, if all CPUs and - * tasks covered by the specified rcu_node structure have done their bit - * for the current expedited grace period. Works only for preemptible - * RCU -- other RCU implementation use other means. - * - * Caller must hold the rcu_state's exp_mutex. - */ -static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) -{ - return rnp->exp_tasks == NULL && - READ_ONCE(rnp->expmask) == 0; -} - -/* - * Report the exit from RCU read-side critical section for the last task - * that queued itself during or before the current expedited preemptible-RCU - * grace period. This event is reported either to the rcu_node structure on - * which the task was queued or to one of that rcu_node structure's ancestors, - * recursively up the tree. (Calm down, calm down, we do the recursion - * iteratively!) - * - * Caller must hold the rcu_state's exp_mutex and the specified rcu_node - * structure's ->lock. - */ -static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake, unsigned long flags) - __releases(rnp->lock) -{ - unsigned long mask; - - for (;;) { - if (!sync_rcu_preempt_exp_done(rnp)) { - if (!rnp->expmask) - rcu_initiate_boost(rnp, flags); - else - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - break; - } - if (rnp->parent == NULL) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (wake) { - smp_mb(); /* EGP done before wake_up(). */ - swake_up(&rsp->expedited_wq); - } - break; - } - mask = rnp->grpmask; - raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */ - rnp = rnp->parent; - raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ - WARN_ON_ONCE(!(rnp->expmask & mask)); - rnp->expmask &= ~mask; - } -} - -/* - * Report expedited quiescent state for specified node. This is a - * lock-acquisition wrapper function for __rcu_report_exp_rnp(). - * - * Caller must hold the rcu_state's exp_mutex. - */ -static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, - struct rcu_node *rnp, bool wake) -{ - unsigned long flags; - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - __rcu_report_exp_rnp(rsp, rnp, wake, flags); -} - -/* - * Report expedited quiescent state for multiple CPUs, all covered by the - * specified leaf rcu_node structure. Caller must hold the rcu_state's - * exp_mutex. - */ -static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, - unsigned long mask, bool wake) -{ - unsigned long flags; - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (!(rnp->expmask & mask)) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - return; - } - rnp->expmask &= ~mask; - __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ -} - -/* - * Report expedited quiescent state for specified rcu_data (CPU). - */ -static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, - bool wake) -{ - rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); -} - -/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ -static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, - unsigned long s) -{ - if (rcu_exp_gp_seq_done(rsp, s)) { - trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); - /* Ensure test happens before caller kfree(). */ - smp_mb__before_atomic(); /* ^^^ */ - atomic_long_inc(stat); - return true; - } - return false; -} - -/* - * Funnel-lock acquisition for expedited grace periods. Returns true - * if some other task completed an expedited grace period that this task - * can piggy-back on, and with no mutex held. Otherwise, returns false - * with the mutex held, indicating that the caller must actually do the - * expedited grace period. - */ -static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) -{ - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); - struct rcu_node *rnp = rdp->mynode; - struct rcu_node *rnp_root = rcu_get_root(rsp); - - /* Low-contention fastpath. */ - if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && - (rnp == rnp_root || - ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && - mutex_trylock(&rsp->exp_mutex)) - goto fastpath; - - /* - * Each pass through the following loop works its way up - * the rcu_node tree, returning if others have done the work or - * otherwise falls through to acquire rsp->exp_mutex. The mapping - * from CPU to rcu_node structure can be inexact, as it is just - * promoting locality and is not strictly needed for correctness. - */ - for (; rnp != NULL; rnp = rnp->parent) { - if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) - return true; - - /* Work not done, either wait here or go up. */ - spin_lock(&rnp->exp_lock); - if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) { - - /* Someone else doing GP, so wait for them. */ - spin_unlock(&rnp->exp_lock); - trace_rcu_exp_funnel_lock(rsp->name, rnp->level, - rnp->grplo, rnp->grphi, - TPS("wait")); - wait_event(rnp->exp_wq[(s >> 1) & 0x3], - sync_exp_work_done(rsp, - &rdp->exp_workdone2, s)); - return true; - } - rnp->exp_seq_rq = s; /* Followers can wait on us. */ - spin_unlock(&rnp->exp_lock); - trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, - rnp->grphi, TPS("nxtlvl")); - } - mutex_lock(&rsp->exp_mutex); -fastpath: - if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { - mutex_unlock(&rsp->exp_mutex); - return true; - } - rcu_exp_gp_seq_start(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); - return false; -} - -/* Invoked on each online non-idle CPU for expedited quiescent state. */ -static void sync_sched_exp_handler(void *data) -{ - struct rcu_data *rdp; - struct rcu_node *rnp; - struct rcu_state *rsp = data; - - rdp = this_cpu_ptr(rsp->rda); - rnp = rdp->mynode; - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || - __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) - return; - if (rcu_is_cpu_rrupt_from_idle()) { - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(&rcu_sched_data), true); - return; - } - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); - resched_cpu(smp_processor_id()); -} - -/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ -static void sync_sched_exp_online_cleanup(int cpu) -{ - struct rcu_data *rdp; - int ret; - struct rcu_node *rnp; - struct rcu_state *rsp = &rcu_sched_state; - - rdp = per_cpu_ptr(rsp->rda, cpu); - rnp = rdp->mynode; - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) - return; - ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0); - WARN_ON_ONCE(ret); -} - -/* - * Select the nodes that the upcoming expedited grace period needs - * to wait for. - */ -static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, - smp_call_func_t func) -{ - int cpu; - unsigned long flags; - unsigned long mask; - unsigned long mask_ofl_test; - unsigned long mask_ofl_ipi; - int ret; - struct rcu_node *rnp; - - sync_exp_reset_tree(rsp); - rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - - /* Each pass checks a CPU for identity, offline, and idle. */ - mask_ofl_test = 0; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - if (raw_smp_processor_id() == cpu || - !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) - mask_ofl_test |= rdp->grpmask; - } - mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; - - /* - * Need to wait for any blocked tasks as well. Note that - * additional blocking tasks will also block the expedited - * GP until such time as the ->expmask bits are cleared. - */ - if (rcu_preempt_has_tasks(rnp)) - rnp->exp_tasks = rnp->blkd_tasks.next; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - - /* IPI the remaining CPUs for expedited quiescent state. */ - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { - if (!(mask_ofl_ipi & mask)) - continue; -retry_ipi: - ret = smp_call_function_single(cpu, func, rsp, 0); - if (!ret) { - mask_ofl_ipi &= ~mask; - continue; - } - /* Failed, raced with offline. */ - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (cpu_online(cpu) && - (rnp->expmask & mask)) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - schedule_timeout_uninterruptible(1); - if (cpu_online(cpu) && - (rnp->expmask & mask)) - goto retry_ipi; - raw_spin_lock_irqsave_rcu_node(rnp, flags); - } - if (!(rnp->expmask & mask)) - mask_ofl_ipi &= ~mask; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } - /* Report quiescent states for those that went offline. */ - mask_ofl_test |= mask_ofl_ipi; - if (mask_ofl_test) - rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); - } -} - -static void synchronize_sched_expedited_wait(struct rcu_state *rsp) -{ - int cpu; - unsigned long jiffies_stall; - unsigned long jiffies_start; - unsigned long mask; - int ndetected; - struct rcu_node *rnp; - struct rcu_node *rnp_root = rcu_get_root(rsp); - int ret; - - jiffies_stall = rcu_jiffies_till_stall_check(); - jiffies_start = jiffies; - - for (;;) { - ret = swait_event_timeout( - rsp->expedited_wq, - sync_rcu_preempt_exp_done(rnp_root), - jiffies_stall); - if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) - return; - if (ret < 0) { - /* Hit a signal, disable CPU stall warnings. */ - swait_event(rsp->expedited_wq, - sync_rcu_preempt_exp_done(rnp_root)); - return; - } - pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", - rsp->name); - ndetected = 0; - rcu_for_each_leaf_node(rsp, rnp) { - ndetected += rcu_print_task_exp_stall(rnp); - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { - struct rcu_data *rdp; - - if (!(rnp->expmask & mask)) - continue; - ndetected++; - rdp = per_cpu_ptr(rsp->rda, cpu); - pr_cont(" %d-%c%c%c", cpu, - "O."[!!cpu_online(cpu)], - "o."[!!(rdp->grpmask & rnp->expmaskinit)], - "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); - } - mask <<= 1; - } - pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", - jiffies - jiffies_start, rsp->expedited_sequence, - rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); - if (ndetected) { - pr_err("blocking rcu_node structures:"); - rcu_for_each_node_breadth_first(rsp, rnp) { - if (rnp == rnp_root) - continue; /* printed unconditionally */ - if (sync_rcu_preempt_exp_done(rnp)) - continue; - pr_cont(" l=%u:%d-%d:%#lx/%c", - rnp->level, rnp->grplo, rnp->grphi, - rnp->expmask, - ".T"[!!rnp->exp_tasks]); - } - pr_cont("\n"); - } - rcu_for_each_leaf_node(rsp, rnp) { - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { - if (!(rnp->expmask & mask)) - continue; - dump_cpu_task(cpu); - } - } - jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; - } -} - -/* - * Wait for the current expedited grace period to complete, and then - * wake up everyone who piggybacked on the just-completed expedited - * grace period. Also update all the ->exp_seq_rq counters as needed - * in order to avoid counter-wrap problems. - */ -static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) -{ - struct rcu_node *rnp; - - synchronize_sched_expedited_wait(rsp); - rcu_exp_gp_seq_end(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); - - /* - * Switch over to wakeup mode, allowing the next GP, but -only- the - * next GP, to proceed. - */ - mutex_lock(&rsp->exp_wake_mutex); - mutex_unlock(&rsp->exp_mutex); - - rcu_for_each_node_breadth_first(rsp, rnp) { - if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { - spin_lock(&rnp->exp_lock); - /* Recheck, avoid hang in case someone just arrived. */ - if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) - rnp->exp_seq_rq = s; - spin_unlock(&rnp->exp_lock); - } - wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); - } - trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); - mutex_unlock(&rsp->exp_wake_mutex); -} - -/** - * synchronize_sched_expedited - Brute-force RCU-sched grace period - * - * Wait for an RCU-sched grace period to elapse, but use a "big hammer" - * approach to force the grace period to end quickly. This consumes - * significant time on all CPUs and is unfriendly to real-time workloads, - * so is thus not recommended for any sort of common-case code. In fact, - * if you are using synchronize_sched_expedited() in a loop, please - * restructure your code to batch your updates, and then use a single - * synchronize_sched() instead. - * - * This implementation can be thought of as an application of sequence - * locking to expedited grace periods, but using the sequence counter to - * determine when someone else has already done the work instead of for - * retrying readers. - */ -void synchronize_sched_expedited(void) -{ - unsigned long s; - struct rcu_state *rsp = &rcu_sched_state; - - /* If only one CPU, this is automatically a grace period. */ - if (rcu_blocking_is_gp()) - return; - - /* If expedited grace periods are prohibited, fall back to normal. */ - if (rcu_gp_is_normal()) { - wait_rcu_gp(call_rcu_sched); - return; - } - - /* Take a snapshot of the sequence number. */ - s = rcu_exp_gp_seq_snap(rsp); - if (exp_funnel_lock(rsp, s)) - return; /* Someone else did our work for us. */ - - /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); - - /* Wait and clean up, including waking everyone. */ - rcu_exp_wait_wake(rsp, s); -} -EXPORT_SYMBOL_GPL(synchronize_sched_expedited); - /* * Check to see if there is any immediate RCU-related work to be done * by the current CPU, for the specified type of RCU, returning 1 if so. @@ -4280,7 +3747,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave_rcu_node(rnp, flags); - rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); + rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); @@ -4363,9 +3830,6 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return; - /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ @@ -4750,4 +4214,5 @@ void __init rcu_init(void) rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); } +#include "tree_exp.h" #include "tree_plugin.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e3959f5e6ddf..f714f873bf9d 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -254,6 +254,13 @@ struct rcu_node { } ____cacheline_internodealigned_in_smp; /* + * Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and + * are indexed relative to this interval rather than the global CPU ID space. + * This generates the bit for a CPU in node-local masks. + */ +#define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo)) + +/* * Do a full breadth-first scan of the rcu_node structures for the * specified rcu_state structure. */ @@ -281,6 +288,14 @@ struct rcu_node { (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) /* + * Iterate over all possible CPUs in a leaf RCU node. + */ +#define for_each_leaf_node_possible_cpu(rnp, cpu) \ + for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ + cpu <= rnp->grphi; \ + cpu = cpumask_next((cpu), cpu_possible_mask)) + +/* * Union to allow "aggregate OR" operation on the need for a quiescent * state by the normal and expedited grace periods. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h new file mode 100644 index 000000000000..6d86ab6ec2c9 --- /dev/null +++ b/kernel/rcu/tree_exp.h @@ -0,0 +1,655 @@ +/* + * RCU expedited grace periods + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright IBM Corporation, 2016 + * + * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + */ + +/* Wrapper functions for expedited grace periods. */ +static void rcu_exp_gp_seq_start(struct rcu_state *rsp) +{ + rcu_seq_start(&rsp->expedited_sequence); +} +static void rcu_exp_gp_seq_end(struct rcu_state *rsp) +{ + rcu_seq_end(&rsp->expedited_sequence); + smp_mb(); /* Ensure that consecutive grace periods serialize. */ +} +static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) +{ + unsigned long s; + + smp_mb(); /* Caller's modifications seen first by other CPUs. */ + s = rcu_seq_snap(&rsp->expedited_sequence); + trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); + return s; +} +static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) +{ + return rcu_seq_done(&rsp->expedited_sequence, s); +} + +/* + * Reset the ->expmaskinit values in the rcu_node tree to reflect any + * recent CPU-online activity. Note that these masks are not cleared + * when CPUs go offline, so they reflect the union of all CPUs that have + * ever been online. This means that this function normally takes its + * no-work-to-do fastpath. + */ +static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) +{ + bool done; + unsigned long flags; + unsigned long mask; + unsigned long oldmask; + int ncpus = READ_ONCE(rsp->ncpus); + struct rcu_node *rnp; + struct rcu_node *rnp_up; + + /* If no new CPUs onlined since last time, nothing to do. */ + if (likely(ncpus == rsp->ncpus_snap)) + return; + rsp->ncpus_snap = ncpus; + + /* + * Each pass through the following loop propagates newly onlined + * CPUs for the current rcu_node structure up the rcu_node tree. + */ + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (rnp->expmaskinit == rnp->expmaskinitnext) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + continue; /* No new CPUs, nothing to do. */ + } + + /* Update this node's mask, track old value for propagation. */ + oldmask = rnp->expmaskinit; + rnp->expmaskinit = rnp->expmaskinitnext; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + /* If was already nonzero, nothing to propagate. */ + if (oldmask) + continue; + + /* Propagate the new CPU up the tree. */ + mask = rnp->grpmask; + rnp_up = rnp->parent; + done = false; + while (rnp_up) { + raw_spin_lock_irqsave_rcu_node(rnp_up, flags); + if (rnp_up->expmaskinit) + done = true; + rnp_up->expmaskinit |= mask; + raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags); + if (done) + break; + mask = rnp_up->grpmask; + rnp_up = rnp_up->parent; + } + } +} + +/* + * Reset the ->expmask values in the rcu_node tree in preparation for + * a new expedited grace period. + */ +static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_node *rnp; + + sync_exp_reset_tree_hotplug(rsp); + rcu_for_each_node_breadth_first(rsp, rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + WARN_ON_ONCE(rnp->expmask); + rnp->expmask = rnp->expmaskinit; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } +} + +/* + * Return non-zero if there is no RCU expedited grace period in progress + * for the specified rcu_node structure, in other words, if all CPUs and + * tasks covered by the specified rcu_node structure have done their bit + * for the current expedited grace period. Works only for preemptible + * RCU -- other RCU implementation use other means. + * + * Caller must hold the rcu_state's exp_mutex. + */ +static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) +{ + return rnp->exp_tasks == NULL && + READ_ONCE(rnp->expmask) == 0; +} + +/* + * Report the exit from RCU read-side critical section for the last task + * that queued itself during or before the current expedited preemptible-RCU + * grace period. This event is reported either to the rcu_node structure on + * which the task was queued or to one of that rcu_node structure's ancestors, + * recursively up the tree. (Calm down, calm down, we do the recursion + * iteratively!) + * + * Caller must hold the rcu_state's exp_mutex and the specified rcu_node + * structure's ->lock. + */ +static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake, unsigned long flags) + __releases(rnp->lock) +{ + unsigned long mask; + + for (;;) { + if (!sync_rcu_preempt_exp_done(rnp)) { + if (!rnp->expmask) + rcu_initiate_boost(rnp, flags); + else + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + break; + } + if (rnp->parent == NULL) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + if (wake) { + smp_mb(); /* EGP done before wake_up(). */ + swake_up(&rsp->expedited_wq); + } + break; + } + mask = rnp->grpmask; + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */ + rnp = rnp->parent; + raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ + WARN_ON_ONCE(!(rnp->expmask & mask)); + rnp->expmask &= ~mask; + } +} + +/* + * Report expedited quiescent state for specified node. This is a + * lock-acquisition wrapper function for __rcu_report_exp_rnp(). + * + * Caller must hold the rcu_state's exp_mutex. + */ +static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, + struct rcu_node *rnp, bool wake) +{ + unsigned long flags; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + __rcu_report_exp_rnp(rsp, rnp, wake, flags); +} + +/* + * Report expedited quiescent state for multiple CPUs, all covered by the + * specified leaf rcu_node structure. Caller must hold the rcu_state's + * exp_mutex. + */ +static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, + unsigned long mask, bool wake) +{ + unsigned long flags; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (!(rnp->expmask & mask)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + rnp->expmask &= ~mask; + __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ +} + +/* + * Report expedited quiescent state for specified rcu_data (CPU). + */ +static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, + bool wake) +{ + rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); +} + +/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ +static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, + unsigned long s) +{ + if (rcu_exp_gp_seq_done(rsp, s)) { + trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); + /* Ensure test happens before caller kfree(). */ + smp_mb__before_atomic(); /* ^^^ */ + atomic_long_inc(stat); + return true; + } + return false; +} + +/* + * Funnel-lock acquisition for expedited grace periods. Returns true + * if some other task completed an expedited grace period that this task + * can piggy-back on, and with no mutex held. Otherwise, returns false + * with the mutex held, indicating that the caller must actually do the + * expedited grace period. + */ +static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) +{ + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); + struct rcu_node *rnp = rdp->mynode; + struct rcu_node *rnp_root = rcu_get_root(rsp); + + /* Low-contention fastpath. */ + if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && + (rnp == rnp_root || + ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && + mutex_trylock(&rsp->exp_mutex)) + goto fastpath; + + /* + * Each pass through the following loop works its way up + * the rcu_node tree, returning if others have done the work or + * otherwise falls through to acquire rsp->exp_mutex. The mapping + * from CPU to rcu_node structure can be inexact, as it is just + * promoting locality and is not strictly needed for correctness. + */ + for (; rnp != NULL; rnp = rnp->parent) { + if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) + return true; + + /* Work not done, either wait here or go up. */ + spin_lock(&rnp->exp_lock); + if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) { + + /* Someone else doing GP, so wait for them. */ + spin_unlock(&rnp->exp_lock); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, + rnp->grplo, rnp->grphi, + TPS("wait")); + wait_event(rnp->exp_wq[(s >> 1) & 0x3], + sync_exp_work_done(rsp, + &rdp->exp_workdone2, s)); + return true; + } + rnp->exp_seq_rq = s; /* Followers can wait on us. */ + spin_unlock(&rnp->exp_lock); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, + rnp->grphi, TPS("nxtlvl")); + } + mutex_lock(&rsp->exp_mutex); +fastpath: + if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { + mutex_unlock(&rsp->exp_mutex); + return true; + } + rcu_exp_gp_seq_start(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); + return false; +} + +/* Invoked on each online non-idle CPU for expedited quiescent state. */ +static void sync_sched_exp_handler(void *data) +{ + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp = data; + + rdp = this_cpu_ptr(rsp->rda); + rnp = rdp->mynode; + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || + __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) + return; + if (rcu_is_cpu_rrupt_from_idle()) { + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), true); + return; + } + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); + resched_cpu(smp_processor_id()); +} + +/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ +static void sync_sched_exp_online_cleanup(int cpu) +{ + struct rcu_data *rdp; + int ret; + struct rcu_node *rnp; + struct rcu_state *rsp = &rcu_sched_state; + + rdp = per_cpu_ptr(rsp->rda, cpu); + rnp = rdp->mynode; + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) + return; + ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0); + WARN_ON_ONCE(ret); +} + +/* + * Select the nodes that the upcoming expedited grace period needs + * to wait for. + */ +static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, + smp_call_func_t func) +{ + int cpu; + unsigned long flags; + unsigned long mask_ofl_test; + unsigned long mask_ofl_ipi; + int ret; + struct rcu_node *rnp; + + sync_exp_reset_tree(rsp); + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + + /* Each pass checks a CPU for identity, offline, and idle. */ + mask_ofl_test = 0; + for_each_leaf_node_possible_cpu(rnp, cpu) { + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + if (raw_smp_processor_id() == cpu || + !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) + mask_ofl_test |= rdp->grpmask; + } + mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; + + /* + * Need to wait for any blocked tasks as well. Note that + * additional blocking tasks will also block the expedited + * GP until such time as the ->expmask bits are cleared. + */ + if (rcu_preempt_has_tasks(rnp)) + rnp->exp_tasks = rnp->blkd_tasks.next; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + /* IPI the remaining CPUs for expedited quiescent state. */ + for_each_leaf_node_possible_cpu(rnp, cpu) { + unsigned long mask = leaf_node_cpu_bit(rnp, cpu); + if (!(mask_ofl_ipi & mask)) + continue; +retry_ipi: + ret = smp_call_function_single(cpu, func, rsp, 0); + if (!ret) { + mask_ofl_ipi &= ~mask; + continue; + } + /* Failed, raced with offline. */ + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (cpu_online(cpu) && + (rnp->expmask & mask)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + schedule_timeout_uninterruptible(1); + if (cpu_online(cpu) && + (rnp->expmask & mask)) + goto retry_ipi; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + } + if (!(rnp->expmask & mask)) + mask_ofl_ipi &= ~mask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + /* Report quiescent states for those that went offline. */ + mask_ofl_test |= mask_ofl_ipi; + if (mask_ofl_test) + rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); + } +} + +static void synchronize_sched_expedited_wait(struct rcu_state *rsp) +{ + int cpu; + unsigned long jiffies_stall; + unsigned long jiffies_start; + unsigned long mask; + int ndetected; + struct rcu_node *rnp; + struct rcu_node *rnp_root = rcu_get_root(rsp); + int ret; + + jiffies_stall = rcu_jiffies_till_stall_check(); + jiffies_start = jiffies; + + for (;;) { + ret = swait_event_timeout( + rsp->expedited_wq, + sync_rcu_preempt_exp_done(rnp_root), + jiffies_stall); + if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) + return; + if (ret < 0) { + /* Hit a signal, disable CPU stall warnings. */ + swait_event(rsp->expedited_wq, + sync_rcu_preempt_exp_done(rnp_root)); + return; + } + pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", + rsp->name); + ndetected = 0; + rcu_for_each_leaf_node(rsp, rnp) { + ndetected += rcu_print_task_exp_stall(rnp); + for_each_leaf_node_possible_cpu(rnp, cpu) { + struct rcu_data *rdp; + + mask = leaf_node_cpu_bit(rnp, cpu); + if (!(rnp->expmask & mask)) + continue; + ndetected++; + rdp = per_cpu_ptr(rsp->rda, cpu); + pr_cont(" %d-%c%c%c", cpu, + "O."[!!cpu_online(cpu)], + "o."[!!(rdp->grpmask & rnp->expmaskinit)], + "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); + } + } + pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", + jiffies - jiffies_start, rsp->expedited_sequence, + rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); + if (ndetected) { + pr_err("blocking rcu_node structures:"); + rcu_for_each_node_breadth_first(rsp, rnp) { + if (rnp == rnp_root) + continue; /* printed unconditionally */ + if (sync_rcu_preempt_exp_done(rnp)) + continue; + pr_cont(" l=%u:%d-%d:%#lx/%c", + rnp->level, rnp->grplo, rnp->grphi, + rnp->expmask, + ".T"[!!rnp->exp_tasks]); + } + pr_cont("\n"); + } + rcu_for_each_leaf_node(rsp, rnp) { + for_each_leaf_node_possible_cpu(rnp, cpu) { + mask = leaf_node_cpu_bit(rnp, cpu); + if (!(rnp->expmask & mask)) + continue; + dump_cpu_task(cpu); + } + } + jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; + } +} + +/* + * Wait for the current expedited grace period to complete, and then + * wake up everyone who piggybacked on the just-completed expedited + * grace period. Also update all the ->exp_seq_rq counters as needed + * in order to avoid counter-wrap problems. + */ +static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) +{ + struct rcu_node *rnp; + + synchronize_sched_expedited_wait(rsp); + rcu_exp_gp_seq_end(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); + + /* + * Switch over to wakeup mode, allowing the next GP, but -only- the + * next GP, to proceed. + */ + mutex_lock(&rsp->exp_wake_mutex); + mutex_unlock(&rsp->exp_mutex); + + rcu_for_each_node_breadth_first(rsp, rnp) { + if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { + spin_lock(&rnp->exp_lock); + /* Recheck, avoid hang in case someone just arrived. */ + if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) + rnp->exp_seq_rq = s; + spin_unlock(&rnp->exp_lock); + } + wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); + } + trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); + mutex_unlock(&rsp->exp_wake_mutex); +} + +/** + * synchronize_sched_expedited - Brute-force RCU-sched grace period + * + * Wait for an RCU-sched grace period to elapse, but use a "big hammer" + * approach to force the grace period to end quickly. This consumes + * significant time on all CPUs and is unfriendly to real-time workloads, + * so is thus not recommended for any sort of common-case code. In fact, + * if you are using synchronize_sched_expedited() in a loop, please + * restructure your code to batch your updates, and then use a single + * synchronize_sched() instead. + * + * This implementation can be thought of as an application of sequence + * locking to expedited grace periods, but using the sequence counter to + * determine when someone else has already done the work instead of for + * retrying readers. + */ +void synchronize_sched_expedited(void) +{ + unsigned long s; + struct rcu_state *rsp = &rcu_sched_state; + + /* If only one CPU, this is automatically a grace period. */ + if (rcu_blocking_is_gp()) + return; + + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu_sched); + return; + } + + /* Take a snapshot of the sequence number. */ + s = rcu_exp_gp_seq_snap(rsp); + if (exp_funnel_lock(rsp, s)) + return; /* Someone else did our work for us. */ + + /* Initialize the rcu_node tree in preparation for the wait. */ + sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); + + /* Wait and clean up, including waking everyone. */ + rcu_exp_wait_wake(rsp, s); +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#ifdef CONFIG_PREEMPT_RCU + +/* + * Remote handler for smp_call_function_single(). If there is an + * RCU read-side critical section in effect, request that the + * next rcu_read_unlock() record the quiescent state up the + * ->expmask fields in the rcu_node tree. Otherwise, immediately + * report the quiescent state. + */ +static void sync_rcu_exp_handler(void *info) +{ + struct rcu_data *rdp; + struct rcu_state *rsp = info; + struct task_struct *t = current; + + /* + * Within an RCU read-side critical section, request that the next + * rcu_read_unlock() report. Unless this RCU read-side critical + * section has already blocked, in which case it is already set + * up for the expedited grace period to wait on it. + */ + if (t->rcu_read_lock_nesting > 0 && + !t->rcu_read_unlock_special.b.blocked) { + t->rcu_read_unlock_special.b.exp_need_qs = true; + return; + } + + /* + * We are either exiting an RCU read-side critical section (negative + * values of t->rcu_read_lock_nesting) or are not in one at all + * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU + * read-side critical section that blocked before this expedited + * grace period started. Either way, we can immediately report + * the quiescent state. + */ + rdp = this_cpu_ptr(rsp->rda); + rcu_report_exp_rdp(rsp, rdp, true); +} + +/** + * synchronize_rcu_expedited - Brute-force RCU grace period + * + * Wait for an RCU-preempt grace period, but expedite it. The basic + * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler + * checks whether the CPU is in an RCU-preempt critical section, and + * if so, it sets a flag that causes the outermost rcu_read_unlock() + * to report the quiescent state. On the other hand, if the CPU is + * not in an RCU read-side critical section, the IPI handler reports + * the quiescent state immediately. + * + * Although this is a greate improvement over previous expedited + * implementations, it is still unfriendly to real-time workloads, so is + * thus not recommended for any sort of common-case code. In fact, if + * you are using synchronize_rcu_expedited() in a loop, please restructure + * your code to batch your updates, and then Use a single synchronize_rcu() + * instead. + */ +void synchronize_rcu_expedited(void) +{ + struct rcu_state *rsp = rcu_state_p; + unsigned long s; + + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu); + return; + } + + s = rcu_exp_gp_seq_snap(rsp); + if (exp_funnel_lock(rsp, s)) + return; /* Someone else did our work for us. */ + + /* Initialize the rcu_node tree in preparation for the wait. */ + sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); + + /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ + rcu_exp_wait_wake(rsp, s); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +#else /* #ifdef CONFIG_PREEMPT_RCU */ + +/* + * Wait for an rcu-preempt grace period, but make it happen quickly. + * But because preemptible RCU does not exist, map to rcu-sched. + */ +void synchronize_rcu_expedited(void) +{ + synchronize_sched_expedited(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ff1cd4e1188d..0082fce402a0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -79,8 +79,6 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); if (IS_ENABLED(CONFIG_PROVE_RCU)) pr_info("\tRCU lockdep checking is enabled.\n"); - if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE)) - pr_info("\tRCU torture testing starts during boot.\n"); if (RCU_NUM_LVLS >= 4) pr_info("\tFour(or more)-level hierarchy is enabled.\n"); if (RCU_FANOUT_LEAF != 16) @@ -681,84 +679,6 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); -/* - * Remote handler for smp_call_function_single(). If there is an - * RCU read-side critical section in effect, request that the - * next rcu_read_unlock() record the quiescent state up the - * ->expmask fields in the rcu_node tree. Otherwise, immediately - * report the quiescent state. - */ -static void sync_rcu_exp_handler(void *info) -{ - struct rcu_data *rdp; - struct rcu_state *rsp = info; - struct task_struct *t = current; - - /* - * Within an RCU read-side critical section, request that the next - * rcu_read_unlock() report. Unless this RCU read-side critical - * section has already blocked, in which case it is already set - * up for the expedited grace period to wait on it. - */ - if (t->rcu_read_lock_nesting > 0 && - !t->rcu_read_unlock_special.b.blocked) { - t->rcu_read_unlock_special.b.exp_need_qs = true; - return; - } - - /* - * We are either exiting an RCU read-side critical section (negative - * values of t->rcu_read_lock_nesting) or are not in one at all - * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU - * read-side critical section that blocked before this expedited - * grace period started. Either way, we can immediately report - * the quiescent state. - */ - rdp = this_cpu_ptr(rsp->rda); - rcu_report_exp_rdp(rsp, rdp, true); -} - -/** - * synchronize_rcu_expedited - Brute-force RCU grace period - * - * Wait for an RCU-preempt grace period, but expedite it. The basic - * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler - * checks whether the CPU is in an RCU-preempt critical section, and - * if so, it sets a flag that causes the outermost rcu_read_unlock() - * to report the quiescent state. On the other hand, if the CPU is - * not in an RCU read-side critical section, the IPI handler reports - * the quiescent state immediately. - * - * Although this is a greate improvement over previous expedited - * implementations, it is still unfriendly to real-time workloads, so is - * thus not recommended for any sort of common-case code. In fact, if - * you are using synchronize_rcu_expedited() in a loop, please restructure - * your code to batch your updates, and then Use a single synchronize_rcu() - * instead. - */ -void synchronize_rcu_expedited(void) -{ - struct rcu_state *rsp = rcu_state_p; - unsigned long s; - - /* If expedited grace periods are prohibited, fall back to normal. */ - if (rcu_gp_is_normal()) { - wait_rcu_gp(call_rcu); - return; - } - - s = rcu_exp_gp_seq_snap(rsp); - if (exp_funnel_lock(rsp, s)) - return; /* Someone else did our work for us. */ - - /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); - - /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ - rcu_exp_wait_wake(rsp, s); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); - /** * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. * @@ -883,16 +803,6 @@ static void rcu_preempt_check_callbacks(void) } /* - * Wait for an rcu-preempt grace period, but make it happen quickly. - * But because preemptible RCU does not exist, map to rcu-sched. - */ -void synchronize_rcu_expedited(void) -{ - synchronize_sched_expedited(); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); - -/* * Because preemptible RCU does not exist, rcu_barrier() is just * another name for rcu_barrier_sched(). */ @@ -1254,8 +1164,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) return; if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) return; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) - if ((mask & 0x1) && cpu != outgoingcpu) + for_each_leaf_node_possible_cpu(rnp, cpu) + if ((mask & leaf_node_cpu_bit(rnp, cpu)) && + cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); if (cpumask_weight(cm) == 0) cpumask_setall(cm); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 3e888cd5a594..f0d8322bc3ec 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -528,6 +528,7 @@ static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10; module_param(rcu_task_stall_timeout, int, 0644); static void rcu_spawn_tasks_kthread(void); +static struct task_struct *rcu_tasks_kthread_ptr; /* * Post an RCU-tasks callback. First call must be from process context @@ -537,6 +538,7 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) { unsigned long flags; bool needwake; + bool havetask = READ_ONCE(rcu_tasks_kthread_ptr); rhp->next = NULL; rhp->func = func; @@ -545,7 +547,9 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) *rcu_tasks_cbs_tail = rhp; rcu_tasks_cbs_tail = &rhp->next; raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); - if (needwake) { + /* We can't create the thread unless interrupts are enabled. */ + if ((needwake && havetask) || + (!havetask && !irqs_disabled_flags(flags))) { rcu_spawn_tasks_kthread(); wake_up(&rcu_tasks_cbs_wq); } @@ -790,7 +794,6 @@ static int __noreturn rcu_tasks_kthread(void *arg) static void rcu_spawn_tasks_kthread(void) { static DEFINE_MUTEX(rcu_tasks_kthread_mutex); - static struct task_struct *rcu_tasks_kthread_ptr; struct task_struct *t; if (READ_ONCE(rcu_tasks_kthread_ptr)) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5cd6931cb2cb..af0ef74df23c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1536,7 +1536,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) for (;;) { /* Any allowed, online CPU? */ for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { - if (!cpu_active(dest_cpu)) + if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu)) + continue; + if (!cpu_online(dest_cpu)) continue; goto out; } @@ -2535,10 +2537,9 @@ void wake_up_new_task(struct task_struct *p) */ set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif - /* Post initialize new task's util average when its cfs_rq is set */ + rq = __task_rq_lock(p, &rf); post_init_entity_util_avg(&p->se); - rq = __task_rq_lock(p, &rf); activate_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); @@ -5148,14 +5149,16 @@ void show_state_filter(unsigned long state_filter) /* * reset the NMI-timeout, listing all files on a slow * console might take a lot of time: + * Also, reset softlockup watchdogs on all CPUs, because + * another CPU might be blocked waiting for us to process + * an IPI. */ touch_nmi_watchdog(); + touch_all_softlockup_watchdogs(); if (!state_filter || (p->state & state_filter)) sched_show_task(p); } - touch_all_softlockup_watchdogs(); - #ifdef CONFIG_SCHED_DEBUG if (!state_filter) sysrq_sched_debug_show(); @@ -5391,13 +5394,15 @@ void idle_task_exit(void) /* * Since this CPU is going 'away' for a while, fold any nr_active delta * we might have. Assumes we're called after migrate_tasks() so that the - * nr_active count is stable. + * nr_active count is stable. We need to take the teardown thread which + * is calling this into account, so we hand in adjust = 1 to the load + * calculation. * * Also see the comment "Global load-average calculations". */ static void calc_load_migrate(struct rq *rq) { - long delta = calc_load_fold_active(rq); + long delta = calc_load_fold_active(rq, 1); if (delta) atomic_long_add(delta, &calc_load_tasks); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 218f8e83db73..c8c5d2d48424 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -735,8 +735,6 @@ void post_init_entity_util_avg(struct sched_entity *se) } } -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); -static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); #else void init_entity_runnable_average(struct sched_entity *se) { @@ -2499,28 +2497,22 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_FAIR_GROUP_SCHED # ifdef CONFIG_SMP -static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) +static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { - long tg_weight; + long tg_weight, load, shares; /* - * Use this CPU's real-time load instead of the last load contribution - * as the updating of the contribution is delayed, and we will use the - * the real-time load to calc the share. See update_tg_load_avg(). + * This really should be: cfs_rq->avg.load_avg, but instead we use + * cfs_rq->load.weight, which is its upper bound. This helps ramp up + * the shares for small weight interactive tasks. */ - tg_weight = atomic_long_read(&tg->load_avg); - tg_weight -= cfs_rq->tg_load_avg_contrib; - tg_weight += cfs_rq->load.weight; + load = scale_load_down(cfs_rq->load.weight); - return tg_weight; -} - -static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) -{ - long tg_weight, load, shares; + tg_weight = atomic_long_read(&tg->load_avg); - tg_weight = calc_tg_weight(tg, cfs_rq); - load = cfs_rq->load.weight; + /* Ensure tg_weight >= load */ + tg_weight -= cfs_rq->tg_load_avg_contrib; + tg_weight += load; shares = (tg->shares * load); if (tg_weight) @@ -2539,6 +2531,7 @@ static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) return tg->shares; } # endif /* CONFIG_SMP */ + static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { @@ -2904,6 +2897,23 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) } } +/* + * Unsigned subtract and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. + */ +#define sub_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + typeof(*ptr) val = (_val); \ + typeof(*ptr) res, var = READ_ONCE(*ptr); \ + res = var - val; \ + if (res > var) \ + res = 0; \ + WRITE_ONCE(*ptr, res); \ +} while (0) + /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) @@ -2913,15 +2923,15 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) if (atomic_long_read(&cfs_rq->removed_load_avg)) { s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); - sa->load_avg = max_t(long, sa->load_avg - r, 0); - sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); + sub_positive(&sa->load_avg, r); + sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); removed_load = 1; } if (atomic_long_read(&cfs_rq->removed_util_avg)) { long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); - sa->util_avg = max_t(long, sa->util_avg - r, 0); - sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0); + sub_positive(&sa->util_avg, r); + sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); removed_util = 1; } @@ -2994,10 +3004,10 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s &se->avg, se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); - cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); - cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); - cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); - cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); + sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); + sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); + sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); + sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); cfs_rq_util_change(cfs_rq); } @@ -3246,7 +3256,7 @@ static inline void check_schedstat_required(void) trace_sched_stat_iowait_enabled() || trace_sched_stat_blocked_enabled() || trace_sched_stat_runtime_enabled()) { - pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, " + printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, " "stat_blocked and stat_runtime require the " "kernel parameter schedstats=enabled or " "kernel.sched_schedstats=1\n"); @@ -4185,6 +4195,26 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) if (!cfs_bandwidth_used()) return; + /* Synchronize hierarchical throttle counter: */ + if (unlikely(!cfs_rq->throttle_uptodate)) { + struct rq *rq = rq_of(cfs_rq); + struct cfs_rq *pcfs_rq; + struct task_group *tg; + + cfs_rq->throttle_uptodate = 1; + + /* Get closest up-to-date node, because leaves go first: */ + for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) { + pcfs_rq = tg->cfs_rq[cpu_of(rq)]; + if (pcfs_rq->throttle_uptodate) + break; + } + if (tg) { + cfs_rq->throttle_count = pcfs_rq->throttle_count; + cfs_rq->throttled_clock_task = rq_clock_task(rq); + } + } + /* an active group must be handled by the update_curr()->put() path */ if (!cfs_rq->runtime_enabled || cfs_rq->curr) return; @@ -4500,15 +4530,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { + /* Avoid re-evaluating load for this entity: */ + se = parent_entity(se); /* * Bias pick_next to pick a task from this cfs_rq, as * p is sleeping when it is within its sched_slice. */ - if (task_sleep && parent_entity(se)) - set_next_buddy(parent_entity(se)); - - /* avoid re-evaluating load for this entity */ - se = parent_entity(se); + if (task_sleep && se && !throttled_hierarchy(cfs_rq)) + set_next_buddy(se); break; } flags |= DEQUEUE_SLEEP; @@ -4910,19 +4939,24 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) return wl; for_each_sched_entity(se) { - long w, W; + struct cfs_rq *cfs_rq = se->my_q; + long W, w = cfs_rq_load_avg(cfs_rq); - tg = se->my_q->tg; + tg = cfs_rq->tg; /* * W = @wg + \Sum rw_j */ - W = wg + calc_tg_weight(tg, se->my_q); + W = wg + atomic_long_read(&tg->load_avg); + + /* Ensure \Sum rw_j >= rw_i */ + W -= cfs_rq->tg_load_avg_contrib; + W += w; /* * w = rw_i + @wl */ - w = cfs_rq_load_avg(se->my_q) + wl; + w += wl; /* * wl = S * s'_i; see (2) @@ -8496,8 +8530,9 @@ void free_fair_sched_group(struct task_group *tg) int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { - struct cfs_rq *cfs_rq; struct sched_entity *se; + struct cfs_rq *cfs_rq; + struct rq *rq; int i; tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -8512,6 +8547,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_bandwidth(tg_cfs_bandwidth(tg)); for_each_possible_cpu(i) { + rq = cpu_rq(i); + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) @@ -8525,7 +8562,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); init_entity_runnable_average(se); + + raw_spin_lock_irq(&rq->lock); post_init_entity_util_avg(se); + raw_spin_unlock_irq(&rq->lock); } return 1; diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index b0b93fd33af9..a2d6eb71f06b 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -78,11 +78,11 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) loads[2] = (avenrun[2] + offset) << shift; } -long calc_load_fold_active(struct rq *this_rq) +long calc_load_fold_active(struct rq *this_rq, long adjust) { long nr_active, delta = 0; - nr_active = this_rq->nr_running; + nr_active = this_rq->nr_running - adjust; nr_active += (long)this_rq->nr_uninterruptible; if (nr_active != this_rq->calc_load_active) { @@ -188,7 +188,7 @@ void calc_load_enter_idle(void) * We're going into NOHZ mode, if there's any pending delta, fold it * into the pending idle delta. */ - delta = calc_load_fold_active(this_rq); + delta = calc_load_fold_active(this_rq, 0); if (delta) { int idx = calc_load_write_idx(); @@ -389,7 +389,7 @@ void calc_global_load_tick(struct rq *this_rq) if (time_before(jiffies, this_rq->calc_load_update)) return; - delta = calc_load_fold_active(this_rq); + delta = calc_load_fold_active(this_rq, 0); if (delta) atomic_long_add(delta, &calc_load_tasks); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 425bf5ddaa5a..81283592942b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -28,7 +28,7 @@ extern unsigned long calc_load_update; extern atomic_long_t calc_load_tasks; extern void calc_global_load_tick(struct rq *this_rq); -extern long calc_load_fold_active(struct rq *this_rq); +extern long calc_load_fold_active(struct rq *this_rq, long adjust); #ifdef CONFIG_SMP extern void cpu_load_update_active(struct rq *this_rq); @@ -437,7 +437,7 @@ struct cfs_rq { u64 throttled_clock, throttled_clock_task; u64 throttled_clock_task_time; - int throttled, throttle_count; + int throttled, throttle_count, throttle_uptodate; struct list_head throttled_list; #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 87b2fc38398b..35f0dcb1cb4f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1205,6 +1205,17 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif +#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) + { + .procname = "panic_on_rcu_stall", + .data = &sysctl_panic_on_rcu_stall, + .maxlen = sizeof(sysctl_panic_on_rcu_stall), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif { } }; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 1cafba860b08..39008d78927a 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -777,6 +777,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) timer->it.cpu.expires = 0; sample_to_timespec(timer->it_clock, timer->it.cpu.expires, &itp->it_value); + return; } else { cpu_timer_sample_group(timer->it_clock, p, &now); unlock_task_sighand(p, &flags); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 479d25cd3d4f..255e225393ac 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2186,6 +2186,7 @@ struct timespec64 get_monotonic_coarse64(void) return now; } +EXPORT_SYMBOL(get_monotonic_coarse64); /* * Must hold jiffies_lock diff --git a/kernel/torture.c b/kernel/torture.c index fa0bdeee17ac..75961b3decfe 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -82,6 +82,104 @@ static int min_online = -1; static int max_online; /* + * Attempt to take a CPU offline. Return false if the CPU is already + * offline or if it is not subject to CPU-hotplug operations. The + * caller can detect other failures by looking at the statistics. + */ +bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, + unsigned long *sum_offl, int *min_offl, int *max_offl) +{ + unsigned long delta; + int ret; + unsigned long starttime; + + if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) + return false; + + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offlining %d\n", + torture_type, cpu); + starttime = jiffies; + (*n_offl_attempts)++; + ret = cpu_down(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offline %d failed: errno %d\n", + torture_type, cpu, ret); + } else { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offlined %d\n", + torture_type, cpu); + (*n_offl_successes)++; + delta = jiffies - starttime; + sum_offl += delta; + if (*min_offl < 0) { + *min_offl = delta; + *max_offl = delta; + } + if (*min_offl > delta) + *min_offl = delta; + if (*max_offl < delta) + *max_offl = delta; + } + + return true; +} +EXPORT_SYMBOL_GPL(torture_offline); + +/* + * Attempt to bring a CPU online. Return false if the CPU is already + * online or if it is not subject to CPU-hotplug operations. The + * caller can detect other failures by looking at the statistics. + */ +bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, + unsigned long *sum_onl, int *min_onl, int *max_onl) +{ + unsigned long delta; + int ret; + unsigned long starttime; + + if (cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) + return false; + + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: onlining %d\n", + torture_type, cpu); + starttime = jiffies; + (*n_onl_attempts)++; + ret = cpu_up(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: online %d failed: errno %d\n", + torture_type, cpu, ret); + } else { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: onlined %d\n", + torture_type, cpu); + (*n_onl_successes)++; + delta = jiffies - starttime; + *sum_onl += delta; + if (*min_onl < 0) { + *min_onl = delta; + *max_onl = delta; + } + if (*min_onl > delta) + *min_onl = delta; + if (*max_onl < delta) + *max_onl = delta; + } + + return true; +} +EXPORT_SYMBOL_GPL(torture_online); + +/* * Execute random CPU-hotplug operations at the interval specified * by the onoff_interval. */ @@ -89,16 +187,19 @@ static int torture_onoff(void *arg) { int cpu; - unsigned long delta; int maxcpu = -1; DEFINE_TORTURE_RANDOM(rand); - int ret; - unsigned long starttime; VERBOSE_TOROUT_STRING("torture_onoff task started"); for_each_online_cpu(cpu) maxcpu = cpu; WARN_ON(maxcpu < 0); + + if (maxcpu == 0) { + VERBOSE_TOROUT_STRING("Only one CPU, so CPU-hotplug testing is disabled"); + goto stop; + } + if (onoff_holdoff > 0) { VERBOSE_TOROUT_STRING("torture_onoff begin holdoff"); schedule_timeout_interruptible(onoff_holdoff); @@ -106,69 +207,16 @@ torture_onoff(void *arg) } while (!torture_must_stop()) { cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); - if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: offlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_offline_attempts++; - ret = cpu_down(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: offline %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: offlined %d\n", - torture_type, cpu); - n_offline_successes++; - delta = jiffies - starttime; - sum_offline += delta; - if (min_offline < 0) { - min_offline = delta; - max_offline = delta; - } - if (min_offline > delta) - min_offline = delta; - if (max_offline < delta) - max_offline = delta; - } - } else if (cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: onlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_online_attempts++; - ret = cpu_up(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: online %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: onlined %d\n", - torture_type, cpu); - n_online_successes++; - delta = jiffies - starttime; - sum_online += delta; - if (min_online < 0) { - min_online = delta; - max_online = delta; - } - if (min_online > delta) - min_online = delta; - if (max_online < delta) - max_online = delta; - } - } + if (!torture_offline(cpu, + &n_offline_attempts, &n_offline_successes, + &sum_offline, &min_offline, &max_offline)) + torture_online(cpu, + &n_online_attempts, &n_online_successes, + &sum_online, &min_online, &max_online); schedule_timeout_interruptible(onoff_interval); } + +stop: torture_kthread_stopping("torture_onoff"); return 0; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 720b7bb01d43..26f603da7e26 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -209,6 +209,10 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) event->pmu->count) return -EINVAL; + if (unlikely(event->attr.type != PERF_TYPE_HARDWARE && + event->attr.type != PERF_TYPE_RAW)) + return -EINVAL; + /* * we don't know if the function is run successfully by the * return value. It can be judged in other places, such as @@ -349,7 +353,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func } /* bpf+kprobe programs can access fields of 'struct pt_regs' */ -static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type) +static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, + enum bpf_reg_type *reg_type) { /* check bounds */ if (off < 0 || off >= sizeof(struct pt_regs)) @@ -427,7 +432,8 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) } } -static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type) +static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, + enum bpf_reg_type *reg_type) { if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) return false; diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index f96f0383f6c6..ad1d6164e946 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -36,6 +36,10 @@ struct trace_bprintk_fmt { static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) { struct trace_bprintk_fmt *pos; + + if (!fmt) + return ERR_PTR(-EINVAL); + list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { if (!strcmp(pos->fmt, fmt)) return pos; @@ -57,7 +61,8 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) for (iter = start; iter < end; iter++) { struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); if (tb_fmt) { - *iter = tb_fmt->fmt; + if (!IS_ERR(tb_fmt)) + *iter = tb_fmt->fmt; continue; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e1c0e996b5ae..97e7b793df35 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4600,15 +4600,11 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) return; - /* is @cpu the only online CPU? */ cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask); - if (cpumask_weight(&cpumask) != 1) - return; /* as we're called from CPU_ONLINE, the following shouldn't fail */ for_each_pool_worker(worker, pool) - WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, - pool->attrs->cpumask) < 0); + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0); } /* |