From 3f47da0f32f5e43e6ae901129d5b9c2600011a2c Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 13 Jan 2015 15:30:34 +0800 Subject: rcu_tree: Avoid touching rnp->completed when a new GP is started In rcu_gp_init(), rnp->completed equals to rsp->completed in THEORY, we don't need to touch it normally. If something goes wrong, it will complain and fixup rnp->completed and avoid oops. This commit thus avoids the normal needless store to rnp->completed. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 48d640ca1a05..077d0b700f74 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1757,8 +1757,8 @@ static int rcu_gp_init(struct rcu_state *rsp) rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; ACCESS_ONCE(rnp->gpnum) = rsp->gpnum; - WARN_ON_ONCE(rnp->completed != rsp->completed); - ACCESS_ONCE(rnp->completed) = rsp->completed; + if (WARN_ON_ONCE(rnp->completed != rsp->completed)) + ACCESS_ONCE(rnp->completed) = rsp->completed; if (rnp == rdp->mynode) (void)__note_gp_changes(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); -- cgit v1.2.3 From d3f3f3f25b1d4ee152f3f19a812c3a282da4c120 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 18 Jan 2015 18:21:09 -0800 Subject: rcu: Abstract default callback-list initialization from init_callback_list() In preparation for early-boot posting of callbacks, this commit abstracts initialization of the default (non-no-CB) callbacks list from the init_callback_list() function into a new init_default_callback_list() function. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 48d640ca1a05..f8cdb92da10b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1328,19 +1328,29 @@ void rcu_cpu_stall_reset(void) } /* - * Initialize the specified rcu_data structure's callback list to empty. + * Initialize the specified rcu_data structure's default callback list + * to empty. The default callback list is the one that is not used by + * no-callbacks CPUs. */ -static void init_callback_list(struct rcu_data *rdp) +static void init_default_callback_list(struct rcu_data *rdp) { int i; - if (init_nocb_callback_list(rdp)) - return; rdp->nxtlist = NULL; for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; } +/* + * Initialize the specified rcu_data structure's callback list to empty. + */ +static void init_callback_list(struct rcu_data *rdp) +{ + if (init_nocb_callback_list(rdp)) + return; + init_default_callback_list(rdp); +} + /* * Determine the value that ->completed will have at the end of the * next subsequent grace period. This is used to tag callbacks so that -- cgit v1.2.3 From 2723249a31a68ccc0ec8ac59a905d7f9430bf8f6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 20 Jan 2015 22:44:13 -0800 Subject: rcu: Wire ->rda pointers at compile time This commit wires up the rcu_state structures' ->rda pointers to the per-CPU rcu_data structures at compile time, thus ensuring that this linkage is present at early boot, in turn allowing posting of callbacks before rcu_init() is executed. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f8cdb92da10b..d2fa95e4a268 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -91,8 +91,10 @@ static const char *tp_##sname##_varname __used __tracepoint_string = sname##_var #define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ DEFINE_RCU_TPS(sname) \ +DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \ struct rcu_state sname##_state = { \ .level = { &sname##_state.node[0] }, \ + .rda = &sname##_data, \ .call = cr, \ .fqs_state = RCU_GP_IDLE, \ .gpnum = 0UL - 300UL, \ @@ -104,8 +106,7 @@ struct rcu_state sname##_state = { \ .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ -}; \ -DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data) +} RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); @@ -3843,7 +3844,6 @@ static void __init rcu_init_one(struct rcu_state *rsp, } } - rsp->rda = rda; init_waitqueue_head(&rsp->gp_wq); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { -- cgit v1.2.3 From 143da9c2fc030a5774674f2ebc2f934fab3dcd9a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Jan 2015 19:57:32 -0800 Subject: rcu: Prevent early-boot RCU callbacks from splatting Currently, a call_rcu() that precedes rcu_init() will splat due to the callback lists not having yet been initialized. This commit causes the first such callback to initialize the boot CPU's RCU callback list. Note that this commit does not change rcu_init()-time initialization, which means that the callback will be discarded at rcu_init() time. Fixing this is the job of later commits. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d2fa95e4a268..fcfdbe53bb70 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2838,11 +2838,21 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), if (cpu != -1) rdp = per_cpu_ptr(rsp->rda, cpu); - offline = !__call_rcu_nocb(rdp, head, lazy, flags); - WARN_ON_ONCE(offline); - /* _call_rcu() is illegal on offline CPU; leak the callback. */ - local_irq_restore(flags); - return; + if (likely(rdp->mynode)) { + /* Post-boot, so this should be for a no-CBs CPU. */ + offline = !__call_rcu_nocb(rdp, head, lazy, flags); + WARN_ON_ONCE(offline); + /* Offline CPU, _call_rcu() illegal, leak callback. */ + local_irq_restore(flags); + return; + } + /* + * Very early boot, before rcu_init(). Initialize if needed + * and then drop through to queue the callback. + */ + BUG_ON(cpu != -1); + if (!likely(rdp->nxtlist)) + init_default_callback_list(rdp); } ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1; if (lazy) -- cgit v1.2.3 From 39c8d313c3c546a414cc51b4f6571c2f8cc06407 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 20 Jan 2015 23:42:38 -0800 Subject: rcu: Avoid clobbering early boot callbacks When a CPU comes online, it initializes its callback list. This is a bad thing if this is the first time that the CPU has come online and if that CPU has early boot callbacks. This commit therefore avoid initializing the callback list if there are callbacks present, in which case the initial call_rcu() did the initialization for us. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index fcfdbe53bb70..92fd3eab5823 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3583,7 +3583,8 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; - init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ + if (!rdp->nxtlist) + init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; rcu_sysidle_init_percpu_data(rdp->dynticks); atomic_set(&rdp->dynticks->dynticks, -- cgit v1.2.3 From 1925d1967c93a1c421271aade7953f6857e9f579 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 18 Jan 2015 17:45:05 -0800 Subject: rcu: Fix a couple of typos in rcu_all_qs() comment header Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 077d0b700f74..4e37c7fd9e29 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -292,10 +292,10 @@ void rcu_note_context_switch(void) EXPORT_SYMBOL_GPL(rcu_note_context_switch); /* - * Register a quiesecent state for all RCU flavors. If there is an + * Register a quiescent state for all RCU flavors. If there is an * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight * dyntick-idle quiescent state visible to other CPUs (but only for those - * RCU flavors in desparate need of a quiescent state, which will normally + * RCU flavors in desperate need of a quiescent state, which will normally * be none of them). Either way, do a lightweight quiescent state for * all RCU flavors. */ -- cgit v1.2.3 From 5afff48bdf7481570c9385a8a674a81ffb8f09ee Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Feb 2015 16:39:09 -0800 Subject: rcu: Update from rcu_expedited variable to rcu_gp_is_expedited() This commit updates open-coded tests of the rcu_expedited variable to instead use rcu_gp_is_expedited(). Signed-off-by: Paul E. McKenney --- kernel/rcu/srcu.c | 2 +- kernel/rcu/tree.c | 9 +++++---- kernel/rcu/tree_plugin.h | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 445bf8ffe3fb..c871f07eff69 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -507,7 +507,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) */ void synchronize_srcu(struct srcu_struct *sp) { - __synchronize_srcu(sp, rcu_expedited + __synchronize_srcu(sp, rcu_gp_is_expedited() ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT : SYNCHRONIZE_SRCU_TRYCOUNT); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 48d640ca1a05..4325fbe79d84 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2954,7 +2954,7 @@ void synchronize_sched(void) "Illegal synchronize_sched() in RCU-sched read-side critical section"); if (rcu_blocking_is_gp()) return; - if (rcu_expedited) + if (rcu_gp_is_expedited()) synchronize_sched_expedited(); else wait_rcu_gp(call_rcu_sched); @@ -2981,7 +2981,7 @@ void synchronize_rcu_bh(void) "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); if (rcu_blocking_is_gp()) return; - if (rcu_expedited) + if (rcu_gp_is_expedited()) synchronize_rcu_bh_expedited(); else wait_rcu_gp(call_rcu_bh); @@ -3660,11 +3660,12 @@ static int rcu_pm_notify(struct notifier_block *self, case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ - rcu_expedited = 1; + rcu_expedite_gp(); break; case PM_POST_HIBERNATION: case PM_POST_SUSPEND: - rcu_expedited = 0; + if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ + rcu_unexpedite_gp(); break; default: break; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0a571e9a0f1d..63726b734d34 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -585,7 +585,7 @@ void synchronize_rcu(void) "Illegal synchronize_rcu() in RCU read-side critical section"); if (!rcu_scheduler_active) return; - if (rcu_expedited) + if (rcu_gp_is_expedited()) synchronize_rcu_expedited(); else wait_rcu_gp(call_rcu); -- cgit v1.2.3 From 675da67f24e2d6d8df0cedf12e59085ed8bbf4e7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Feb 2015 15:57:07 -0800 Subject: rcu: Fixes to NO_HZ_FULL sysidle accounting On second and subsequent passes through quiescent-state forcing, the isidle variable was initialized to false, which would prevent full sysidle state from being reached if a grace period needed more than one round of quiescent-state forcing (which most should not). However, the check for offline CPUs in the quiescent-state forcing main loop had the wrong sense, which could prevent CPUs from ever entering full sysidle state. This commit fixes both of these bugs. Given that sysidle is not yet wired up, this has no effect in old kernels, but might have proven frustrating had anyone attempted to wire it up. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 48d640ca1a05..735bd7ee749a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1798,7 +1798,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) fqs_state = RCU_FORCE_QS; } else { /* Handle dyntick-idle and offline CPUs. */ - isidle = false; + isidle = true; force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); } /* Clear flag to prevent immediate re-entry. */ @@ -2596,8 +2596,8 @@ static void force_qs_rnp(struct rcu_state *rsp, bit = 1; for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { if ((rnp->qsmask & bit) != 0) { - if ((rnp->qsmaskinit & bit) != 0) - *isidle = false; + if ((rnp->qsmaskinit & bit) == 0) + *isidle = false; /* Pending hotplug. */ if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) mask |= bit; } -- cgit v1.2.3 From 5871968d531f39c23a8e6c69525bb705bca52e04 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Feb 2015 11:05:36 -0800 Subject: rcu: Tighten up affinity and check for sysidle If the RCU grace-period kthread invoking rcu_sysidle_check_cpu() happens to be running on the tick_do_timer_cpu initially, then rcu_bind_gp_kthread() won't bind it. This kthread might then migrate before invoking rcu_gp_fqs(), which will trigger the WARN_ON_ONCE() in rcu_sysidle_check_cpu(). This commit therefore makes rcu_bind_gp_kthread() do the binding even if the kthread is currently on the same CPU. Because this incurs added overhead, this commit also causes each RCU grace-period kthread to invoke rcu_bind_gp_kthread() once at boot rather than at the beginning of each grace period. And as long as rcu_bind_gp_kthread() is being modified, this commit eliminates its #ifdef. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- kernel/rcu/tree_plugin.h | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 735bd7ee749a..a6972c20eaa5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1707,7 +1707,6 @@ static int rcu_gp_init(struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); ACCESS_ONCE(rsp->gp_activity) = jiffies; - rcu_bind_gp_kthread(); raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); if (!ACCESS_ONCE(rsp->gp_flags)) { @@ -1895,6 +1894,7 @@ static int __noreturn rcu_gp_kthread(void *arg) struct rcu_state *rsp = arg; struct rcu_node *rnp = rcu_get_root(rsp); + rcu_bind_gp_kthread(); for (;;) { /* Handle grace-period start. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0a571e9a0f1d..b46c92824db1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2763,7 +2763,8 @@ static void rcu_sysidle_exit(int irq) /* * Check to see if the current CPU is idle. Note that usermode execution - * does not count as idle. The caller must have disabled interrupts. + * does not count as idle. The caller must have disabled interrupts, + * and must be running on tick_do_timer_cpu. */ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) @@ -2784,8 +2785,8 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, if (!*isidle || rdp->rsp != rcu_state_p || cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) return; - if (rcu_gp_in_progress(rdp->rsp)) - WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); + /* Verify affinity of current kthread. */ + WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); /* Pick up current idle and NMI-nesting counter and check. */ cur = atomic_read(&rdtp->dynticks_idle); @@ -3068,11 +3069,10 @@ static void rcu_bind_gp_kthread(void) return; #ifdef CONFIG_NO_HZ_FULL_SYSIDLE cpu = tick_do_timer_cpu; - if (cpu >= 0 && cpu < nr_cpu_ids && raw_smp_processor_id() != cpu) + if (cpu >= 0 && cpu < nr_cpu_ids) set_cpus_allowed_ptr(current, cpumask_of(cpu)); #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ - if (!is_housekeeping_cpu(raw_smp_processor_id())) - housekeeping_affine(current); + housekeeping_affine(current); #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ } -- cgit v1.2.3 From 34404ca8fb252ccee662c4368c555ccf774acc3b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Jan 2015 20:39:20 -0800 Subject: rcu: Move early-boot callbacks to no-CBs lists for no-CBs CPUs When a CPU is first determined to be a no-CBs CPUs, this commit causes any early boot callbacks to be moved to the no-CBs callback list, allowing them to be invoked. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + kernel/rcu/tree_plugin.h | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 92fd3eab5823..0317bf7d997f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2851,6 +2851,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), * and then drop through to queue the callback. */ BUG_ON(cpu != -1); + WARN_ON_ONCE(!rcu_is_watching()); if (!likely(rdp->nxtlist)) init_default_callback_list(rdp); } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 75d5f096bcb0..afddd5641bea 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2393,18 +2393,8 @@ void __init rcu_init_nohz(void) pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); for_each_rcu_flavor(rsp) { - for_each_cpu(cpu, rcu_nocb_mask) { - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - - /* - * If there are early callbacks, they will need - * to be moved to the nocb lists. - */ - WARN_ON_ONCE(rdp->nxttail[RCU_NEXT_TAIL] != - &rdp->nxtlist && - rdp->nxttail[RCU_NEXT_TAIL] != NULL); - init_nocb_callback_list(rdp); - } + for_each_cpu(cpu, rcu_nocb_mask) + init_nocb_callback_list(per_cpu_ptr(rsp->rda, cpu)); rcu_organize_nocb_kthreads(rsp); } } @@ -2541,6 +2531,16 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) if (!rcu_is_nocb_cpu(rdp->cpu)) return false; + /* If there are early-boot callbacks, move them to nocb lists. */ + if (rdp->nxtlist) { + rdp->nocb_head = rdp->nxtlist; + rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL]; + atomic_long_set(&rdp->nocb_q_count, rdp->qlen); + atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy); + rdp->nxtlist = NULL; + rdp->qlen = 0; + rdp->qlen_lazy = 0; + } rdp->nxttail[RCU_NEXT_TAIL] = NULL; return true; } -- cgit v1.2.3 From 476276781095c79580abe27a65988549ac7f5f89 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Jan 2015 21:10:21 -0800 Subject: rcu: Move early boot callback tests earlier Because callbacks can now be posted quite early in boot, move the early boot callback tests to precede RCU initialization. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0317bf7d997f..c8e6569c5fbd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3948,6 +3948,8 @@ void __init rcu_init(void) { int cpu; + rcu_early_boot_tests(); + rcu_bootup_announce(); rcu_init_geometry(); rcu_init_one(&rcu_bh_state, &rcu_bh_data); @@ -3964,8 +3966,6 @@ void __init rcu_init(void) pm_notifier(rcu_pm_notify, 0); for_each_online_cpu(cpu) rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); - - rcu_early_boot_tests(); } #include "tree_plugin.h" -- cgit v1.2.3 From 6629240575992a6f0d18c46f5160b34527b0e501 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Jan 2015 19:16:38 -0800 Subject: rcu: Use IS_ENABLED() to CONFIG_RCU_FANOUT_EXACT #ifdef This commit uses IS_ENABLED() to remove the #ifdef from the rcu_init_levelspread() functions. No effect on executable code. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4e37c7fd9e29..35e1604f7e3e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3734,30 +3734,26 @@ void rcu_scheduler_starting(void) * Compute the per-level fanout, either using the exact fanout specified * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. */ -#ifdef CONFIG_RCU_FANOUT_EXACT static void __init rcu_init_levelspread(struct rcu_state *rsp) { int i; - rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; - for (i = rcu_num_lvls - 2; i >= 0; i--) - rsp->levelspread[i] = CONFIG_RCU_FANOUT; -} -#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ -static void __init rcu_init_levelspread(struct rcu_state *rsp) -{ - int ccur; - int cprv; - int i; - - cprv = nr_cpu_ids; - for (i = rcu_num_lvls - 1; i >= 0; i--) { - ccur = rsp->levelcnt[i]; - rsp->levelspread[i] = (cprv + ccur - 1) / ccur; - cprv = ccur; + if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) { + rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; + for (i = rcu_num_lvls - 2; i >= 0; i--) + rsp->levelspread[i] = CONFIG_RCU_FANOUT; + } else { + int ccur; + int cprv; + + cprv = nr_cpu_ids; + for (i = rcu_num_lvls - 1; i >= 0; i--) { + ccur = rsp->levelcnt[i]; + rsp->levelspread[i] = (cprv + ccur - 1) / ccur; + cprv = ccur; + } } } -#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */ /* * Helper function for rcu_init() that initializes one rcu_state structure. -- cgit v1.2.3 From e7580f33889299e484a80f42c20611ead42f199e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Feb 2015 14:23:39 -0800 Subject: rcu: Get rcu_sched_force_quiescent_state() where it belongs The very similar functions rcu_force_quiescent_state(), rcu_bh_force_quiescent_state(), and rcu_sched_force_quiescent_state() are supposed to be together, but have drifted apart. This commit restores rcu_sched_force_quiescent_state() to its rightful place. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 35e1604f7e3e..fbe9dd9ced54 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -409,6 +409,15 @@ void rcu_bh_force_quiescent_state(void) } EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); +/* + * Force a quiescent state for RCU-sched. + */ +void rcu_sched_force_quiescent_state(void) +{ + force_quiescent_state(&rcu_sched_state); +} +EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); + /* * Show the state of the grace-period kthreads. */ @@ -482,15 +491,6 @@ void rcutorture_record_progress(unsigned long vernum) } EXPORT_SYMBOL_GPL(rcutorture_record_progress); -/* - * Force a quiescent state for RCU-sched. - */ -void rcu_sched_force_quiescent_state(void) -{ - force_quiescent_state(&rcu_sched_state); -} -EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); - /* * Does the CPU have callbacks ready to be invoked? */ -- cgit v1.2.3 From 9910affa89fe0895153880b115ec243636e70af3 Mon Sep 17 00:00:00 2001 From: Yao Dongdong Date: Wed, 25 Feb 2015 17:09:46 +0800 Subject: rcu: Remove redundant check of cpu_online() Because invoke_cpu_core() checks whether the current CPU is online, there is no need for __call_rcu_core() to redundantly check it. There should not be any performance degradation because the called function is visible to the compiler. This commit therefore removes the redundant check. Signed-off-by: Yao Dongdong Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index fbe9dd9ced54..23194a77a768 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2741,7 +2741,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, * If called from an extended quiescent state, invoke the RCU * core in order to force a re-evaluation of RCU's idleness. */ - if (!rcu_is_watching() && cpu_online(smp_processor_id())) + if (!rcu_is_watching()) invoke_rcu_core(); /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ -- cgit v1.2.3 From b33078b6098148c3efdacc907249a247c9d5491e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 16 Jan 2015 14:01:21 -0800 Subject: rcu: Consolidate offline-CPU callback initialization Currently, both rcu_cleanup_dead_cpu() and rcu_send_cbs_to_orphanage() initialize the outgoing CPU's callback list. However, only rcu_cleanup_dead_cpu() invokes rcu_send_cbs_to_orphanage(), and it does so unconditionally, which means that only one of these initializations is required. This commit therefore consolidates the callback-list initialization with the rest of the callback handling in rcu_send_cbs_to_orphanage(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 48d640ca1a05..8e020c59ecfd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2256,8 +2256,12 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; } - /* Finally, initialize the rcu_data structure's list to empty. */ + /* + * Finally, initialize the rcu_data structure's list to empty and + * disallow further callbacks on this CPU. + */ init_callback_list(rdp); + rdp->nxttail[RCU_NEXT_TAIL] = NULL; } /* @@ -2398,9 +2402,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", cpu, rdp->qlen, rdp->nxtlist); - init_callback_list(rdp); - /* Disallow further callbacks on this CPU. */ - rdp->nxttail[RCU_NEXT_TAIL] = NULL; mutex_unlock(&rsp->onoff_mutex); } -- cgit v1.2.3 From 78043c467a91573cc1d51827fe10d7d15ae79a60 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 18 Jan 2015 17:46:24 -0800 Subject: rcu: Put all orphan-callback-related code under same comment Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8e020c59ecfd..98da632d1d49 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2385,9 +2385,9 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* Exclude any attempts to start a new grace period. */ mutex_lock(&rsp->onoff_mutex); - raw_spin_lock_irqsave(&rsp->orphan_lock, flags); /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ + raw_spin_lock_irqsave(&rsp->orphan_lock, flags); rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); rcu_adopt_orphan_cbs(rsp, flags); raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); -- cgit v1.2.3 From 237a0f2193c6daf9b1edd7fd15d55e680f268952 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 22 Jan 2015 14:32:06 -0800 Subject: rcu: Detect stalls caused by failure to propagate up rcu_node tree If all CPUs have passed through quiescent states, then stalls might be due to starvation of the grace-period kthread or to failure to propagate the quiescent states up the rcu_node combining tree. The current stall warning messages do not differentiate, so this commit adds a printout of the root rcu_node structure's ->qsmask field. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 98da632d1d49..3b7e4133ca99 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1196,9 +1196,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) } else { j = jiffies; gpa = ACCESS_ONCE(rsp->gp_activity); - pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n", + pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", rsp->name, j - gpa, j, gpa, - jiffies_till_next_fqs); + jiffies_till_next_fqs, + rcu_get_root(rsp)->qsmask); /* In this case, the current CPU might be at fault. */ sched_show_task(current); } -- cgit v1.2.3 From 37745d281069682d901f00c0121949a7d224195f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 22 Jan 2015 18:24:08 -0800 Subject: rcu: Provide diagnostic option to slow down grace-period initialization Grace-period initialization normally proceeds quite quickly, so that it is very difficult to reproduce races against grace-period initialization. This commit therefore allows grace-period initialization to be artificially slowed down, increasing race-reproduction probability. A pair of new Kconfig parameters are provided, CONFIG_RCU_TORTURE_TEST_SLOW_INIT to enable the slowdowns, and CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY to specify the number of jiffies of slowdown to apply. A boot-time parameter named rcutree.gp_init_delay allows boot-time delay to be specified. By default, no delay will be applied even if CONFIG_RCU_TORTURE_TEST_SLOW_INIT is set. Signed-off-by: Paul E. McKenney --- Documentation/kernel-parameters.txt | 6 ++++++ kernel/rcu/tree.c | 10 ++++++++++ lib/Kconfig.debug | 24 ++++++++++++++++++++++++ 3 files changed, 40 insertions(+) (limited to 'kernel/rcu/tree.c') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index bfcb1a62a7b4..94de410ec341 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2968,6 +2968,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Set maximum number of finished RCU callbacks to process in one batch. + rcutree.gp_init_delay= [KNL] + Set the number of jiffies to delay each step of + RCU grace-period initialization. This only has + effect when CONFIG_RCU_TORTURE_TEST_SLOW_INIT is + set. + rcutree.rcu_fanout_leaf= [KNL] Increase the number of CPUs assigned to each leaf rcu_node structure. Useful for very large diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3b7e4133ca99..b42001fd55fb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -160,6 +160,12 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; module_param(kthread_prio, int, 0644); +/* Delay in jiffies for grace-period initialization delays. */ +static int gp_init_delay = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT) + ? CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY + : 0; +module_param(gp_init_delay, int, 0644); + /* * Track the rcutorture test sequence number and the update version * number within a given test. The rcutorture_testseq is incremented @@ -1769,6 +1775,10 @@ static int rcu_gp_init(struct rcu_state *rsp) raw_spin_unlock_irq(&rnp->lock); cond_resched_rcu_qs(); ACCESS_ONCE(rsp->gp_activity) = jiffies; + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT) && + gp_init_delay > 0 && + !(rsp->gpnum % (rcu_num_nodes * 10))) + schedule_timeout_uninterruptible(gp_init_delay); } mutex_unlock(&rsp->onoff_mutex); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c5cefb3c009c..feee8dab441e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1257,6 +1257,30 @@ config RCU_TORTURE_TEST_RUNNABLE Say N here if you want the RCU torture tests to start only after being manually enabled via /proc. +config RCU_TORTURE_TEST_SLOW_INIT + bool "Slow down RCU grace-period initialization to expose races" + depends on RCU_TORTURE_TEST + help + This option makes grace-period initialization block for a + few jiffies between initializing each pair of consecutive + rcu_node structures. This helps to expose races involving + grace-period initialization, in other words, it makes your + kernel less stable. It can also greatly increase grace-period + latency, especially on systems with large numbers of CPUs. + This is useful when torture-testing RCU, but in almost no + other circumstance. + + Say Y here if you want your system to crash and hang more often. + Say N if you want a sane system. + +config RCU_TORTURE_TEST_SLOW_INIT_DELAY + int "How much to slow down RCU grace-period initialization" + range 0 5 + default 0 + help + This option specifies the number of jiffies to wait between + each rcu_node structure initialization. + config RCU_CPU_STALL_TIMEOUT int "RCU CPU stall timeout in seconds" depends on RCU_STALL_COMMON -- cgit v1.2.3 From 999c286347538388170f919146d7cfa58689472e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 31 Jan 2015 21:12:02 -0800 Subject: rcu: Remove event tracing from rcu_cpu_notify(), used by offline CPUs Offline CPUs cannot safely invoke trace events, but such CPUs do execute within rcu_cpu_notify(). Therefore, this commit removes the trace events from rcu_cpu_notify(). These trace events are for utilization, against which rcu_cpu_notify() execution time should be negligible. Reported-by: Fengguang Wu Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b42001fd55fb..a7151d26b940 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3629,7 +3629,6 @@ static int rcu_cpu_notify(struct notifier_block *self, struct rcu_node *rnp = rdp->mynode; struct rcu_state *rsp; - trace_rcu_utilization(TPS("Start CPU hotplug")); switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: @@ -3661,7 +3660,6 @@ static int rcu_cpu_notify(struct notifier_block *self, default: break; } - trace_rcu_utilization(TPS("End CPU hotplug")); return NOTIFY_OK; } -- cgit v1.2.3 From cc99a310caf811aebbd0986f433d824e4a5e7ce5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Feb 2015 08:59:29 -0800 Subject: rcu: Move rcu_report_unblock_qs_rnp() to common code The rcu_report_unblock_qs_rnp() function is invoked when the last task blocking the current grace period exits its outermost RCU read-side critical section. Previously, this was called only from rcu_read_unlock_special(), and was therefore defined only when CONFIG_RCU_PREEMPT=y. However, this function will be invoked even when CONFIG_RCU_PREEMPT=n once CPU-hotplug operations are processed only at the beginnings of RCU grace periods. The reason for this change is that the last task on a given leaf rcu_node structure's ->blkd_tasks list might well exit its RCU read-side critical section between the time that recent CPU-hotplug operations were applied and when the new grace period was initialized. This situation could result in RCU waiting forever on that leaf rcu_node structure, because if all that structure's CPUs were already offline, there would be no quiescent-state events to drive that structure's part of the grace period. This commit therefore moves rcu_report_unblock_qs_rnp() to common code that is built unconditionally so that the quiescent-state-forcing code can clean up after this situation, avoiding the grace-period stall. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 39 +++++++++++++++++++++++++++++++++++++++ kernel/rcu/tree_plugin.h | 40 ++-------------------------------------- 2 files changed, 41 insertions(+), 38 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a7151d26b940..5b5cb1ff73ed 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2126,6 +2126,45 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */ } +/* + * Record a quiescent state for all tasks that were previously queued + * on the specified rcu_node structure and that were blocking the current + * RCU grace period. The caller must hold the specified rnp->lock with + * irqs disabled, and this lock is released upon return, but irqs remain + * disabled. + */ +static void __maybe_unused rcu_report_unblock_qs_rnp(struct rcu_state *rsp, + struct rcu_node *rnp, unsigned long flags) + __releases(rnp->lock) +{ + unsigned long mask; + struct rcu_node *rnp_p; + + WARN_ON_ONCE(rsp == &rcu_bh_state || rsp == &rcu_sched_state); + if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; /* Still need more quiescent states! */ + } + + rnp_p = rnp->parent; + if (rnp_p == NULL) { + /* + * Either there is only one rcu_node in the tree, + * or tasks were kicked up to root rcu_node due to + * CPUs going offline. + */ + rcu_report_qs_rsp(rsp, flags); + return; + } + + /* Report up the rest of the hierarchy. */ + mask = rnp->grpmask; + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); + rcu_report_qs_rnp(mask, rsp, rnp_p, flags); +} + /* * Record a quiescent state for the specified CPU to that CPU's rcu_data * structure. This must be either called from the specified CPU, or diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index a22721547442..ec6c2efb28cd 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -232,43 +232,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) return rnp->gp_tasks != NULL; } -/* - * Record a quiescent state for all tasks that were previously queued - * on the specified rcu_node structure and that were blocking the current - * RCU grace period. The caller must hold the specified rnp->lock with - * irqs disabled, and this lock is released upon return, but irqs remain - * disabled. - */ -static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) - __releases(rnp->lock) -{ - unsigned long mask; - struct rcu_node *rnp_p; - - if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; /* Still need more quiescent states! */ - } - - rnp_p = rnp->parent; - if (rnp_p == NULL) { - /* - * Either there is only one rcu_node in the tree, - * or tasks were kicked up to root rcu_node due to - * CPUs going offline. - */ - rcu_report_qs_rsp(&rcu_preempt_state, flags); - return; - } - - /* Report up the rest of the hierarchy. */ - mask = rnp->grpmask; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); - rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); -} - /* * Advance a ->blkd_tasks-list pointer to the next entry, instead * returning NULL if at the end of the list. @@ -399,7 +362,8 @@ void rcu_read_unlock_special(struct task_struct *t) rnp->grplo, rnp->grphi, !!rnp->gp_tasks); - rcu_report_unblock_qs_rnp(rnp, flags); + rcu_report_unblock_qs_rnp(&rcu_preempt_state, + rnp, flags); } else { raw_spin_unlock_irqrestore(&rnp->lock, flags); } -- cgit v1.2.3 From 0aa04b055e71bd3b8040dd71a126126c66b6f01e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 23 Jan 2015 21:52:37 -0800 Subject: rcu: Process offlining and onlining only at grace-period start Races between CPU hotplug and grace periods can be difficult to resolve, so the ->onoff_mutex is used to exclude the two events. Unfortunately, this means that it is impossible for an outgoing CPU to perform the last bits of its offlining from its last pass through the idle loop, because sleeplocks cannot be acquired in that context. This commit avoids these problems by buffering online and offline events in a new ->qsmaskinitnext field in the leaf rcu_node structures. When a grace period starts, the events accumulated in this mask are applied to the ->qsmaskinit field, and, if needed, up the rcu_node tree. The special case of all CPUs corresponding to a given leaf rcu_node structure being offline while there are still elements in that structure's ->blkd_tasks list is handled using a new ->wait_blkd_tasks field. In this case, propagating the offline bits up the tree is deferred until the beginning of the grace period after all of the tasks have exited their RCU read-side critical sections and removed themselves from the list, at which point the ->wait_blkd_tasks flag is cleared. If one of that leaf rcu_node structure's CPUs comes back online before the list empties, then the ->wait_blkd_tasks flag is simply cleared. This of course means that RCU's notion of which CPUs are offline can be out of date. This is OK because RCU need only wait on CPUs that were online at the time that the grace period started. In addition, RCU's force-quiescent-state actions will handle the case where a CPU goes offline after the grace period starts. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 154 +++++++++++++++++++++++++++++++++++++---------- kernel/rcu/tree.h | 9 +++ kernel/rcu/tree_plugin.h | 22 ++----- kernel/rcu/tree_trace.c | 4 +- 4 files changed, 136 insertions(+), 53 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5b5cb1ff73ed..f0f4d3510d24 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -152,6 +152,8 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); */ static int rcu_scheduler_fully_active __read_mostly; +static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); +static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); @@ -178,6 +180,17 @@ module_param(gp_init_delay, int, 0644); unsigned long rcutorture_testseq; unsigned long rcutorture_vernum; +/* + * Compute the mask of online CPUs for the specified rcu_node structure. + * This will not be stable unless the rcu_node structure's ->lock is + * held, but the bit corresponding to the current CPU will be stable + * in most contexts. + */ +unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) +{ + return ACCESS_ONCE(rnp->qsmaskinitnext); +} + /* * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s * permit this function to be invoked without holding the root rcu_node @@ -960,7 +973,7 @@ bool rcu_lockdep_current_cpu_online(void) preempt_disable(); rdp = this_cpu_ptr(&rcu_sched_data); rnp = rdp->mynode; - ret = (rdp->grpmask & rnp->qsmaskinit) || + ret = (rdp->grpmask & rcu_rnp_online_cpus(rnp)) || !rcu_scheduler_fully_active; preempt_enable(); return ret; @@ -1710,6 +1723,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) */ static int rcu_gp_init(struct rcu_state *rsp) { + unsigned long oldmask; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); @@ -1744,6 +1758,55 @@ static int rcu_gp_init(struct rcu_state *rsp) mutex_lock(&rsp->onoff_mutex); smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */ + /* + * Apply per-leaf buffered online and offline operations to the + * rcu_node tree. Note that this new grace period need not wait + * for subsequent online CPUs, and that quiescent-state forcing + * will handle subsequent offline CPUs. + */ + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); + if (rnp->qsmaskinit == rnp->qsmaskinitnext && + !rnp->wait_blkd_tasks) { + /* Nothing to do on this leaf rcu_node structure. */ + raw_spin_unlock_irq(&rnp->lock); + continue; + } + + /* Record old state, apply changes to ->qsmaskinit field. */ + oldmask = rnp->qsmaskinit; + rnp->qsmaskinit = rnp->qsmaskinitnext; + + /* If zero-ness of ->qsmaskinit changed, propagate up tree. */ + if (!oldmask != !rnp->qsmaskinit) { + if (!oldmask) /* First online CPU for this rcu_node. */ + rcu_init_new_rnp(rnp); + else if (rcu_preempt_has_tasks(rnp)) /* blocked tasks */ + rnp->wait_blkd_tasks = true; + else /* Last offline CPU and can propagate. */ + rcu_cleanup_dead_rnp(rnp); + } + + /* + * If all waited-on tasks from prior grace period are + * done, and if all this rcu_node structure's CPUs are + * still offline, propagate up the rcu_node tree and + * clear ->wait_blkd_tasks. Otherwise, if one of this + * rcu_node structure's CPUs has since come back online, + * simply clear ->wait_blkd_tasks (but rcu_cleanup_dead_rnp() + * checks for this, so just call it unconditionally). + */ + if (rnp->wait_blkd_tasks && + (!rcu_preempt_has_tasks(rnp) || + rnp->qsmaskinit)) { + rnp->wait_blkd_tasks = false; + rcu_cleanup_dead_rnp(rnp); + } + + raw_spin_unlock_irq(&rnp->lock); + } + /* * Set the quiescent-state-needed bits in all the rcu_node * structures for all currently online CPUs in breadth-first order, @@ -2133,7 +2196,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, * irqs disabled, and this lock is released upon return, but irqs remain * disabled. */ -static void __maybe_unused rcu_report_unblock_qs_rnp(struct rcu_state *rsp, +static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { @@ -2409,6 +2472,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) raw_spin_lock(&rnp->lock); /* irqs already disabled. */ smp_mb__after_unlock_lock(); /* GP memory ordering. */ rnp->qsmaskinit &= ~mask; + rnp->qsmask &= ~mask; if (rnp->qsmaskinit) { raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ return; @@ -2427,6 +2491,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { unsigned long flags; + unsigned long mask; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ @@ -2443,12 +2508,12 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ + mask = rdp->grpmask; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ - rnp->qsmaskinit &= ~rdp->grpmask; - if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp)) - rcu_cleanup_dead_rnp(rnp); - rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags); /* Rlses rnp->lock. */ + rnp->qsmaskinitnext &= ~mask; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", cpu, rdp->qlen, rdp->nxtlist); @@ -2654,12 +2719,21 @@ static void force_qs_rnp(struct rcu_state *rsp, } } if (mask != 0) { - - /* rcu_report_qs_rnp() releases rnp->lock. */ + /* Idle/offline CPUs, report. */ rcu_report_qs_rnp(mask, rsp, rnp, flags); - continue; + } else if (rnp->parent && + list_empty(&rnp->blkd_tasks) && + !rnp->qsmask && + (rnp->parent->qsmask & rnp->grpmask)) { + /* + * Race between grace-period initialization and task + * existing RCU read-side critical section, report. + */ + rcu_report_unblock_qs_rnp(rsp, rnp, flags); + } else { + /* Nothing to do here, so just drop the lock. */ + raw_spin_unlock_irqrestore(&rnp->lock, flags); } - raw_spin_unlock_irqrestore(&rnp->lock, flags); } } @@ -3568,6 +3642,28 @@ void rcu_barrier_sched(void) } EXPORT_SYMBOL_GPL(rcu_barrier_sched); +/* + * Propagate ->qsinitmask bits up the rcu_node tree to account for the + * first CPU in a given leaf rcu_node structure coming online. The caller + * must hold the corresponding leaf rcu_node ->lock with interrrupts + * disabled. + */ +static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) +{ + long mask; + struct rcu_node *rnp = rnp_leaf; + + for (;;) { + mask = rnp->grpmask; + rnp = rnp->parent; + if (rnp == NULL) + return; + raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */ + rnp->qsmaskinit |= mask; + raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ + } +} + /* * Do boot-time initialization of a CPU's per-CPU RCU data. */ @@ -3620,31 +3716,23 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - /* Add CPU to rcu_node bitmasks. */ + /* + * Add CPU to leaf rcu_node pending-online bitmask. Any needed + * propagation up the rcu_node tree will happen at the beginning + * of the next grace period. + */ rnp = rdp->mynode; mask = rdp->grpmask; - do { - /* Exclude any attempts to start a new GP on small systems. */ - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rnp->qsmaskinit |= mask; - mask = rnp->grpmask; - if (rnp == rdp->mynode) { - /* - * If there is a grace period in progress, we will - * set up to wait for it next time we run the - * RCU core code. - */ - rdp->gpnum = rnp->completed; - rdp->completed = rnp->completed; - rdp->passed_quiesce = 0; - rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); - rdp->qs_pending = 0; - trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); - } - raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ - rnp = rnp->parent; - } while (rnp != NULL && !(rnp->qsmaskinit & mask)); - local_irq_restore(flags); + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); + rnp->qsmaskinitnext |= mask; + rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ + rdp->completed = rnp->completed; + rdp->passed_quiesce = false; + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); + rdp->qs_pending = false; + trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); + raw_spin_unlock_irqrestore(&rnp->lock, flags); mutex_unlock(&rsp->onoff_mutex); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 119de399eb2f..aa42562ff5b2 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -141,12 +141,20 @@ struct rcu_node { /* complete (only for PREEMPT_RCU). */ unsigned long qsmaskinit; /* Per-GP initial value for qsmask & expmask. */ + /* Initialized from ->qsmaskinitnext at the */ + /* beginning of each grace period. */ + unsigned long qsmaskinitnext; + /* Online CPUs for next grace period. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ int grplo; /* lowest-numbered CPU or group here. */ int grphi; /* highest-numbered CPU or group here. */ u8 grpnum; /* CPU/group number for next level up. */ u8 level; /* root is at level 0. */ + bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */ + /* exit RCU read-side critical sections */ + /* before propagating offline up the */ + /* rcu_node tree? */ struct rcu_node *parent; struct list_head blkd_tasks; /* Tasks blocked in RCU read-side critical */ @@ -559,6 +567,7 @@ static void rcu_prepare_kthreads(int cpu); static void rcu_cleanup_after_idle(void); static void rcu_prepare_for_idle(void); static void rcu_idle_count_callbacks_posted(void); +static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static void print_cpu_stall_info_begin(void); static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); static void print_cpu_stall_info_end(void); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ec6c2efb28cd..d45e961515c1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -180,7 +180,7 @@ static void rcu_preempt_note_context_switch(void) * But first, note that the current CPU must still be * on line! */ - WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); + WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0); WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); @@ -263,7 +263,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) */ void rcu_read_unlock_special(struct task_struct *t) { - bool empty; bool empty_exp; bool empty_norm; bool empty_exp_now; @@ -319,7 +318,6 @@ void rcu_read_unlock_special(struct task_struct *t) break; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } - empty = !rcu_preempt_has_tasks(rnp); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = !rcu_preempted_readers_exp(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ @@ -339,14 +337,6 @@ void rcu_read_unlock_special(struct task_struct *t) drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; #endif /* #ifdef CONFIG_RCU_BOOST */ - /* - * If this was the last task on the list, go see if we - * need to propagate ->qsmaskinit bit clearing up the - * rcu_node tree. - */ - if (!empty && !rcu_preempt_has_tasks(rnp)) - rcu_cleanup_dead_rnp(rnp); - /* * If this was the last task on the current list, and if * we aren't waiting on any CPUs, report the quiescent state. @@ -868,8 +858,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) return 0; } -#ifdef CONFIG_HOTPLUG_CPU - /* * Because there is no preemptible RCU, there can be no readers blocked. */ @@ -878,8 +866,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) return false; } -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - /* * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections. @@ -1179,7 +1165,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) * Returns zero if all is well, a negated errno otherwise. */ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, - struct rcu_node *rnp) + struct rcu_node *rnp) { int rnp_index = rnp - &rsp->node[0]; unsigned long flags; @@ -1189,7 +1175,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, if (&rcu_preempt_state != rsp) return 0; - if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0) + if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0) return 0; rsp->boost = 1; @@ -1282,7 +1268,7 @@ static void rcu_cpu_kthread(unsigned int cpu) static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) { struct task_struct *t = rnp->boost_kthread_task; - unsigned long mask = rnp->qsmaskinit; + unsigned long mask = rcu_rnp_online_cpus(rnp); cpumask_var_t cm; int cpu; diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index fbb6240509ea..f92361efd0f5 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -283,8 +283,8 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) seq_puts(m, "\n"); level = rnp->level; } - seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ", - rnp->qsmask, rnp->qsmaskinit, + seq_printf(m, "%lx/%lx->%lx %c%c>%c %d:%d ^%d ", + rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext, ".G"[rnp->gp_tasks != NULL], ".E"[rnp->exp_tasks != NULL], ".T"[!list_empty(&rnp->blkd_tasks)], -- cgit v1.2.3 From c199068913c9c5cbb5498e289bb387703e087ea8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 23 Jan 2015 22:29:37 -0800 Subject: rcu: Eliminate ->onoff_mutex from rcu_node structure Because that RCU grace-period initialization need no longer exclude CPU-hotplug operations, this commit eliminates the ->onoff_mutex and its uses. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 15 --------------- kernel/rcu/tree.h | 2 -- 2 files changed, 17 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f0f4d3510d24..79d53399247e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -101,7 +101,6 @@ struct rcu_state sname##_state = { \ .orphan_nxttail = &sname##_state.orphan_nxtlist, \ .orphan_donetail = &sname##_state.orphan_donelist, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ - .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ }; \ @@ -1754,10 +1753,6 @@ static int rcu_gp_init(struct rcu_state *rsp) trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); raw_spin_unlock_irq(&rnp->lock); - /* Exclude any concurrent CPU-hotplug operations. */ - mutex_lock(&rsp->onoff_mutex); - smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */ - /* * Apply per-leaf buffered online and offline operations to the * rcu_node tree. Note that this new grace period need not wait @@ -1844,7 +1839,6 @@ static int rcu_gp_init(struct rcu_state *rsp) schedule_timeout_uninterruptible(gp_init_delay); } - mutex_unlock(&rsp->onoff_mutex); return 1; } @@ -2498,9 +2492,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* Adjust any no-longer-needed kthreads. */ rcu_boost_kthread_setaffinity(rnp, -1); - /* Exclude any attempts to start a new grace period. */ - mutex_lock(&rsp->onoff_mutex); - /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ raw_spin_lock_irqsave(&rsp->orphan_lock, flags); rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); @@ -2517,7 +2508,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", cpu, rdp->qlen, rdp->nxtlist); - mutex_unlock(&rsp->onoff_mutex); } #else /* #ifdef CONFIG_HOTPLUG_CPU */ @@ -3700,9 +3690,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); - /* Exclude new grace periods. */ - mutex_lock(&rsp->onoff_mutex); - /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave(&rnp->lock, flags); rdp->beenonline = 1; /* We have now been online. */ @@ -3733,8 +3720,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->qs_pending = false; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); raw_spin_unlock_irqrestore(&rnp->lock, flags); - - mutex_unlock(&rsp->onoff_mutex); } static void rcu_prepare_cpu(int cpu) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index aa42562ff5b2..a69d3dab2ec4 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -456,8 +456,6 @@ struct rcu_state { long qlen; /* Total number of callbacks. */ /* End of fields guarded by orphan_lock. */ - struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */ - struct mutex barrier_mutex; /* Guards barrier fields. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ struct completion barrier_completion; /* Wake at barrier end. */ -- cgit v1.2.3 From 88428cc5c27c63a4313e213813bc39b9899224d5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 28 Jan 2015 14:42:09 -0800 Subject: rcu: Handle outgoing CPUs on exit from idle loop This commit informs RCU of an outgoing CPU just before that CPU invokes arch_cpu_idle_dead() during its last pass through the idle loop (via a new CPU_DYING_IDLE notifier value). This change means that RCU need not deal with outgoing CPUs passing through the scheduler after informing RCU that they are no longer online. Note that removing the CPU from the rcu_node ->qsmaskinit bit masks is done at CPU_DYING_IDLE time, and orphaning callbacks is still done at CPU_DEAD time, the reason being that at CPU_DEAD time we have another CPU that can adopt them. Signed-off-by: Paul E. McKenney --- include/linux/cpu.h | 2 ++ include/linux/rcupdate.h | 2 ++ kernel/rcu/tree.c | 41 +++++++++++++++++++++++++++++++---------- kernel/sched/idle.c | 2 ++ 4 files changed, 37 insertions(+), 10 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 4744ef915acd..d028721748d4 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -95,6 +95,8 @@ enum { * Called on the new cpu, just before * enabling interrupts. Must not sleep, * must not fail */ +#define CPU_DYING_IDLE 0x000B /* CPU (unsigned)v dying, reached + * idle loop. */ #define CPU_BROKEN 0x000C /* CPU (unsigned)v did not die properly, * perhaps due to preemption. */ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 78097491cd99..762022f07afd 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -266,6 +266,8 @@ void rcu_idle_enter(void); void rcu_idle_exit(void); void rcu_irq_enter(void); void rcu_irq_exit(void); +int rcu_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu); #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 79d53399247e..d5247ed44004 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2475,6 +2475,26 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) } } +/* + * The CPU is exiting the idle loop into the arch_cpu_idle_dead() + * function. We now remove it from the rcu_node tree's ->qsmaskinit + * bit masks. + */ +static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) +{ + unsigned long flags; + unsigned long mask; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + + /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ + mask = rdp->grpmask; + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ + rnp->qsmaskinitnext &= ~mask; + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + /* * The CPU has been completely removed, and some other CPU is reporting * this fact from process context. Do the remainder of the cleanup, @@ -2485,7 +2505,6 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { unsigned long flags; - unsigned long mask; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ @@ -2498,13 +2517,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) rcu_adopt_orphan_cbs(rsp, flags); raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); - /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ - mask = rdp->grpmask; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ - rnp->qsmaskinitnext &= ~mask; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", cpu, rdp->qlen, rdp->nxtlist); @@ -2520,6 +2532,10 @@ static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) { } +static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) +{ +} + static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { } @@ -3733,8 +3749,8 @@ static void rcu_prepare_cpu(int cpu) /* * Handle CPU online/offline notification events. */ -static int rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +int rcu_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) { long cpu = (long)hcpu; struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); @@ -3760,6 +3776,11 @@ static int rcu_cpu_notify(struct notifier_block *self, for_each_rcu_flavor(rsp) rcu_cleanup_dying_cpu(rsp); break; + case CPU_DYING_IDLE: + for_each_rcu_flavor(rsp) { + rcu_cleanup_dying_idle_cpu(cpu, rsp); + } + break; case CPU_DEAD: case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index e99e361ade20..b0090accfb5b 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -225,6 +225,8 @@ static void cpu_idle_loop(void) rmb(); if (cpu_is_offline(smp_processor_id())) { + rcu_cpu_notify(NULL, CPU_DYING_IDLE, + (void *)(long)smp_processor_id()); smp_mb(); /* all activity before dead. */ this_cpu_write(cpu_dead_idle, true); arch_cpu_idle_dead(); -- cgit v1.2.3 From 5c60d25fa1b22fdcf141f8006d31c32b08db7311 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Feb 2015 05:37:47 -0800 Subject: rcu: Add diagnostics to grace-period cleanup At grace-period initialization time, RCU checks that all quiescent states were really reported for the previous grace period. Now that grace-period cleanup has been split out of grace-period initialization, this commit also performs those checks at grace-period cleanup time. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d5247ed44004..17b5abf999ca 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1920,6 +1920,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); + WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); + WARN_ON_ONCE(rnp->qsmask); ACCESS_ONCE(rnp->completed) = rsp->gpnum; rdp = this_cpu_ptr(rsp->rda); if (rnp == rdp->mynode) -- cgit v1.2.3 From a77da14ce9afb338040b405f6ab8afddc310411d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 8 Mar 2015 14:52:27 -0700 Subject: rcu: Yet another fix for preemption and CPU hotplug As noted earlier, the following sequence of events can occur when running PREEMPT_RCU and HOTPLUG_CPU on a system with a multi-level rcu_node combining tree: 1. A group of tasks block on CPUs corresponding to a given leaf rcu_node structure while within RCU read-side critical sections. 2. All CPUs corrsponding to that rcu_node structure go offline. 3. The next grace period starts, but because there are still tasks blocked, the upper-level bits corresponding to this leaf rcu_node structure remain set. 4. All the tasks exit their RCU read-side critical sections and remove themselves from the leaf rcu_node structure's list, leaving it empty. 5. But because there now is code to check for this condition at force-quiescent-state time, the upper bits are cleared and the grace period completes. However, there is another complication that can occur following step 4 above: 4a. The grace period starts, and the leaf rcu_node structure's gp_tasks pointer is set to NULL because there are no tasks blocked on this structure. 4b. One of the CPUs corresponding to the leaf rcu_node structure comes back online. 4b. An endless stream of tasks are preempted within RCU read-side critical sections on this CPU, such that the ->blkd_tasks list is always non-empty. The grace period will never end. This commit therefore makes the force-quiescent-state processing check only for absence of tasks blocking the current grace period rather than absence of tasks altogether. This will cause a quiescent state to be reported if the current leaf rcu_node structure is not blocking the current grace period and its parent thinks that it is, regardless of how RCU managed to get itself into this state. Signed-off-by: Paul E. McKenney Cc: # 4.0.x Tested-by: Sasha Levin --- kernel/rcu/tree.c | 43 +++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 17b5abf999ca..b3684b284677 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2199,8 +2199,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, unsigned long mask; struct rcu_node *rnp_p; - WARN_ON_ONCE(rsp == &rcu_bh_state || rsp == &rcu_sched_state); - if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { + if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || + rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; /* Still need more quiescent states! */ } @@ -2208,9 +2208,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, rnp_p = rnp->parent; if (rnp_p == NULL) { /* - * Either there is only one rcu_node in the tree, - * or tasks were kicked up to root rcu_node due to - * CPUs going offline. + * Only one rcu_node structure in the tree, so don't + * try to report up to its nonexistent parent! */ rcu_report_qs_rsp(rsp, flags); return; @@ -2713,8 +2712,29 @@ static void force_qs_rnp(struct rcu_state *rsp, return; } if (rnp->qsmask == 0) { - rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ - continue; + if (rcu_state_p == &rcu_sched_state || + rsp != rcu_state_p || + rcu_preempt_blocked_readers_cgp(rnp)) { + /* + * No point in scanning bits because they + * are all zero. But we might need to + * priority-boost blocked readers. + */ + rcu_initiate_boost(rnp, flags); + /* rcu_initiate_boost() releases rnp->lock */ + continue; + } + if (rnp->parent && + (rnp->parent->qsmask & rnp->grpmask)) { + /* + * Race between grace-period + * initialization and task exiting RCU + * read-side critical section: Report. + */ + rcu_report_unblock_qs_rnp(rsp, rnp, flags); + /* rcu_report_unblock_qs_rnp() rlses ->lock */ + continue; + } } cpu = rnp->grplo; bit = 1; @@ -2729,15 +2749,6 @@ static void force_qs_rnp(struct rcu_state *rsp, if (mask != 0) { /* Idle/offline CPUs, report. */ rcu_report_qs_rnp(mask, rsp, rnp, flags); - } else if (rnp->parent && - list_empty(&rnp->blkd_tasks) && - !rnp->qsmask && - (rnp->parent->qsmask & rnp->grpmask)) { - /* - * Race between grace-period initialization and task - * existing RCU read-side critical section, report. - */ - rcu_report_unblock_qs_rnp(rsp, rnp, flags); } else { /* Nothing to do here, so just drop the lock. */ raw_spin_unlock_irqrestore(&rnp->lock, flags); -- cgit v1.2.3 From 654e953340491e498871321d7e2c9b0a12821933 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 15 Mar 2015 09:19:35 -0700 Subject: rcu: Associate quiescent-state reports with grace period As noted in earlier commit logs, CPU hotplug operations running concurrently with grace-period initialization can result in a given leaf rcu_node structure having all CPUs offline and no blocked readers, but with this rcu_node structure nevertheless blocking the current grace period. Therefore, the quiescent-state forcing code now checks for this situation and repairs it. Unfortunately, this checking can result in false positives, for example, when the last task has just removed itself from this leaf rcu_node structure, but has not yet started clearing the ->qsmask bits further up the structure. This means that the grace-period kthread (which forces quiescent states) and some other task might be attempting to concurrently clear these ->qsmask bits. This is usually not a problem: One of these tasks will be the first to acquire the upper-level rcu_node structure's lock and with therefore clear the bit, and the other task, seeing the bit already cleared, will stop trying to clear bits. Sadly, this means that the following unusual sequence of events -can- result in a problem: 1. The grace-period kthread wins, and clears the ->qsmask bits. 2. This is the last thing blocking the current grace period, so that the grace-period kthread clears ->qsmask bits all the way to the root and finds that the root ->qsmask field is now zero. 3. Another grace period is required, so that the grace period kthread initializes it, including setting all the needed qsmask bits. 4. The leaf rcu_node structure (the one that started this whole mess) is blocking this new grace period, either because it has at least one online CPU or because there is at least one task that had blocked within an RCU read-side critical section while running on one of this leaf rcu_node structure's CPUs. (And yes, that CPU might well have gone offline before the grace period in step (3) above started, which can mean that there is a task on the leaf rcu_node structure's ->blkd_tasks list, but ->qsmask equal to zero.) 5. The other kthread didn't get around to trying to clear the upper level ->qsmask bits until all the above had happened. This means that it now sees bits set in the upper-level ->qsmask field, so it proceeds to clear them. Too bad that it is doing so on behalf of a quiescent state that does not apply to the current grace period! This sequence of events can result in the new grace period being too short. It can also result in the new grace period ending before the leaf rcu_node structure's ->qsmask bits have been cleared, which will result in splats during initialization of the next grace period. In addition, it can result in tasks blocking the new grace period still being queued at the start of the next grace period, which will result in other splats. Sasha's testing turned up another of these splats, as did rcutorture testing. (And yes, rcutorture is being adjusted to make these splats show up more quickly. Which probably is having the undesirable side effect of making other problems show up less quickly. Can't have everything!) Reported-by: Sasha Levin Signed-off-by: Paul E. McKenney Cc: # 4.0.x Tested-by: Sasha Levin --- kernel/rcu/tree.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b3684b284677..8fcc64ed858c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2132,25 +2132,32 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) * Similar to rcu_report_qs_rdp(), for which it is a helper function. * Allows quiescent states for a group of CPUs to be reported at one go * to the specified rcu_node structure, though all the CPUs in the group - * must be represented by the same rcu_node structure (which need not be - * a leaf rcu_node structure, though it often will be). That structure's - * lock must be held upon entry, and it is released before return. + * must be represented by the same rcu_node structure (which need not be a + * leaf rcu_node structure, though it often will be). The gps parameter + * is the grace-period snapshot, which means that the quiescent states + * are valid only if rnp->gpnum is equal to gps. That structure's lock + * must be held upon entry, and it is released before return. */ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, - struct rcu_node *rnp, unsigned long flags) + struct rcu_node *rnp, unsigned long gps, unsigned long flags) __releases(rnp->lock) { + unsigned long oldmask = 0; struct rcu_node *rnp_c; /* Walk up the rcu_node hierarchy. */ for (;;) { - if (!(rnp->qsmask & mask)) { + if (!(rnp->qsmask & mask) || rnp->gpnum != gps) { - /* Our bit has already been cleared, so done. */ + /* + * Our bit has already been cleared, or the + * relevant grace period is already over, so done. + */ raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } + WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ rnp->qsmask &= ~mask; trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, mask, rnp->qsmask, rnp->level, @@ -2174,7 +2181,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, rnp = rnp->parent; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - WARN_ON_ONCE(rnp_c->qsmask); + oldmask = rnp_c->qsmask; } /* @@ -2196,6 +2203,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { + unsigned long gps; unsigned long mask; struct rcu_node *rnp_p; @@ -2215,12 +2223,13 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, return; } - /* Report up the rest of the hierarchy. */ + /* Report up the rest of the hierarchy, tracking current ->gpnum. */ + gps = rnp->gpnum; mask = rnp->grpmask; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ smp_mb__after_unlock_lock(); - rcu_report_qs_rnp(mask, rsp, rnp_p, flags); + rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); } /* @@ -2271,7 +2280,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) */ needwake = rcu_accelerate_cbs(rsp, rnp, rdp); - rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ + rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); + /* ^^^ Released rnp->lock */ if (needwake) rcu_gp_kthread_wake(rsp); } @@ -2747,8 +2757,8 @@ static void force_qs_rnp(struct rcu_state *rsp, } } if (mask != 0) { - /* Idle/offline CPUs, report. */ - rcu_report_qs_rnp(mask, rsp, rnp, flags); + /* Idle/offline CPUs, report (releases rnp->lock. */ + rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); } else { /* Nothing to do here, so just drop the lock. */ raw_spin_unlock_irqrestore(&rnp->lock, flags); -- cgit v1.2.3