From 41050a009640f9f330a5b916563ca7faf853a98c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 18 Dec 2014 12:31:27 -0800 Subject: rcu: Fix rcu_barrier() race that could result in too-short wait The rcu_barrier() no-callbacks check for no-CBs CPUs has race conditions. It checks a given CPU's lists of callbacks, and if all three no-CBs lists are empty, ignores that CPU. However, these three lists could potentially be empty even when callbacks are present if the check executed just as the callbacks were being moved from one list to another. It turns out that recent versions of rcutorture can spot this race. This commit plugs this hole by consolidating the per-list counts of no-CBs callbacks into a single count, which is incremented before the corresponding callback is posted and after it is invoked. Then rcu_barrier() checks this single count to reliably determine whether the corresponding CPU has no-CBs callbacks. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) (limited to 'kernel/rcu/tree.h') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8e7b1843896e..cb5908672f11 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -340,14 +340,10 @@ struct rcu_data { #ifdef CONFIG_RCU_NOCB_CPU struct rcu_head *nocb_head; /* CBs waiting for kthread. */ struct rcu_head **nocb_tail; - atomic_long_t nocb_q_count; /* # CBs waiting for kthread */ - atomic_long_t nocb_q_count_lazy; /* (approximate). */ + atomic_long_t nocb_q_count; /* # CBs waiting for nocb */ + atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */ struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ struct rcu_head **nocb_follower_tail; - atomic_long_t nocb_follower_count; /* # CBs ready to invoke. */ - atomic_long_t nocb_follower_count_lazy; /* (approximate). */ - int nocb_p_count; /* # CBs being invoked by kthread */ - int nocb_p_count_lazy; /* (approximate). */ wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_kthread; int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ @@ -356,8 +352,6 @@ struct rcu_data { struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; /* CBs waiting for GP. */ struct rcu_head **nocb_gp_tail; - long nocb_gp_count; - long nocb_gp_count_lazy; bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */ struct rcu_data *nocb_next_follower; /* Next follower in wakeup chain. */ @@ -622,24 +616,15 @@ static void rcu_dynticks_task_exit(void); #endif /* #ifndef RCU_TREE_NONCORE */ #ifdef CONFIG_RCU_TRACE -#ifdef CONFIG_RCU_NOCB_CPU -/* Sum up queue lengths for tracing. */ +/* Read out queue lengths for tracing. */ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) { - *ql = atomic_long_read(&rdp->nocb_q_count) + - rdp->nocb_p_count + - atomic_long_read(&rdp->nocb_follower_count) + - rdp->nocb_p_count + rdp->nocb_gp_count; - *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + - rdp->nocb_p_count_lazy + - atomic_long_read(&rdp->nocb_follower_count_lazy) + - rdp->nocb_p_count_lazy + rdp->nocb_gp_count_lazy; -} +#ifdef CONFIG_RCU_NOCB_CPU + *ql = atomic_long_read(&rdp->nocb_q_count); + *qll = atomic_long_read(&rdp->nocb_q_count_lazy); #else /* #ifdef CONFIG_RCU_NOCB_CPU */ -static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) -{ *ql = 0; *qll = 0; -} #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ +} #endif /* #ifdef CONFIG_RCU_TRACE */ -- cgit v1.2.3 From 5a43b88e98eaca88fa12abcacd0000adc879dd2c Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 24 Dec 2014 17:55:28 +0800 Subject: rcu: Remove "select IRQ_WORK" from config TREE_RCU The 48a7639ce80c ("rcu: Make callers awaken grace-period kthread") removed the irq_work_queue(), so the TREE_RCU doesn't need irq work any more. This commit therefore updates RCU's Kconfig and Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- init/Kconfig | 2 -- kernel/rcu/tree.h | 1 - 2 files changed, 3 deletions(-) (limited to 'kernel/rcu/tree.h') diff --git a/init/Kconfig b/init/Kconfig index 9afb971497f4..39b4313c6ca6 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -470,7 +470,6 @@ choice config TREE_RCU bool "Tree-based hierarchical RCU" depends on !PREEMPT && SMP - select IRQ_WORK help This option selects the RCU implementation that is designed for very large SMP system with hundreds or @@ -480,7 +479,6 @@ config TREE_RCU config PREEMPT_RCU bool "Preemptible tree-based hierarchical RCU" depends on PREEMPT - select IRQ_WORK help This option selects the RCU implementation that is designed for very large SMP systems with hundreds or diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index cb5908672f11..41860b9a04a2 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -27,7 +27,6 @@ #include #include #include -#include /* * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and -- cgit v1.2.3 From 8af3a5e78cfb63abe8813743946b7bd5a8a3134c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Oct 2014 11:22:37 -0700 Subject: rcu: Abstract rcu_cleanup_dead_rnp() from rcu_cleanup_dead_cpu() This commit abstracts rcu_cleanup_dead_rnp() from rcu_cleanup_dead_cpu() in preparation for the rework of RCU priority boosting. This new function will be invoked from rcu_read_unlock_special() in the reworked scheme, which is why rcu_cleanup_dead_rnp() assumes that the leaf rcu_node structure's ->qsmaskinit field has already been updated. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 67 ++++++++++++++++++++++++++++++++++-------------- kernel/rcu/tree.h | 1 + kernel/rcu/tree_plugin.h | 17 ++++++++++++ 3 files changed, 66 insertions(+), 19 deletions(-) (limited to 'kernel/rcu/tree.h') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4c106fcc0d54..75c6b3301abb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2226,6 +2226,46 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) TPS("cpuofl")); } +/* + * All CPUs for the specified rcu_node structure have gone offline, + * and all tasks that were preempted within an RCU read-side critical + * section while running on one of those CPUs have since exited their RCU + * read-side critical section. Some other CPU is reporting this fact with + * the specified rcu_node structure's ->lock held and interrupts disabled. + * This function therefore goes up the tree of rcu_node structures, + * clearing the corresponding bits in the ->qsmaskinit fields. Note that + * the leaf rcu_node structure's ->qsmaskinit field has already been + * updated + * + * This function does check that the specified rcu_node structure has + * all CPUs offline and no blocked tasks, so it is OK to invoke it + * prematurely. That said, invoking it after the fact will cost you + * a needless lock acquisition. So once it has done its work, don't + * invoke it again. + */ +static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) +{ + long mask; + struct rcu_node *rnp = rnp_leaf; + + if (rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) + return; + for (;;) { + mask = rnp->grpmask; + rnp = rnp->parent; + if (!rnp) + break; + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); /* GP memory ordering. */ + rnp->qsmaskinit &= ~mask; + if (rnp->qsmaskinit) { + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + return; + } + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + } +} + /* * The CPU has been completely removed, and some other CPU is reporting * this fact from process context. Do the remainder of the cleanup, @@ -2236,7 +2276,6 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { unsigned long flags; - unsigned long mask; int need_report = 0; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ @@ -2252,24 +2291,14 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); rcu_adopt_orphan_cbs(rsp, flags); - /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ - mask = rdp->grpmask; /* rnp->grplo is constant. */ - do { - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); - rnp->qsmaskinit &= ~mask; - if (rnp->qsmaskinit != 0) { - if (rnp != rdp->mynode) - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - break; - } - if (rnp == rdp->mynode) - need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); - else - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - mask = rnp->grpmask; - rnp = rnp->parent; - } while (rnp != NULL); + /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ + rnp->qsmaskinit &= ~rdp->grpmask; + if (rnp->qsmaskinit == 0) { + need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); + rcu_cleanup_dead_rnp(rnp); + } /* * We still hold the leaf rcu_node structure lock here, and diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8e7b1843896e..9315477b47d9 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -552,6 +552,7 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags); +static bool rcu_preempt_has_tasks(struct rcu_node *rnp); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_print_detail_task_stall(struct rcu_state *rsp); static int rcu_print_task_stall(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3d22f0b2dea9..d044b9cbbd97 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -306,6 +306,15 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t, return np; } +/* + * Return true if the specified rcu_node structure has tasks that were + * preempted within an RCU read-side critical section. + */ +static bool rcu_preempt_has_tasks(struct rcu_node *rnp) +{ + return !list_empty(&rnp->blkd_tasks); +} + /* * Handle special cases during rcu_read_unlock(), such as needing to * notify RCU core processing or task having blocked during the RCU @@ -967,6 +976,14 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) raw_spin_unlock_irqrestore(&rnp->lock, flags); } +/* + * Because there is no preemptible RCU, there can be no readers blocked. + */ +static bool rcu_preempt_has_tasks(struct rcu_node *rnp) +{ + return false; +} + #endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* -- cgit v1.2.3 From d19fb8d1f3f66cc342d30aa48f090c70afb753ed Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Oct 2014 12:56:16 -0700 Subject: rcu: Don't migrate blocked tasks even if all corresponding CPUs offline When the last CPU associated with a given leaf rcu_node structure goes offline, something must be done about the tasks queued on that rcu_node structure. Each of these tasks has been preempted on one of the leaf rcu_node structure's CPUs while in an RCU read-side critical section that it have not yet exited. Handling these tasks is the job of rcu_preempt_offline_tasks(), which migrates them from the leaf rcu_node structure to the root rcu_node structure. Unfortunately, this migration has to be done one task at a time because each tasks allegiance must be shifted from the original leaf rcu_node to the root, so that future attempts to deal with these tasks will acquire the root rcu_node structure's ->lock rather than that of the leaf. Worse yet, this migration must be done with interrupts disabled, which is not so good for realtime response, especially given that there is no bound on the number of tasks on a given rcu_node structure's list. (OK, OK, there is a bound, it is just that it is unreasonably large, especially on 64-bit systems.) This was not considered a problem back when rcu_preempt_offline_tasks() was first written because realtime systems were assumed not to do CPU-hotplug operations while real-time applications were running. This assumption has proved of dubious validity given that people are starting to run multiple realtime applications on a single SMP system and that it is common practice to offline then online a CPU before starting its real-time application in order to clear extraneous processing off of that CPU. So we now need CPU hotplug operations to avoid undue latencies. This commit therefore avoids migrating these tasks, instead letting them be dequeued one by one from the original leaf rcu_node structure by rcu_read_unlock_special(). This means that the clearing of bits from the upper-level rcu_node structures must be deferred until the last such task has been dequeued, because otherwise subsequent grace periods won't wait on them. This commit has the beneficial side effect of simplifying the CPU-hotplug code for TREE_PREEMPT_RCU, especially in CONFIG_RCU_BOOST builds. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 20 +------- kernel/rcu/tree.h | 18 ------- kernel/rcu/tree_plugin.h | 126 +---------------------------------------------- 3 files changed, 4 insertions(+), 160 deletions(-) (limited to 'kernel/rcu/tree.h') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6625a1b5d9a1..84f16cf05991 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2276,7 +2276,6 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { unsigned long flags; - int need_report = 0; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ @@ -2295,25 +2294,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) raw_spin_lock(&rnp->lock); /* irqs already disabled. */ smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ rnp->qsmaskinit &= ~rdp->grpmask; - if (rnp->qsmaskinit == 0) { - need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); + if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp)) rcu_cleanup_dead_rnp(rnp); - } - - /* - * We still hold the leaf rcu_node structure lock here, and - * irqs are still disabled. The reason for this subterfuge is - * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock - * held leads to deadlock. - */ raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */ - rnp = rdp->mynode; - if (need_report & RCU_OFL_TASKS_NORM_GP) - rcu_report_unblock_qs_rnp(rnp, flags); - else - raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (need_report & RCU_OFL_TASKS_EXP_GP) - rcu_report_exp_rnp(rsp, rnp, true); + raw_spin_unlock_irqrestore(&rnp->lock, flags); WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", cpu, rdp->qlen, rdp->nxtlist); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9315477b47d9..883ebc8e2b6e 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -514,13 +514,6 @@ extern struct list_head rcu_struct_flavors; #define for_each_rcu_flavor(rsp) \ list_for_each_entry((rsp), &rcu_struct_flavors, flavors) -/* Return values for rcu_preempt_offline_tasks(). */ - -#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ - /* GP were moved to root. */ -#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ - /* GP were moved to root. */ - /* * RCU implementation internal declarations: */ @@ -550,24 +543,13 @@ long rcu_batches_completed(void); static void rcu_preempt_note_context_switch(void); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU -static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, - unsigned long flags); static bool rcu_preempt_has_tasks(struct rcu_node *rnp); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_print_detail_task_stall(struct rcu_state *rsp); static int rcu_print_task_stall(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); -#ifdef CONFIG_HOTPLUG_CPU -static int rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_preempt_check_callbacks(void); void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); -#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake); -#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) */ static void __init __rcu_init_preempt(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 8a2b84157d34..d594da48f4b4 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -103,6 +103,8 @@ RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); static struct rcu_state *rcu_state_p = &rcu_preempt_state; static int rcu_preempted_readers_exp(struct rcu_node *rnp); +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake); /* * Tell them what RCU they are running. @@ -545,92 +547,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) #ifdef CONFIG_HOTPLUG_CPU -/* - * Handle tasklist migration for case in which all CPUs covered by the - * specified rcu_node have gone offline. Move them up to the root - * rcu_node. The reason for not just moving them to the immediate - * parent is to remove the need for rcu_read_unlock_special() to - * make more than two attempts to acquire the target rcu_node's lock. - * Returns true if there were tasks blocking the current RCU grace - * period. - * - * Returns 1 if there was previously a task blocking the current grace - * period on the specified rcu_node structure. - * - * The caller must hold rnp->lock with irqs disabled. - */ -static int rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) -{ - struct list_head *lp; - struct list_head *lp_root; - int retval = 0; - struct rcu_node *rnp_root = rcu_get_root(rsp); - struct task_struct *t; - - if (rnp == rnp_root) { - WARN_ONCE(1, "Last CPU thought to be offlined?"); - return 0; /* Shouldn't happen: at least one CPU online. */ - } - - /* If we are on an internal node, complain bitterly. */ - WARN_ON_ONCE(rnp != rdp->mynode); - - /* - * Move tasks up to root rcu_node. Don't try to get fancy for - * this corner-case operation -- just put this node's tasks - * at the head of the root node's list, and update the root node's - * ->gp_tasks and ->exp_tasks pointers to those of this node's, - * if non-NULL. This might result in waiting for more tasks than - * absolutely necessary, but this is a good performance/complexity - * tradeoff. - */ - if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0) - retval |= RCU_OFL_TASKS_NORM_GP; - if (rcu_preempted_readers_exp(rnp)) - retval |= RCU_OFL_TASKS_EXP_GP; - lp = &rnp->blkd_tasks; - lp_root = &rnp_root->blkd_tasks; - while (!list_empty(lp)) { - t = list_entry(lp->next, typeof(*t), rcu_node_entry); - raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ - smp_mb__after_unlock_lock(); - list_del(&t->rcu_node_entry); - t->rcu_blocked_node = rnp_root; - list_add(&t->rcu_node_entry, lp_root); - if (&t->rcu_node_entry == rnp->gp_tasks) - rnp_root->gp_tasks = rnp->gp_tasks; - if (&t->rcu_node_entry == rnp->exp_tasks) - rnp_root->exp_tasks = rnp->exp_tasks; -#ifdef CONFIG_RCU_BOOST - if (&t->rcu_node_entry == rnp->boost_tasks) - rnp_root->boost_tasks = rnp->boost_tasks; -#endif /* #ifdef CONFIG_RCU_BOOST */ - raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ - } - - rnp->gp_tasks = NULL; - rnp->exp_tasks = NULL; -#ifdef CONFIG_RCU_BOOST - rnp->boost_tasks = NULL; - /* - * In case root is being boosted and leaf was not. Make sure - * that we boost the tasks blocking the current grace period - * in this case. - */ - raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ - smp_mb__after_unlock_lock(); - if (rnp_root->boost_tasks != NULL && - rnp_root->boost_tasks != rnp_root->gp_tasks && - rnp_root->boost_tasks != rnp_root->exp_tasks) - rnp_root->boost_tasks = rnp_root->gp_tasks; - raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ -#endif /* #ifdef CONFIG_RCU_BOOST */ - - return retval; -} - #endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* @@ -979,13 +895,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) #ifdef CONFIG_HOTPLUG_CPU -/* Because preemptible RCU does not exist, no quieting of tasks. */ -static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) - __releases(rnp->lock) -{ - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - /* * Because there is no preemptible RCU, there can be no readers blocked. */ @@ -1023,23 +932,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) WARN_ON_ONCE(rnp->qsmask); } -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Because preemptible RCU does not exist, it never needs to migrate - * tasks that were blocked within RCU read-side critical sections, and - * such non-existent tasks cannot possibly have been blocking the current - * grace period. - */ -static int rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) -{ - return 0; -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - /* * Because preemptible RCU does not exist, it never has any callbacks * to check. @@ -1058,20 +950,6 @@ void synchronize_rcu_expedited(void) } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Because preemptible RCU does not exist, there is never any need to - * report on tasks preempted in RCU read-side critical sections during - * expedited RCU grace periods. - */ -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake) -{ -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - /* * Because preemptible RCU does not exist, rcu_barrier() is just * another name for rcu_barrier_sched(). -- cgit v1.2.3 From abaf3f9d275b8d856ae5e47531e40c0bfeac012b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 18 Nov 2014 16:30:01 +0800 Subject: rcu: Revert "Allow post-unlock reference for rt_mutex" to avoid priority-inversion The patch dfeb9765ce3c ("Allow post-unlock reference for rt_mutex") ensured rcu-boost safe even the rt_mutex has post-unlock reference. But rt_mutex allowing post-unlock reference is definitely a bug and it was fixed by the commit 27e35715df54 ("rtmutex: Plug slow unlock race"). This fix made the previous patch (dfeb9765ce3c) useless. And even worse, the priority-inversion introduced by the the previous patch still exists. rcu_read_unlock_special() { rt_mutex_unlock(&rnp->boost_mtx); /* Priority-Inversion: * the current task had been deboosted and preempted as a low * priority task immediately, it could wait long before reschedule in, * and the rcu-booster also waits on this low priority task and sleeps. * This priority-inversion makes rcu-booster can't work * as expected. */ complete(&rnp->boost_completion); } Just revert the patch to avoid it. Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Peter Zijlstra Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 5 ----- kernel/rcu/tree_plugin.h | 8 +------- 2 files changed, 1 insertion(+), 12 deletions(-) (limited to 'kernel/rcu/tree.h') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 883ebc8e2b6e..95356477d560 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -172,11 +172,6 @@ struct rcu_node { /* queued on this rcu_node structure that */ /* are blocking the current grace period, */ /* there can be no such task. */ - struct completion boost_completion; - /* Used to ensure that the rt_mutex used */ - /* to carry out the boosting is fully */ - /* released with no future boostee accesses */ - /* before that rt_mutex is re-initialized. */ struct rt_mutex boost_mtx; /* Used only for the priority-boosting */ /* side effect, not as a lock. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1fac68220999..625e26040e6b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -429,10 +429,8 @@ void rcu_read_unlock_special(struct task_struct *t) #ifdef CONFIG_RCU_BOOST /* Unboost if we were boosted. */ - if (drop_boost_mutex) { + if (drop_boost_mutex) rt_mutex_unlock(&rnp->boost_mtx); - complete(&rnp->boost_completion); - } #endif /* #ifdef CONFIG_RCU_BOOST */ /* @@ -1081,15 +1079,11 @@ static int rcu_boost(struct rcu_node *rnp) */ t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); - init_completion(&rnp->boost_completion); raw_spin_unlock_irqrestore(&rnp->lock, flags); /* Lock only for side effect: boosts task t's priority. */ rt_mutex_lock(&rnp->boost_mtx); rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ - /* Wait for boostee to be done w/boost_mtx before reinitializing. */ - wait_for_completion(&rnp->boost_completion); - return ACCESS_ONCE(rnp->exp_tasks) != NULL || ACCESS_ONCE(rnp->boost_tasks) != NULL; } -- cgit v1.2.3 From fc908ed33e7c1428f799abb12399f906da03b397 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 8 Dec 2014 09:57:48 -0800 Subject: rcu: Make RCU_CPU_STALL_INFO include number of fqs attempts One way that an RCU CPU stall warning can happen is if the grace-period kthread is not allowed to execute. One proxy for this kthread's forward progress is the number of force-quiescent-state (fqs) scans. This commit therefore adds the number of fqs scans to the RCU CPU stall warning printouts when CONFIG_RCU_CPU_STALL_INFO=y. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + kernel/rcu/tree.h | 2 ++ kernel/rcu/tree_plugin.h | 3 ++- 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel/rcu/tree.h') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4c106fcc0d54..654b15be1e36 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1043,6 +1043,7 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) j1 = rcu_jiffies_till_stall_check(); ACCESS_ONCE(rsp->jiffies_stall) = j + j1; rsp->jiffies_resched = j + j1 / 2; + rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs); } /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8e7b1843896e..e300848cc0cf 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -492,6 +492,8 @@ struct rcu_state { /* for CPU stalls. */ unsigned long jiffies_resched; /* Time at which to resched */ /* a reluctant CPU. */ + unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */ + /* GP start. */ unsigned long gp_max; /* Maximum GP duration in */ /* jiffies. */ const char *name; /* Name of structure. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3ec85cb5d544..769384d77437 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1898,11 +1898,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", + pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", cpu, ticks_value, ticks_title, atomic_read(&rdtp->dynticks) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), + ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart, fast_no_hz); } -- cgit v1.2.3 From 6ccd2ecd422644277b7d8b37222e3af3f43ea9ae Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Dec 2014 10:20:59 -0800 Subject: rcu: Improve diagnostics for spurious RCU CPU stall warnings The current RCU CPU stall warning code will print "Stall ended before state dump start" any time that the stall-warning code is triggered on a CPU that has already reported a quiescent state for the current grace period and if all quiescent states have been reported for the current grace period. However, a true stall can result in these symptoms, for example, by preventing RCU's grace-period kthreads from ever running This commit therefore checks for this condition, reporting the end of the stall only if one of the grace-period counters has actually advanced. Otherwise, it reports the last time that the grace-period kthread made meaningful progress. (In normal situations, the grace-period kthread should make meaningful progress at least every jiffies_till_next_fqs jiffies.) Reported-by: Miroslav Benes Signed-off-by: Paul E. McKenney Tested-by: Miroslav Benes --- Documentation/RCU/stallwarn.txt | 5 +++++ kernel/rcu/tree.c | 32 +++++++++++++++++++++++++++----- kernel/rcu/tree.h | 2 ++ 3 files changed, 34 insertions(+), 5 deletions(-) (limited to 'kernel/rcu/tree.h') diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index ed186a902d31..55f9707fe60a 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -187,6 +187,11 @@ o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the behavior, you might need to replace some of the cond_resched() calls with calls to cond_resched_rcu_qs(). +o Anything that prevents RCU's grace-period kthreads from running. + This can result in the "All QSes seen" console-log message. + This message will include information on when the kthread last + ran and how often it should be expected to run. + o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might happen to preempt a low-priority task in the middle of an RCU read-side critical section. This is especially damaging if diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 654b15be1e36..a2ceb66bcd67 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1066,11 +1066,13 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) } } -static void print_other_cpu_stall(struct rcu_state *rsp) +static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) { int cpu; long delta; unsigned long flags; + unsigned long gpa; + unsigned long j; int ndetected = 0; struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; @@ -1123,10 +1125,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp) pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start), (long)rsp->gpnum, (long)rsp->completed, totqlen); - if (ndetected == 0) - pr_err("INFO: Stall ended before state dump start\n"); - else + if (ndetected) { rcu_dump_cpu_stacks(rsp); + } else { + if (ACCESS_ONCE(rsp->gpnum) != gpnum || + ACCESS_ONCE(rsp->completed) == gpnum) { + pr_err("INFO: Stall ended before state dump start\n"); + } else { + j = jiffies; + gpa = ACCESS_ONCE(rsp->gp_activity); + pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n", + rsp->name, j - gpa, j, gpa, + jiffies_till_next_fqs); + /* In this case, the current CPU might be at fault. */ + sched_show_task(current); + } + } /* Complain about tasks blocking the grace period. */ @@ -1226,7 +1240,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { /* They had a few time units to dump stack, so complain. */ - print_other_cpu_stall(rsp); + print_other_cpu_stall(rsp, gpnum); } } @@ -1622,6 +1636,7 @@ static int rcu_gp_init(struct rcu_state *rsp) struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); + ACCESS_ONCE(rsp->gp_activity) = jiffies; rcu_bind_gp_kthread(); raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); @@ -1682,6 +1697,7 @@ static int rcu_gp_init(struct rcu_state *rsp) rnp->grphi, rnp->qsmask); raw_spin_unlock_irq(&rnp->lock); cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; } mutex_unlock(&rsp->onoff_mutex); @@ -1698,6 +1714,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) unsigned long maxj; struct rcu_node *rnp = rcu_get_root(rsp); + ACCESS_ONCE(rsp->gp_activity) = jiffies; rsp->n_force_qs++; if (fqs_state == RCU_SAVE_DYNTICK) { /* Collect dyntick-idle snapshots. */ @@ -1736,6 +1753,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); + ACCESS_ONCE(rsp->gp_activity) = jiffies; raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); gp_duration = jiffies - rsp->gp_start; @@ -1772,6 +1790,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) nocb += rcu_future_gp_cleanup(rsp, rnp); raw_spin_unlock_irq(&rnp->lock); cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; } rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); @@ -1821,6 +1840,7 @@ static int __noreturn rcu_gp_kthread(void *arg) if (rcu_gp_init(rsp)) break; cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), @@ -1864,9 +1884,11 @@ static int __noreturn rcu_gp_kthread(void *arg) ACCESS_ONCE(rsp->gpnum), TPS("fqsend")); cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; } else { /* Deal with stray signal. */ cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e300848cc0cf..5ec81cf938fd 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -488,6 +488,8 @@ struct rcu_state { /* due to no GP active. */ unsigned long gp_start; /* Time at which GP started, */ /* but in jiffies. */ + unsigned long gp_activity; /* Time of last GP kthread */ + /* activity in jiffies. */ unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ unsigned long jiffies_resched; /* Time at which to resched */ -- cgit v1.2.3 From e3663b1024d1f94688e5233440ad67a9bc10b94e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 8 Dec 2014 20:26:55 -0800 Subject: rcu: Handle gpnum/completed wrap while dyntick idle Subtle race conditions can result if a CPU stays in dyntick-idle mode long enough for the ->gpnum and ->completed fields to wrap. For example, consider the following sequence of events: o CPU 1 encounters a quiescent state while waiting for grace period 5 to complete, but then enters dyntick-idle mode. o While CPU 1 is in dyntick-idle mode, the grace-period counters wrap around so that the grace period number is now 4. o Just as CPU 1 exits dyntick-idle mode, grace period 4 completes and grace period 5 begins. o The quiescent state that CPU 1 passed through during the old grace period 5 looks like it applies to the new grace period 5. Therefore, the new grace period 5 completes without CPU 1 having passed through a quiescent state. This could clearly be a fatal surprise to any long-running RCU read-side critical section that happened to be running on CPU 1 at the time. At one time, this was not a problem, given that it takes significant time for the grace-period counters to overflow even on 32-bit systems. However, with the advent of NO_HZ_FULL and SMP embedded systems, arbitrarily long idle periods are now becoming quite feasible. It is therefore time to close this race. This commit therefore avoids this race condition by having the quiescent-state forcing code detect when a CPU is falling too far behind, and setting a new rcu_data field ->gpwrap when this happens. Whenever this new ->gpwrap field is set, the CPU's ->gpnum and ->completed fields are known to be untrustworthy, and can be ignored, along with any associated quiescent states. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 17 ++++++++++++----- kernel/rcu/tree.h | 1 + kernel/rcu/tree_plugin.h | 3 ++- 3 files changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel/rcu/tree.h') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a2ceb66bcd67..5987fdc85fc4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -930,6 +930,9 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); return 1; } else { + if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4, + rdp->mynode->gpnum)) + ACCESS_ONCE(rdp->gpwrap) = true; return 0; } } @@ -1577,7 +1580,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, bool ret; /* Handle the ends of any preceding grace periods first. */ - if (rdp->completed == rnp->completed) { + if (rdp->completed == rnp->completed && + !unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* No grace period end, so just accelerate recent callbacks. */ ret = rcu_accelerate_cbs(rsp, rnp, rdp); @@ -1592,7 +1596,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); } - if (rdp->gpnum != rnp->gpnum) { + if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* * If the current grace period is waiting for this CPU, * set up to detect a quiescent state, otherwise don't @@ -1603,6 +1607,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, rdp->passed_quiesce = 0; rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); + ACCESS_ONCE(rdp->gpwrap) = false; } return ret; } @@ -1616,7 +1621,8 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_save(flags); rnp = rdp->mynode; if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) && - rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */ + rdp->completed == ACCESS_ONCE(rnp->completed) && + !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */ !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ local_irq_restore(flags); return; @@ -2066,7 +2072,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || - rnp->completed == rnp->gpnum) { + rnp->completed == rnp->gpnum || rdp->gpwrap) { /* * The grace period in which this quiescent state was @@ -3190,7 +3196,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) } /* Has a new RCU grace period started? */ - if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */ + if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum || + unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */ rdp->n_rp_gp_started++; return 1; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 5ec81cf938fd..7472ff388d55 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -260,6 +260,7 @@ struct rcu_data { bool passed_quiesce; /* User-mode/idle loop etc. */ bool qs_pending; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ + bool gpwrap; /* Possible gpnum/completed wrap. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ #ifdef CONFIG_RCU_CPU_STALL_INFO diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 769384d77437..81ff8b9a5a39 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1605,7 +1605,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) * completed since we last checked and there are * callbacks not yet ready to invoke. */ - if (rdp->completed != rnp->completed && + if ((rdp->completed != rnp->completed || + unlikely(ACCESS_ONCE(rdp->gpwrap))) && rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) note_gp_changes(rsp, rdp); -- cgit v1.2.3 From 9733e4f0a973a354034f5dd603b4142a3095c85f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 12:49:13 -0800 Subject: rcu: Make _batches_completed() functions return unsigned long Long ago, the various ->completed fields were of type long, but now are unsigned long due to signed-integer-overflow concerns. However, the various _batches_completed() functions remained of type long, even though their only purpose in life is to return the corresponding ->completed field. This patch cleans this up by changing these functions' return types to unsigned long. Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 4 ++-- include/linux/rcutree.h | 6 +++--- kernel/rcu/tree.c | 4 ++-- kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 6 +++--- 5 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel/rcu/tree.h') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 0e5366200154..91f7e4c37800 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -94,7 +94,7 @@ static inline void rcu_virt_note_context_switch(int cpu) /* * Return the number of grace periods. */ -static inline long rcu_batches_completed(void) +static inline unsigned long rcu_batches_completed(void) { return 0; } @@ -102,7 +102,7 @@ static inline long rcu_batches_completed(void) /* * Return the number of bottom-half grace periods. */ -static inline long rcu_batches_completed_bh(void) +static inline unsigned long rcu_batches_completed_bh(void) { return 0; } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 52953790dcca..9885bfb6b123 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -81,9 +81,9 @@ void cond_synchronize_rcu(unsigned long oldstate); extern unsigned long rcutorture_testseq; extern unsigned long rcutorture_vernum; -long rcu_batches_completed(void); -long rcu_batches_completed_bh(void); -long rcu_batches_completed_sched(void); +unsigned long rcu_batches_completed(void); +unsigned long rcu_batches_completed_bh(void); +unsigned long rcu_batches_completed_sched(void); void show_rcu_gp_kthreads(void); void rcu_force_quiescent_state(void); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4c106fcc0d54..e26d78712e16 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -317,7 +317,7 @@ static int rcu_pending(void); /* * Return the number of RCU-sched batches processed thus far for debug & stats. */ -long rcu_batches_completed_sched(void) +unsigned long rcu_batches_completed_sched(void) { return rcu_sched_state.completed; } @@ -326,7 +326,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); /* * Return the number of RCU BH batches processed thus far for debug & stats. */ -long rcu_batches_completed_bh(void) +unsigned long rcu_batches_completed_bh(void) { return rcu_bh_state.completed; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8e7b1843896e..1a07d7379ac6 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -546,7 +546,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); -long rcu_batches_completed(void); +unsigned long rcu_batches_completed(void); static void rcu_preempt_note_context_switch(void); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3ec85cb5d544..f69300d4a51f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -117,7 +117,7 @@ static void __init rcu_bootup_announce(void) * Return the number of RCU-preempt batches processed thus far * for debug and statistics. */ -static long rcu_batches_completed_preempt(void) +static unsigned long rcu_batches_completed_preempt(void) { return rcu_preempt_state.completed; } @@ -126,7 +126,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt); /* * Return the number of RCU batches processed thus far for debug & stats. */ -long rcu_batches_completed(void) +unsigned long rcu_batches_completed(void) { return rcu_batches_completed_preempt(); } @@ -935,7 +935,7 @@ static void __init rcu_bootup_announce(void) /* * Return the number of RCU batches processed thus far for debug & stats. */ -long rcu_batches_completed(void) +unsigned long rcu_batches_completed(void) { return rcu_batches_completed_sched(); } -- cgit v1.2.3 From f9103c390257d06c162d9e3c2a90d2bdedadfe17 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 16:48:20 -0800 Subject: rcu: Remove redundant rcu_batches_completed() declaration Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/rcu/tree.h') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 1a07d7379ac6..69eb422445f9 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -546,7 +546,6 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); -unsigned long rcu_batches_completed(void); static void rcu_preempt_note_context_switch(void); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU -- cgit v1.2.3 From 5cd37193ce8539be1e6ef76be226f4bcc984e0f5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 13 Dec 2014 20:32:04 -0800 Subject: rcu: Make cond_resched_rcu_qs() apply to normal RCU flavors Although cond_resched_rcu_qs() only applies to TASKS_RCU, it is used in places where it would be useful for it to apply to the normal RCU flavors, rcu_preempt, rcu_sched, and rcu_bh. This is especially the case for workloads that aggressively overload the system, particularly those that generate large numbers of RCU updates on systems running NO_HZ_FULL CPUs. This commit therefore communicates quiescent states from cond_resched_rcu_qs() to the normal RCU flavors. Note that it is unfortunately necessary to leave the old ->passed_quiesce mechanism in place to allow quiescent states that apply to only one flavor to be recorded. (Yes, we could decrement ->rcu_qs_ctr_snap in that case, but that is not so good for debugging of RCU internals.) In addition, if one of the RCU flavor's grace period has stalled, this will invoke rcu_momentary_dyntick_idle(), resulting in a heavy-weight quiescent state visible from other CPUs. Reported-by: Sasha Levin Reported-by: Dave Jones Signed-off-by: Paul E. McKenney [ paulmck: Merge commit from Sasha Levin fixing a bug where __this_cpu() was used in preemptible code. ] --- Documentation/RCU/trace.txt | 32 ++++++++++++++++---------------- include/linux/rcupdate.h | 3 ++- include/linux/rcutiny.h | 5 ++++- include/linux/rcutree.h | 2 ++ kernel/rcu/tree.c | 38 +++++++++++++++++++++++++++++++++----- kernel/rcu/tree.h | 2 ++ kernel/rcu/tree_trace.c | 8 ++++++-- 7 files changed, 65 insertions(+), 25 deletions(-) (limited to 'kernel/rcu/tree.h') diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index b63b9bb3bc0c..08651da15448 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -56,14 +56,14 @@ rcuboost: The output of "cat rcu/rcu_preempt/rcudata" looks as follows: - 0!c=30455 g=30456 pq=1 qp=1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... b=10 ci=74572 nci=0 co=1131 ca=716 - 1!c=30719 g=30720 pq=1 qp=0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... b=10 ci=123209 nci=0 co=685 ca=982 - 2!c=30150 g=30151 pq=1 qp=1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458 - 3 c=31249 g=31250 pq=1 qp=0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622 - 4!c=29502 g=29503 pq=1 qp=1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521 - 5 c=31201 g=31202 pq=1 qp=1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698 - 6!c=30253 g=30254 pq=1 qp=1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353 - 7 c=31178 g=31178 pq=1 qp=0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... b=10 ci=109819 nci=0 co=1115 ca=969 + 0!c=30455 g=30456 pq=1/0 qp=1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... b=10 ci=74572 nci=0 co=1131 ca=716 + 1!c=30719 g=30720 pq=1/0 qp=0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... b=10 ci=123209 nci=0 co=685 ca=982 + 2!c=30150 g=30151 pq=1/1 qp=1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458 + 3 c=31249 g=31250 pq=1/1 qp=0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622 + 4!c=29502 g=29503 pq=1/0 qp=1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521 + 5 c=31201 g=31202 pq=1/0 qp=1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698 + 6!c=30253 g=30254 pq=1/0 qp=1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353 + 7 c=31178 g=31178 pq=1/0 qp=0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... b=10 ci=109819 nci=0 co=1115 ca=969 This file has one line per CPU, or eight for this 8-CPU system. The fields are as follows: @@ -188,14 +188,14 @@ o "ca" is the number of RCU callbacks that have been adopted by this Kernels compiled with CONFIG_RCU_BOOST=y display the following from /debug/rcu/rcu_preempt/rcudata: - 0!c=12865 g=12866 pq=1 qp=1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871 - 1 c=14407 g=14408 pq=1 qp=0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485 - 2 c=14407 g=14408 pq=1 qp=0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490 - 3 c=14407 g=14408 pq=1 qp=0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290 - 4 c=14405 g=14406 pq=1 qp=1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114 - 5!c=14168 g=14169 pq=1 qp=0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722 - 6 c=14404 g=14405 pq=1 qp=0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811 - 7 c=14407 g=14408 pq=1 qp=1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042 + 0!c=12865 g=12866 pq=1/0 qp=1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871 + 1 c=14407 g=14408 pq=1/0 qp=0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485 + 2 c=14407 g=14408 pq=1/0 qp=0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490 + 3 c=14407 g=14408 pq=1/0 qp=0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290 + 4 c=14405 g=14406 pq=1/0 qp=1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114 + 5!c=14168 g=14169 pq=1/0 qp=0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722 + 6 c=14404 g=14405 pq=1/0 qp=0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811 + 7 c=14407 g=14408 pq=1/0 qp=1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042 This is similar to the output discussed above, but contains the following additional fields: diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index ed4f5939a452..468228750299 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -331,12 +331,13 @@ static inline void rcu_init_nohz(void) extern struct srcu_struct tasks_rcu_exit_srcu; #define rcu_note_voluntary_context_switch(t) \ do { \ + rcu_all_qs(); \ if (ACCESS_ONCE((t)->rcu_tasks_holdout)) \ ACCESS_ONCE((t)->rcu_tasks_holdout) = false; \ } while (0) #else /* #ifdef CONFIG_TASKS_RCU */ #define TASKS_RCU(x) do { } while (0) -#define rcu_note_voluntary_context_switch(t) do { } while (0) +#define rcu_note_voluntary_context_switch(t) rcu_all_qs() #endif /* #else #ifdef CONFIG_TASKS_RCU */ /** diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 0e5366200154..fabd3fad8516 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -154,7 +154,10 @@ static inline bool rcu_is_watching(void) return true; } - #endif /* #else defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ +static inline void rcu_all_qs(void) +{ +} + #endif /* __LINUX_RCUTINY_H */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 52953790dcca..ddba927f7316 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -97,4 +97,6 @@ extern int rcu_scheduler_active __read_mostly; bool rcu_is_watching(void); +void rcu_all_qs(void); + #endif /* __LINUX_RCUTREE_H */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 75ce12316b4c..cb00e038c2f2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -219,6 +219,9 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ }; +DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); +EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); + /* * Let the RCU core know that this CPU has gone through the scheduler, * which is a quiescent state. This is called when the need for a @@ -288,6 +291,22 @@ void rcu_note_context_switch(void) } EXPORT_SYMBOL_GPL(rcu_note_context_switch); +/* + * Register a quiesecent state for all RCU flavors. If there is an + * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight + * dyntick-idle quiescent state visible to other CPUs (but only for those + * RCU flavors in desparate need of a quiescent state, which will normally + * be none of them). Either way, do a lightweight quiescent state for + * all RCU flavors. + */ +void rcu_all_qs(void) +{ + if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) + rcu_momentary_dyntick_idle(); + this_cpu_inc(rcu_qs_ctr); +} +EXPORT_SYMBOL_GPL(rcu_all_qs); + static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ static long qhimark = 10000; /* If this many pending, ignore blimit. */ static long qlowmark = 100; /* Once only this many pending, use blimit. */ @@ -1609,6 +1628,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, rdp->gpnum = rnp->gpnum; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); rdp->passed_quiesce = 0; + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); ACCESS_ONCE(rdp->gpwrap) = false; @@ -2075,8 +2095,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || - rnp->completed == rnp->gpnum || rdp->gpwrap) { + if ((rdp->passed_quiesce == 0 && + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || + rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || + rdp->gpwrap) { /* * The grace period in which this quiescent state was @@ -2085,6 +2107,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * within the current grace period. */ rdp->passed_quiesce = 0; /* need qs for new gp. */ + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } @@ -2129,7 +2152,8 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Was there a quiescent state since the beginning of the grace * period? If no, then exit and wait for the next call. */ - if (!rdp->passed_quiesce) + if (!rdp->passed_quiesce && + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) return; /* @@ -3174,9 +3198,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && - rdp->qs_pending && !rdp->passed_quiesce) { + rdp->qs_pending && !rdp->passed_quiesce && + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { rdp->n_rp_qs_pending++; - } else if (rdp->qs_pending && rdp->passed_quiesce) { + } else if (rdp->qs_pending && + (rdp->passed_quiesce || + rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { rdp->n_rp_report_qs++; return 1; } @@ -3510,6 +3537,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->gpnum = rnp->completed; rdp->completed = rnp->completed; rdp->passed_quiesce = 0; + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); rdp->qs_pending = 0; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 7472ff388d55..1e7f8b05714e 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -257,6 +257,8 @@ struct rcu_data { /* in order to detect GP end. */ unsigned long gpnum; /* Highest gp number that this CPU */ /* is aware of having started. */ + unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ + /* for rcu_all_qs() invocations. */ bool passed_quiesce; /* User-mode/idle loop etc. */ bool qs_pending; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 5cdc62e1beeb..fbb6240509ea 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -46,6 +46,8 @@ #define RCU_TREE_NONCORE #include "tree.h" +DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); + static int r_open(struct inode *inode, struct file *file, const struct seq_operations *op) { @@ -115,11 +117,13 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) if (!rdp->beenonline) return; - seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d", + seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', ulong2long(rdp->completed), ulong2long(rdp->gpnum), - rdp->passed_quiesce, rdp->qs_pending); + rdp->passed_quiesce, + rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), + rdp->qs_pending); seq_printf(m, " dt=%d/%llx/%d df=%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, -- cgit v1.2.3