From 0260b92e1c39412b1e345e202355c43169c16274 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Thu, 8 Apr 2021 13:01:14 -0700
Subject: rcutorture: Forgive RCU boost failures when CPUs don't pass through QS

Currently, rcu_torture_boost() runs CPU-bound at real-time priority to force
RCU priority inversions. It then checks that grace periods progress during
this CPU-bound time. If grace periods fail to progress, it reports an RCU
priority boosting failure.

However, it is possible (and sometimes does happen) that the grace period
fails to progress due to a CPU failing to pass through a quiescent state
for an extended time period (3.5 seconds by default). This can happen due
to vCPU preemption, long-running interrupts, and much else besides. There
is nothing that RCU priority boosting can do about these situations, and so
they should not be counted as RCU priority boosting failures.

This commit therefore checks for CPUs (as opposed to preempted tasks)
holding up a grace period, and flags the resulting RCU priority boosting
failures, but neither splats nor counts them as errors. It also rate-limits
them to avoid flooding the console log.

Signed-off-by: Paul E. McKenney
---
 kernel/rcu/tree_stall.h | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

(limited to 'kernel/rcu/tree_stall.h')

diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 59b95cc5cbdf..af92d9fee0d4 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -716,6 +716,42 @@ static void check_cpu_stall(struct rcu_data *rdp)
 // RCU forward-progress mechanisms, including of callback invocation.
 
+/*
+ * Check to see if a failure to end RCU priority inversion was due to
+ * a CPU not passing through a quiescent state. When this happens, there
+ * is nothing that RCU priority boosting can do to help, so we shouldn't
+ * count this as an RCU priority boosting failure. A return of true says
+ * RCU priority boosting is to blame, and false says otherwise. If false
+ * is returned, the first of the CPUs to blame is stored through cpup.
+ */
+bool rcu_check_boost_fail(unsigned long gp_state, int *cpup)
+{
+	int cpu;
+	unsigned long flags;
+	struct rcu_node *rnp;
+
+	rcu_for_each_leaf_node(rnp) {
+		raw_spin_lock_irqsave_rcu_node(rnp, flags);
+		if (!rnp->qsmask) {
+			// No CPUs without quiescent states for this rnp.
+			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+			continue;
+		}
+		// Find the first holdout CPU.
+		for_each_leaf_node_possible_cpu(rnp, cpu) {
+			if (rnp->qsmask & (1UL << (cpu - rnp->grplo))) {
+				raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+				*cpup = cpu;
+				return false;
+			}
+		}
+		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+	}
+	// Can't blame CPUs, so must blame RCU priority boosting.
+	return true;
+}
+EXPORT_SYMBOL_GPL(rcu_check_boost_fail);
+
 /*
  * Show the state of the grace-period kthreads.
  */
--
cgit v1.2.3
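A quick illustration of how a diagnostic caller might consume the helper added above: rcu_check_boost_fail() and its true/false semantics come from the patch, but the wrapper function and the message wording below are hypothetical, a sketch rather than code from this series.

static bool example_blame_boosting(unsigned long gp_state)
{
	int cpu;

	if (rcu_check_boost_fail(gp_state, &cpu))
		return true;	/* No holdout CPU, so boosting itself is to blame. */
	pr_info("Grace period held up by CPU %d, not by priority boosting\n", cpu);
	return false;
}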
From 063f5a4df99145ba0a5d4879d171a8175235f37b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Wed, 14 Apr 2021 13:00:10 -0700
Subject: rcutorture: Don't count CPU-stalled time against priority boosting

It will frequently be the case that rcu_torture_boost() will get a
->start_gp_poll() cookie that needs almost all of the current grace period
plus an additional grace period to elapse before ->poll_gp_state() will
return true. It is quite possible that the current grace period will have
(say) two seconds of stall by a CPU failing to pass through a quiescent
state, followed by 300 milliseconds of delay due to a preempted reader.
The next grace period might suffer only one second of stall by a CPU,
followed by another 300 milliseconds of delay due to a preempted reader.
This is an example of RCU priority boosting doing its job, but the full
elapsed time of 3.6 seconds exceeds the 3.5-second limit. In addition,
there is no CPU stall in force at the 3.5-second mark, so this would
nevertheless currently be counted as an RCU priority boosting failure.

This commit therefore avoids this sort of false positive by resetting the
gp_state_time timestamp any time that the current grace period is being
blocked by a CPU. This results in extremely frequent calls to the
->check_boost_failed() function, so this commit provides a lockless
fastpath that is selected by supplying a NULL CPU-number pointer.

Signed-off-by: Paul E. McKenney
---
 kernel/rcu/rcutorture.c | 13 ++++++++-----
 kernel/rcu/tree_stall.h | 10 ++++++++++
 2 files changed, 18 insertions(+), 5 deletions(-)

(limited to 'kernel/rcu/tree_stall.h')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 5ae4dcc6ba27..8b347b9659aa 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -918,17 +918,18 @@ static void rcu_torture_enable_rt_throttle(void)
 	old_rt_runtime = -1;
 }
 
-static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long start, unsigned long end)
+static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long *start)
 {
 	int cpu;
 	static int dbg_done;
+	unsigned long end = jiffies;
 	bool gp_done;
 	unsigned long j;
 	static unsigned long last_persist;
 	unsigned long lp;
 	unsigned long mininterval = test_boost_duration * HZ - HZ / 2;
 
-	if (end - start > mininterval) {
+	if (end - *start > mininterval) {
 		// Recheck after checking time to avoid false positives.
 		smp_mb(); // Time check before grace-period check.
 		if (cur_ops->poll_gp_state(gp_state))
@@ -945,7 +946,7 @@ static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long start
 			n_rcu_torture_boost_failure++;
 			if (!xchg(&dbg_done, 1) && cur_ops->gp_kthread_dbg) {
 				pr_info("Boost inversion thread ->rt_priority %u gp_state %lu jiffies %lu\n",
-					current->rt_priority, gp_state, end - start);
+					current->rt_priority, gp_state, end - *start);
 				cur_ops->gp_kthread_dbg();
 				// Recheck after print to flag grace period ending during splat.
 				gp_done = cur_ops->poll_gp_state(gp_state);
@@ -955,6 +956,8 @@ static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long start
 		}
 
 		return true; // failed
+	} else if (cur_ops->check_boost_failed && !cur_ops->check_boost_failed(gp_state, NULL)) {
+		*start = jiffies;
 	}
 
 	return false; // passed
@@ -995,7 +998,7 @@ static int rcu_torture_boost(void *arg)
 	while (time_before(jiffies, endtime)) {
 		// Has current GP gone too long?
 		if (gp_initiated && !failed && !cur_ops->poll_gp_state(gp_state))
-			failed = rcu_torture_boost_failed(gp_state, gp_state_time, jiffies);
+			failed = rcu_torture_boost_failed(gp_state, &gp_state_time);
 		// If we don't have a grace period in flight, start one.
 		if (!gp_initiated || cur_ops->poll_gp_state(gp_state)) {
 			gp_state = cur_ops->start_gp_poll();
@@ -1016,7 +1019,7 @@ static int rcu_torture_boost(void *arg)
 
 	// In case the grace period extended beyond the end of the loop.
 	if (gp_initiated && !failed && !cur_ops->poll_gp_state(gp_state))
-		rcu_torture_boost_failed(gp_state, gp_state_time, jiffies);
+		rcu_torture_boost_failed(gp_state, &gp_state_time);
 
 	/*
 	 * Set the start time of the next test interval.
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index af92d9fee0d4..8bde1b53b0c9 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -723,6 +723,10 @@ static void check_cpu_stall(struct rcu_data *rdp)
  * count this as an RCU priority boosting failure. A return of true says
  * RCU priority boosting is to blame, and false says otherwise. If false
  * is returned, the first of the CPUs to blame is stored through cpup.
+ *
+ * If cpup is NULL, then a lockless quick check is carried out, suitable
+ * for high-rate usage. On the other hand, if cpup is non-NULL, each
+ * rcu_node structure's ->lock is acquired, ruling out high-rate usage.
  */
 bool rcu_check_boost_fail(unsigned long gp_state, int *cpup)
 {
@@ -731,6 +735,12 @@ bool rcu_check_boost_fail(unsigned long gp_state, int *cpup)
 	struct rcu_node *rnp;
 
 	rcu_for_each_leaf_node(rnp) {
+		if (!cpup) {
+			if (READ_ONCE(rnp->qsmask))
+				return false;
+			else
+				continue;
+		}
 		raw_spin_lock_irqsave_rcu_node(rnp, flags);
 		if (!rnp->qsmask) {
 			// No CPUs without quiescent states for this rnp.
--
cgit v1.2.3
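To make the arithmetic in the commit message concrete: with the 3.5-second failure threshold mentioned earlier in this series (mininterval = test_boost_duration * HZ - HZ / 2, which implies a default test_boost_duration of 4 seconds), the scenario works out as sketched below. The helper name is hypothetical and the numbers are the illustrative ones from the commit message, not measured values.

static bool example_false_positive(void)
{
	unsigned long mininterval = 4 * HZ - HZ / 2;	/* 3.5-second failure threshold. */
	unsigned long cpu_stall = 2 * HZ + 1 * HZ;	/* 2.0 s + 1.0 s of CPU-side stall. */
	unsigned long reader_delay = 2 * (3 * HZ / 10);	/* Two 300 ms preempted-reader delays. */

	/* 3.6 s total exceeds 3.5 s even though boosting did its job both times. */
	return cpu_stall + reader_delay > mininterval;
}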
From e44111ed20d8b2d7b05b20d694358ae77d4e93e2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Fri, 2 Apr 2021 21:51:50 -0700
Subject: rcu: Add ->rt_priority and ->gp_start to show_rcu_gp_kthreads() output

This commit adds ->rt_priority and ->gp_start to show_rcu_gp_kthreads()
output in order to better diagnose RCU priority boosting failures.

Signed-off-by: Paul E. McKenney
---
 kernel/rcu/tree_stall.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel/rcu/tree_stall.h')

diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 59b95cc5cbdf..fb4702570316 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -726,6 +726,7 @@ void show_rcu_gp_kthreads(void)
 	unsigned long j;
 	unsigned long ja;
 	unsigned long jr;
+	unsigned long js;
 	unsigned long jw;
 	struct rcu_data *rdp;
 	struct rcu_node *rnp;
@@ -734,11 +735,12 @@ void show_rcu_gp_kthreads(void)
 	j = jiffies;
 	ja = j - data_race(rcu_state.gp_activity);
 	jr = j - data_race(rcu_state.gp_req_activity);
+	js = j - data_race(rcu_state.gp_start);
 	jw = j - data_race(rcu_state.gp_wake_time);
-	pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n",
+	pr_info("%s: wait state: %s(%d) ->state: %#lx ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n",
 		rcu_state.name, gp_state_getname(rcu_state.gp_state),
-		rcu_state.gp_state, t ? t->state : 0x1ffffL,
-		ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq),
+		rcu_state.gp_state, t ? t->state : 0x1ffffL, t ? t->rt_priority : 0xffU,
+		js, ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq),
 		(long)data_race(rcu_state.gp_seq),
 		(long)data_race(rcu_get_root()->gp_seq_needed),
 		data_race(rcu_state.gp_flags));
--
cgit v1.2.3

From 27ba76e164fc83ffe6ceeb0415c427ad1191af6c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Sun, 4 Apr 2021 17:23:36 -0700
Subject: rcu: Add ->gp_max to show_rcu_gp_kthreads() output

This commit adds ->gp_max to show_rcu_gp_kthreads() output in order to
better diagnose RCU priority boosting failures.

Signed-off-by: Paul E. McKenney
---
 kernel/rcu/tree_stall.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel/rcu/tree_stall.h')

diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index fb4702570316..a4e2bb3bdce7 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -737,12 +737,13 @@ void show_rcu_gp_kthreads(void)
 	jr = j - data_race(rcu_state.gp_req_activity);
 	js = j - data_race(rcu_state.gp_start);
 	jw = j - data_race(rcu_state.gp_wake_time);
-	pr_info("%s: wait state: %s(%d) ->state: %#lx ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n",
+	pr_info("%s: wait state: %s(%d) ->state: %#lx ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_max %lu ->gp_flags %#x\n",
 		rcu_state.name, gp_state_getname(rcu_state.gp_state),
 		rcu_state.gp_state, t ? t->state : 0x1ffffL, t ? t->rt_priority : 0xffU,
 		js, ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq),
 		(long)data_race(rcu_state.gp_seq),
 		(long)data_race(rcu_get_root()->gp_seq_needed),
+		data_race(rcu_state.gp_max),
 		data_race(rcu_state.gp_flags));
 	rcu_for_each_node_breadth_first(rnp) {
 		if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq),
--
cgit v1.2.3
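The fields added by the two patches above are all read through data_race(). As a sketch of what that wrapper buys (the helper below is illustrative and not part of either patch): these diagnostics read shared grace-period state without holding any lock, and data_race() marks the access as an intentional data race so KCSAN stays quiet, while still returning whatever possibly stale value is there.

static unsigned long example_gp_age_in_jiffies(void)
{
	/* Lockless diagnostic read; staleness is acceptable here. */
	return jiffies - data_race(rcu_state.gp_start);
}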
From 396eba65f62414ee8850ed5f7b5ce844719ebebf Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Tue, 6 Apr 2021 16:31:42 -0700
Subject: rcu: Add quiescent states and boost states to show_rcu_gp_kthreads() output

This commit adds each rcu_node structure's ->qsmask and "bBEG" output
indicating whether: (1) There is a boost kthread, (2) A reader needs to be
(or is in the process of being) boosted, (3) A reader is blocking an
expedited grace period, and (4) A reader is blocking a normal grace period.
This helps diagnose RCU priority boosting failures.

Signed-off-by: Paul E. McKenney
---
 kernel/rcu/tree.h        |  1 +
 kernel/rcu/tree_plugin.h |  1 +
 kernel/rcu/tree_stall.h  | 12 +++++++++---
 3 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'kernel/rcu/tree_stall.h')

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 71821d59d95c..5fd0c443517e 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -115,6 +115,7 @@ struct rcu_node {
 					/* boosting for this rcu_node structure. */
 	unsigned int boost_kthread_status;
 					/* State of boost_kthread_task for tracing. */
+	unsigned long n_boosts;	/* Number of boosts for this rcu_node structure. */
 #ifdef CONFIG_RCU_NOCB_CPU
 	struct swait_queue_head nocb_gp_wq[2];
 					/* Place for rcu_nocb_kthread() to wait GP. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 2cbe8f8456e6..ef004cc7101d 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1098,6 +1098,7 @@ static int rcu_boost(struct rcu_node *rnp)
 	/* Lock only for side effect: boosts task t's priority. */
 	rt_mutex_lock(&rnp->boost_mtx);
 	rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
+	rnp->n_boosts++;
 
 	return READ_ONCE(rnp->exp_tasks) != NULL ||
 	       READ_ONCE(rnp->boost_tasks) != NULL;
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index a4e2bb3bdce7..c1f83864a18e 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -749,9 +749,15 @@ void show_rcu_gp_kthreads(void)
 		if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq),
 				 READ_ONCE(rnp->gp_seq_needed)))
 			continue;
-		pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n",
-			rnp->grplo, rnp->grphi, (long)data_race(rnp->gp_seq),
-			(long)data_race(rnp->gp_seq_needed));
+		pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld ->qsmask %#lx %c%c%c%c ->n_boosts %ld\n",
+			rnp->grplo, rnp->grphi,
+			(long)data_race(rnp->gp_seq), (long)data_race(rnp->gp_seq_needed),
+			data_race(rnp->qsmask),
+			".b"[!!data_race(rnp->boost_kthread_task)],
+			".B"[!!data_race(rnp->boost_tasks)],
+			".E"[!!data_race(rnp->exp_tasks)],
+			".G"[!!data_race(rnp->gp_tasks)],
+			data_race(rnp->n_boosts));
 		if (!rcu_is_leaf_node(rnp))
 			continue;
 		for_each_leaf_node_possible_cpu(rnp, cpu) {
--
cgit v1.2.3
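The %c%c%c%c flags in the new pr_info() line use a compact C idiom: indexing a two-character string literal with a boolean. A minimal illustration of just that idiom follows; the helper name is made up for the example, and only the ".b"[...] expression is taken from the patch.

/* ".b"[0] is '.', ".b"[1] is 'b': the flag prints as 'b' only when the
 * boost kthread exists, and as '.' otherwise. The 'B', 'E', and 'G'
 * flags in the patch work the same way for their respective pointers. */
static char example_boost_kthread_flag(struct task_struct *boost_kthread_task)
{
	return ".b"[!!boost_kthread_task];
}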
From b15805013b441b13fcf6e402c03421c03edb79c6 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Wed, 7 Apr 2021 15:14:01 -0700
Subject: rcu: Make show_rcu_gp_kthreads() dump rcu_node structures blocking GP

Currently, show_rcu_gp_kthreads() only dumps rcu_node structures that have
outdated ideas of the current grace-period number. This commit also dumps
those that are in any way blocking the current grace period. This helps
diagnose RCU priority boosting failures.

Signed-off-by: Paul E. McKenney
---
 kernel/rcu/tree_stall.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel/rcu/tree_stall.h')

diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index c1f83864a18e..e6bd518e0bc4 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -746,8 +746,9 @@ void show_rcu_gp_kthreads(void)
 		data_race(rcu_state.gp_max),
 		data_race(rcu_state.gp_flags));
 	rcu_for_each_node_breadth_first(rnp) {
-		if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq),
-				 READ_ONCE(rnp->gp_seq_needed)))
+		if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), READ_ONCE(rnp->gp_seq_needed)) &&
+		    !data_race(rnp->qsmask) && !data_race(rnp->boost_tasks) &&
+		    !data_race(rnp->exp_tasks) && !data_race(rnp->gp_tasks))
 			continue;
 		pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld ->qsmask %#lx %c%c%c%c ->n_boosts %ld\n",
 			rnp->grplo, rnp->grphi,
--
cgit v1.2.3
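The skip condition above leans on ULONG_CMP_GE() to compare grace-period sequence numbers. As a sketch of what that comparison does (the expression paraphrases the macro in kernel/rcu/rcu.h and should be treated as an assumption rather than a quotation):

static inline bool example_seq_at_or_after(unsigned long a, unsigned long b)
{
	/* Wrap-safe "a >= b": differences up to ULONG_MAX / 2 count as "not behind". */
	return ULONG_MAX / 2 >= a - b;
}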
From 5390473ec1697b71af0e9d63ef7aaa7ecd27e2c9 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Thu, 15 Apr 2021 16:30:34 -0700
Subject: rcu: Don't penalize priority boosting when there is nothing to boost

RCU priority boosting cannot do anything unless there is at least one task
blocking the current RCU grace period that was preempted within the RCU
read-side critical section that it still resides in. However, the current
rcu_torture_boost_failed() code will count this situation as an RCU
priority-boosting failure if there were no CPUs blocking the current grace
period. This situation can happen (for example) if the last CPU blocking
the current grace period was subjected to vCPU preemption, which is always
a risk for rcutorture guest OSes.

This commit therefore causes rcu_torture_boost_failed() to refrain from
reporting failure unless there is at least one task blocking the current
RCU grace period that was preempted within the RCU read-side critical
section that it still resides in.

Signed-off-by: Paul E. McKenney
---
 kernel/rcu/tree_stall.h | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

(limited to 'kernel/rcu/tree_stall.h')

diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 8bde1b53b0c9..65302518e006 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -723,6 +723,10 @@ static void check_cpu_stall(struct rcu_data *rdp)
  * count this as an RCU priority boosting failure. A return of true says
  * RCU priority boosting is to blame, and false says otherwise. If false
  * is returned, the first of the CPUs to blame is stored through cpup.
+ * If there was no CPU blocking the current grace period, but also nothing
+ * in need of being boosted, *cpup is set to -1. This can happen in case
+ * of vCPU preemption while the last CPU is reporting its quiescent state,
+ * for example.
  *
  * If cpup is NULL, then a lockless quick check is carried out, suitable
  * for high-rate usage. On the other hand, if cpup is non-NULL, each
@@ -730,18 +734,25 @@ static void check_cpu_stall(struct rcu_data *rdp)
  */
 bool rcu_check_boost_fail(unsigned long gp_state, int *cpup)
 {
+	bool atb = false;
 	int cpu;
 	unsigned long flags;
 	struct rcu_node *rnp;
 
 	rcu_for_each_leaf_node(rnp) {
 		if (!cpup) {
-			if (READ_ONCE(rnp->qsmask))
+			if (READ_ONCE(rnp->qsmask)) {
 				return false;
-			else
+			} else {
+				if (READ_ONCE(rnp->gp_tasks))
+					atb = true;
 				continue;
+			}
 		}
+		*cpup = -1;
 		raw_spin_lock_irqsave_rcu_node(rnp, flags);
+		if (rnp->gp_tasks)
+			atb = true;
 		if (!rnp->qsmask) {
 			// No CPUs without quiescent states for this rnp.
 			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -758,7 +769,7 @@ bool rcu_check_boost_fail(unsigned long gp_state, int *cpup)
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 	}
 	// Can't blame CPUs, so must blame RCU priority boosting.
-	return true;
+	return atb;
 }
 EXPORT_SYMBOL_GPL(rcu_check_boost_fail);
 
--
cgit v1.2.3

From c70360c3343f975bd066b6b98159d93f1bd4219f Mon Sep 17 00:00:00 2001
From: Jules Irenge
Date: Thu, 29 Apr 2021 00:12:19 +0100
Subject: rcu: Add missing __releases() annotation

Sparse reports a warning at rcu_print_task_stall():

  "warning: context imbalance in rcu_print_task_stall - unexpected unlock"

The root cause is a missing annotation on rcu_print_task_stall(). This
commit therefore adds the missing __releases(rnp->lock) annotation.

Signed-off-by: Jules Irenge
Signed-off-by: Paul E. McKenney
---
 kernel/rcu/tree_stall.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/rcu/tree_stall.h')

diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index e6bd518e0bc4..ffb8cf6c6437 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -314,6 +314,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
  * tasks blocked within RCU read-side critical sections.
  */
 static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
+	__releases(rnp->lock)
 {
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 	return 0;
--
cgit v1.2.3
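For readers unfamiliar with the annotation: __releases() (and its counterpart __acquires()) only affect sparse's lock-context tracking and compile away otherwise. A minimal sketch of the general pattern follows; the helper below is illustrative and not taken from the kernel sources.

/* A helper that drops a lock its caller acquired: annotating it with
 * __releases() tells sparse that the apparent imbalance is intentional. */
static void example_unlock_helper(spinlock_t *lock, unsigned long flags)
	__releases(lock)
{
	spin_unlock_irqrestore(lock, flags);
}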