diff options
Diffstat (limited to 'kernel/rcu')
-rw-r--r-- | kernel/rcu/rcu.h | 1 | ||||
-rw-r--r-- | kernel/rcu/rcu_segcblist.c | 18 | ||||
-rw-r--r-- | kernel/rcu/rcu_segcblist.h | 2 | ||||
-rw-r--r-- | kernel/rcu/rcutorture.c | 8 | ||||
-rw-r--r-- | kernel/rcu/tree.c | 331 | ||||
-rw-r--r-- | kernel/rcu/tree.h | 30 | ||||
-rw-r--r-- | kernel/rcu/tree_exp.h | 222 | ||||
-rw-r--r-- | kernel/rcu/tree_plugin.h | 26 |
8 files changed, 289 insertions, 349 deletions
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 5b5bb9ee2e20..40cea6735c2d 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -491,6 +491,7 @@ void rcu_force_quiescent_state(void); void rcu_bh_force_quiescent_state(void); void rcu_sched_force_quiescent_state(void); extern struct workqueue_struct *rcu_gp_wq; +extern struct workqueue_struct *rcu_par_gp_wq; #endif /* #else #ifdef CONFIG_TINY_RCU */ #ifdef CONFIG_RCU_NOCB_CPU diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 88cba7c2956c..5aff271adf1e 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -404,24 +404,6 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) } /* - * Scan the specified rcu_segcblist structure for callbacks that need - * a grace period later than the one specified by "seq". We don't look - * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't - * have a grace-period sequence number. - */ -bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, - unsigned long seq) -{ - int i; - - for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) - if (rsclp->tails[i - 1] != rsclp->tails[i] && - ULONG_CMP_LT(seq, rsclp->gp_seq[i])) - return true; - return false; -} - -/* * Merge the source rcu_segcblist structure into the destination * rcu_segcblist structure, then initialize the source. Any pending * callbacks from the source get to start over. It is best to diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 581c12b63544..948470cef385 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -134,7 +134,5 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp); void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq); bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq); -bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, - unsigned long seq); void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp, struct rcu_segcblist *src_rsclp); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f0e1d44459f8..e628fcfd1bde 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1614,6 +1614,9 @@ static enum cpuhp_state rcutor_hp; static void rcu_torture_cleanup(void) { + int flags = 0; + unsigned long gpnum = 0; + unsigned long completed = 0; int i; rcutorture_record_test_transition(); @@ -1644,6 +1647,11 @@ rcu_torture_cleanup(void) fakewriter_tasks = NULL; } + rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); + srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, + &flags, &gpnum, &completed); + pr_alert("%s: End-test grace-period state: g%lu c%lu f%#x\n", + cur_ops->name, gpnum, completed, flags); torture_stop_kthread(rcu_torture_stats, stats_task); torture_stop_kthread(rcu_torture_fqs, fqs_task); for (i = 0; i < ncbflooders; i++) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b22d2e1ca5c0..4fccdfa25716 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -524,8 +524,6 @@ module_param(rcu_kick_kthreads, bool, 0644); static ulong jiffies_till_sched_qs = HZ / 10; module_param(jiffies_till_sched_qs, ulong, 0444); -static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp); static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)); static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(void); @@ -711,44 +709,6 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) } /* - * Is there any need for future grace periods? - * Interrupts must be disabled. If the caller does not hold the root - * rnp_node structure's ->lock, the results are advisory only. - */ -static int rcu_future_needs_gp(struct rcu_state *rsp) -{ - struct rcu_node *rnp = rcu_get_root(rsp); - int idx = (READ_ONCE(rnp->completed) + 1) & 0x1; - int *fp = &rnp->need_future_gp[idx]; - - lockdep_assert_irqs_disabled(); - return READ_ONCE(*fp); -} - -/* - * Does the current CPU require a not-yet-started grace period? - * The caller must have disabled interrupts to prevent races with - * normal callback registry. - */ -static bool -cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) -{ - lockdep_assert_irqs_disabled(); - if (rcu_gp_in_progress(rsp)) - return false; /* No, a grace period is already in progress. */ - if (rcu_future_needs_gp(rsp)) - return true; /* Yes, a no-CBs CPU needs one. */ - if (!rcu_segcblist_is_enabled(&rdp->cblist)) - return false; /* No, this is a no-CBs (or offline) CPU. */ - if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) - return true; /* Yes, CPU has newly registered callbacks. */ - if (rcu_segcblist_future_gp_needed(&rdp->cblist, - READ_ONCE(rsp->completed))) - return true; /* Yes, CBs for future grace period. */ - return false; /* No grace period needed. */ -} - -/* * Enter an RCU extended quiescent state, which can be either the * idle loop or adaptive-tickless usermode execution. * @@ -1642,18 +1602,30 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp, return rnp->completed + 1; /* + * If the current rcu_node structure believes that RCU is + * idle, and if the rcu_state structure does not yet reflect + * the start of a new grace period, then the next grace period + * will suffice. The memory barrier is needed to accurately + * sample the rsp->gpnum, and pairs with the second lock + * acquisition in rcu_gp_init(), which is augmented with + * smp_mb__after_unlock_lock() for this purpose. + */ + if (rnp->gpnum == rnp->completed) { + smp_mb(); /* See above block comment. */ + if (READ_ONCE(rsp->gpnum) == rnp->completed) + return rnp->completed + 1; + } + + /* * Otherwise, wait for a possible partial grace period and * then the subsequent full grace period. */ return rnp->completed + 2; } -/* - * Trace-event helper function for rcu_start_future_gp() and - * rcu_nocb_wait_gp(). - */ -static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long c, const char *s) +/* Trace-event wrapper function for trace_rcu_future_grace_period. */ +static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, + unsigned long c, const char *s) { trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, rnp->completed, c, rnp->level, @@ -1661,96 +1633,67 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, } /* - * Start some future grace period, as needed to handle newly arrived + * Start the specified grace period, as needed to handle newly arrived * callbacks. The required future grace periods are recorded in each - * rcu_node structure's ->need_future_gp field. Returns true if there + * rcu_node structure's ->need_future_gp[] field. Returns true if there * is reason to awaken the grace-period kthread. * - * The caller must hold the specified rcu_node structure's ->lock. + * The caller must hold the specified rcu_node structure's ->lock, which + * is why the caller is responsible for waking the grace-period kthread. */ -static bool __maybe_unused -rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long *c_out) +static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, + unsigned long c) { - unsigned long c; bool ret = false; - struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); - - raw_lockdep_assert_held_rcu_node(rnp); - - /* - * Pick up grace-period number for new callbacks. If this - * grace period is already marked as needed, return to the caller. - */ - c = rcu_cbs_completed(rdp->rsp, rnp); - trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); - if (rnp->need_future_gp[c & 0x1]) { - trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); - goto out; - } + struct rcu_state *rsp = rdp->rsp; + struct rcu_node *rnp_root; /* - * If either this rcu_node structure or the root rcu_node structure - * believe that a grace period is in progress, then we must wait - * for the one following, which is in "c". Because our request - * will be noticed at the end of the current grace period, we don't - * need to explicitly start one. We only do the lockless check - * of rnp_root's fields if the current rcu_node structure thinks - * there is no grace period in flight, and because we hold rnp->lock, - * the only possible change is when rnp_root's two fields are - * equal, in which case rnp_root->gpnum might be concurrently - * incremented. But that is OK, as it will just result in our - * doing some extra useless work. + * Use funnel locking to either acquire the root rcu_node + * structure's lock or bail out if the need for this grace period + * has already been recorded -- or has already started. If there + * is already a grace period in progress in a non-leaf node, no + * recording is needed because the end of the grace period will + * scan the leaf rcu_node structures. Note that rnp->lock must + * not be released. */ - if (rnp->gpnum != rnp->completed || - READ_ONCE(rnp_root->gpnum) != READ_ONCE(rnp_root->completed)) { - rnp->need_future_gp[c & 0x1]++; - trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); - goto out; + raw_lockdep_assert_held_rcu_node(rnp); + trace_rcu_this_gp(rnp, rdp, c, TPS("Startleaf")); + for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) { + if (rnp_root != rnp) + raw_spin_lock_rcu_node(rnp_root); + WARN_ON_ONCE(ULONG_CMP_LT(rnp_root->gpnum + + need_future_gp_mask(), c)); + if (need_future_gp_element(rnp_root, c) || + ULONG_CMP_GE(rnp_root->gpnum, c) || + (rnp != rnp_root && + rnp_root->gpnum != rnp_root->completed)) { + trace_rcu_this_gp(rnp_root, rdp, c, TPS("Prestarted")); + goto unlock_out; + } + need_future_gp_element(rnp_root, c) = true; + if (rnp_root != rnp && rnp_root->parent != NULL) + raw_spin_unlock_rcu_node(rnp_root); + if (!rnp_root->parent) + break; /* At root, and perhaps also leaf. */ } - /* - * There might be no grace period in progress. If we don't already - * hold it, acquire the root rcu_node structure's lock in order to - * start one (if needed). - */ - if (rnp != rnp_root) - raw_spin_lock_rcu_node(rnp_root); - - /* - * Get a new grace-period number. If there really is no grace - * period in progress, it will be smaller than the one we obtained - * earlier. Adjust callbacks as needed. - */ - c = rcu_cbs_completed(rdp->rsp, rnp_root); - if (!rcu_is_nocb_cpu(rdp->cpu)) - (void)rcu_segcblist_accelerate(&rdp->cblist, c); - - /* - * If the needed for the required grace period is already - * recorded, trace and leave. - */ - if (rnp_root->need_future_gp[c & 0x1]) { - trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot")); + /* If GP already in progress, just leave, otherwise start one. */ + if (rnp_root->gpnum != rnp_root->completed) { + trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedleafroot")); goto unlock_out; } - - /* Record the need for the future grace period. */ - rnp_root->need_future_gp[c & 0x1]++; - - /* If a grace period is not already in progress, start one. */ - if (rnp_root->gpnum != rnp_root->completed) { - trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); - } else { - trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); - ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); + trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedroot")); + WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT); + if (!rsp->gp_kthread) { + trace_rcu_this_gp(rnp_root, rdp, c, TPS("NoGPkthread")); + goto unlock_out; } + trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq")); + ret = true; /* Caller must wake GP kthread. */ unlock_out: if (rnp != rnp_root) raw_spin_unlock_rcu_node(rnp_root); -out: - if (c_out != NULL) - *c_out = c; return ret; } @@ -1758,16 +1701,16 @@ out: * Clean up any old requests for the just-ended grace period. Also return * whether any additional grace periods have been requested. */ -static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { - int c = rnp->completed; - int needmore; + unsigned long c = rnp->completed; + bool needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - rnp->need_future_gp[c & 0x1] = 0; - needmore = rnp->need_future_gp[(c + 1) & 0x1]; - trace_rcu_future_gp(rnp, rdp, c, - needmore ? TPS("CleanupMore") : TPS("Cleanup")); + need_future_gp_element(rnp, c) = false; + needmore = need_any_future_gp(rnp); + trace_rcu_this_gp(rnp, rdp, c, + needmore ? TPS("CleanupMore") : TPS("Cleanup")); return needmore; } @@ -1802,6 +1745,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { + unsigned long c; bool ret = false; raw_lockdep_assert_held_rcu_node(rnp); @@ -1820,8 +1764,9 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, * accelerating callback invocation to an earlier grace-period * number. */ - if (rcu_segcblist_accelerate(&rdp->cblist, rcu_cbs_completed(rsp, rnp))) - ret = rcu_start_future_gp(rnp, rdp, NULL); + c = rcu_cbs_completed(rsp, rnp); + if (rcu_segcblist_accelerate(&rdp->cblist, c)) + ret = rcu_start_this_gp(rnp, rdp, c); /* Trace depending on how much we were able to accelerate. */ if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) @@ -2108,7 +2053,6 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) { unsigned long gp_duration; bool needgp = false; - int nocb = 0; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); struct swait_queue_head *sq; @@ -2147,7 +2091,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) if (rnp == rdp->mynode) needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; /* smp_mb() provided by prior unlock-lock pair. */ - nocb += rcu_future_gp_cleanup(rsp, rnp); + needgp = rcu_future_gp_cleanup(rsp, rnp) || needgp; sq = rcu_nocb_gp_get(rnp); raw_spin_unlock_irq_rcu_node(rnp); rcu_nocb_gp_cleanup(sq); @@ -2157,21 +2101,25 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) } rnp = rcu_get_root(rsp); raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */ - rcu_nocb_gp_set(rnp, nocb); /* Declare grace period done. */ WRITE_ONCE(rsp->completed, rsp->gpnum); trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); rsp->gp_state = RCU_GP_IDLE; + /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(rsp->rda); + if (need_any_future_gp(rnp)) { + trace_rcu_this_gp(rnp, rdp, rsp->completed - 1, + TPS("CleanupMore")); + needgp = true; + } /* Advance CBs to reduce false positives below. */ - needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp; - if (needgp || cpu_needs_another_gp(rsp, rdp)) { + if (!rcu_accelerate_cbs(rsp, rnp, rdp) && needgp) { WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); - trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq")); } + WRITE_ONCE(rsp->gp_flags, rsp->gp_flags & RCU_GP_FLAG_INIT); raw_spin_unlock_irq_rcu_node(rnp); } @@ -2283,71 +2231,6 @@ static int __noreturn rcu_gp_kthread(void *arg) } /* - * Start a new RCU grace period if warranted, re-initializing the hierarchy - * in preparation for detecting the next grace period. The caller must hold - * the root node's ->lock and hard irqs must be disabled. - * - * Note that it is legal for a dying CPU (which is marked as offline) to - * invoke this function. This can happen when the dying CPU reports its - * quiescent state. - * - * Returns true if the grace-period kthread must be awakened. - */ -static bool -rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp) -{ - raw_lockdep_assert_held_rcu_node(rnp); - if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { - /* - * Either we have not yet spawned the grace-period - * task, this CPU does not need another grace period, - * or a grace period is already in progress. - * Either way, don't start a new grace period. - */ - return false; - } - WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); - trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), - TPS("newreq")); - - /* - * We can't do wakeups while holding the rnp->lock, as that - * could cause possible deadlocks with the rq->lock. Defer - * the wakeup to our caller. - */ - return true; -} - -/* - * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's - * callbacks. Note that rcu_start_gp_advanced() cannot do this because it - * is invoked indirectly from rcu_advance_cbs(), which would result in - * endless recursion -- or would do so if it wasn't for the self-deadlock - * that is encountered beforehand. - * - * Returns true if the grace-period kthread needs to be awakened. - */ -static bool rcu_start_gp(struct rcu_state *rsp) -{ - struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - struct rcu_node *rnp = rcu_get_root(rsp); - bool ret = false; - - /* - * If there is no grace period in progress right now, any - * callbacks we have up to this point will be satisfied by the - * next grace period. Also, advancing the callbacks reduces the - * probability of false positives from cpu_needs_another_gp() - * resulting in pointless grace periods. So, advance callbacks - * then start the grace period! - */ - ret = rcu_advance_cbs(rsp, rnp, rdp) || ret; - ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret; - return ret; -} - -/* * Report a full set of quiescent states to the specified rcu_state data * structure. Invoke rcu_gp_kthread_wake() to awaken the grace-period * kthread if another grace period is required. Whether we wake @@ -2874,22 +2757,27 @@ __rcu_process_callbacks(struct rcu_state *rsp) unsigned long flags; bool needwake; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); + struct rcu_node *rnp; WARN_ON_ONCE(!rdp->beenonline); /* Update RCU state based on any recent quiescent states. */ rcu_check_quiescent_state(rsp, rdp); - /* Does this CPU require a not-yet-started grace period? */ - local_irq_save(flags); - if (cpu_needs_another_gp(rsp, rdp)) { - raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */ - needwake = rcu_start_gp(rsp); - raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); - if (needwake) - rcu_gp_kthread_wake(rsp); - } else { - local_irq_restore(flags); + /* No grace period and unregistered callbacks? */ + if (!rcu_gp_in_progress(rsp) && + rcu_segcblist_is_enabled(&rdp->cblist)) { + local_irq_save(flags); + if (rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) { + local_irq_restore(flags); + } else { + rnp = rdp->mynode; + raw_spin_lock_rcu_node(rnp); /* irqs disabled. */ + needwake = rcu_accelerate_cbs(rsp, rnp, rdp); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + if (needwake) + rcu_gp_kthread_wake(rsp); + } } /* If there are callbacks ready, invoke them. */ @@ -2973,11 +2861,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, /* Start a new grace period if one not already started. */ if (!rcu_gp_in_progress(rsp)) { - struct rcu_node *rnp_root = rcu_get_root(rsp); + struct rcu_node *rnp = rdp->mynode; - raw_spin_lock_rcu_node(rnp_root); - needwake = rcu_start_gp(rsp); - raw_spin_unlock_rcu_node(rnp_root); + raw_spin_lock_rcu_node(rnp); + needwake = rcu_accelerate_cbs(rsp, rnp, rdp); + raw_spin_unlock_rcu_node(rnp); if (needwake) rcu_gp_kthread_wake(rsp); } else { @@ -3368,7 +3256,9 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) return 1; /* Has RCU gone idle with this CPU needing another grace period? */ - if (cpu_needs_another_gp(rsp, rdp)) + if (!rcu_gp_in_progress(rsp) && + rcu_segcblist_is_enabled(&rdp->cblist) && + !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) return 1; /* Has another RCU grace period completed? */ @@ -3861,6 +3751,7 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) struct rcu_data *my_rdp; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); + bool needwake; if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist)) return; /* No callbacks to migrate. */ @@ -3872,12 +3763,15 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) return; } raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ - rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */ - rcu_advance_cbs(rsp, rnp_root, my_rdp); /* Assign GP to pending CBs. */ + /* Leverage recent GPs and set GP for new callbacks. */ + needwake = rcu_advance_cbs(rsp, rnp_root, rdp) || + rcu_advance_cbs(rsp, rnp_root, my_rdp); rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist)); raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags); + if (needwake) + rcu_gp_kthread_wake(rsp); WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || !rcu_segcblist_empty(&rdp->cblist), "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", @@ -4168,6 +4062,7 @@ static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp) } struct workqueue_struct *rcu_gp_wq; +struct workqueue_struct *rcu_par_gp_wq; void __init rcu_init(void) { @@ -4199,6 +4094,8 @@ void __init rcu_init(void) /* Create workqueue for expedited GPs and for Tree SRCU. */ rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_gp_wq); + rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); + WARN_ON(!rcu_par_gp_wq); } #include "tree_exp.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 0b3a90ebe225..78e051dffc5b 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -58,6 +58,14 @@ struct rcu_dynticks { #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ }; +/* Communicate arguments to a workqueue handler. */ +struct rcu_exp_work { + smp_call_func_t rew_func; + struct rcu_state *rew_rsp; + unsigned long rew_s; + struct work_struct rew_work; +}; + /* RCU's kthread states for tracing. */ #define RCU_KTHREAD_STOPPED 0 #define RCU_KTHREAD_RUNNING 1 @@ -150,15 +158,32 @@ struct rcu_node { struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - int need_future_gp[2]; - /* Counts of upcoming no-CB GP requests. */ + u8 need_future_gp[4]; /* Counts of upcoming GP requests. */ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; spinlock_t exp_lock ____cacheline_internodealigned_in_smp; unsigned long exp_seq_rq; wait_queue_head_t exp_wq[4]; + struct rcu_exp_work rew; + bool exp_need_flush; /* Need to flush workitem? */ } ____cacheline_internodealigned_in_smp; +/* Accessors for ->need_future_gp[] array. */ +#define need_future_gp_mask() \ + (ARRAY_SIZE(((struct rcu_node *)NULL)->need_future_gp) - 1) +#define need_future_gp_element(rnp, c) \ + ((rnp)->need_future_gp[(c) & need_future_gp_mask()]) +#define need_any_future_gp(rnp) \ +({ \ + int __i; \ + bool __nonzero = false; \ + \ + for (__i = 0; __i < ARRAY_SIZE((rnp)->need_future_gp); __i++) \ + __nonzero = __nonzero || \ + READ_ONCE((rnp)->need_future_gp[__i]); \ + __nonzero; \ +}) + /* * Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and * are indexed relative to this interval rather than the global CPU ID space. @@ -448,7 +473,6 @@ static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); -static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index f512dd4e57a8..d40708e8c5d6 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -20,6 +20,8 @@ * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> */ +#include <linux/lockdep.h> + /* * Record the start of an expedited grace period. */ @@ -154,15 +156,35 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) * for the current expedited grace period. Works only for preemptible * RCU -- other RCU implementation use other means. * - * Caller must hold the rcu_state's exp_mutex. + * Caller must hold the specificed rcu_node structure's ->lock */ static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp) { + raw_lockdep_assert_held_rcu_node(rnp); + return rnp->exp_tasks == NULL && READ_ONCE(rnp->expmask) == 0; } /* + * Like sync_rcu_preempt_exp_done(), but this function assumes the caller + * doesn't hold the rcu_node's ->lock, and will acquire and release the lock + * itself + */ +static bool sync_rcu_preempt_exp_done_unlocked(struct rcu_node *rnp) +{ + unsigned long flags; + bool ret; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + ret = sync_rcu_preempt_exp_done(rnp); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + return ret; +} + + +/* * Report the exit from RCU read-side critical section for the last task * that queued itself during or before the current expedited preemptible-RCU * grace period. This event is reported either to the rcu_node structure on @@ -170,8 +192,7 @@ static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp) * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) * - * Caller must hold the rcu_state's exp_mutex and the specified rcu_node - * structure's ->lock. + * Caller must hold the specified rcu_node structure's ->lock. */ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake, unsigned long flags) @@ -207,8 +228,6 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, /* * Report expedited quiescent state for specified node. This is a * lock-acquisition wrapper function for __rcu_report_exp_rnp(). - * - * Caller must hold the rcu_state's exp_mutex. */ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake) @@ -221,8 +240,7 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, /* * Report expedited quiescent state for multiple CPUs, all covered by the - * specified leaf rcu_node structure. Caller must hold the rcu_state's - * exp_mutex. + * specified leaf rcu_node structure. */ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, unsigned long mask, bool wake) @@ -359,93 +377,129 @@ static void sync_sched_exp_online_cleanup(int cpu) } /* - * Select the nodes that the upcoming expedited grace period needs - * to wait for. + * Select the CPUs within the specified rcu_node that the upcoming + * expedited grace period needs to wait for. */ -static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, - smp_call_func_t func) +static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) { int cpu; unsigned long flags; + smp_call_func_t func; unsigned long mask_ofl_test; unsigned long mask_ofl_ipi; int ret; - struct rcu_node *rnp; - - trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset")); - sync_exp_reset_tree(rsp); - trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select")); - rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); + struct rcu_exp_work *rewp = + container_of(wp, struct rcu_exp_work, rew_work); + struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew); + struct rcu_state *rsp = rewp->rew_rsp; - /* Each pass checks a CPU for identity, offline, and idle. */ - mask_ofl_test = 0; - for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { - unsigned long mask = leaf_node_cpu_bit(rnp, cpu); - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu); - int snap; + func = rewp->rew_func; + raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (raw_smp_processor_id() == cpu || - !(rnp->qsmaskinitnext & mask)) { + /* Each pass checks a CPU for identity, offline, and idle. */ + mask_ofl_test = 0; + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { + unsigned long mask = leaf_node_cpu_bit(rnp, cpu); + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu); + int snap; + + if (raw_smp_processor_id() == cpu || + !(rnp->qsmaskinitnext & mask)) { + mask_ofl_test |= mask; + } else { + snap = rcu_dynticks_snap(rdtp); + if (rcu_dynticks_in_eqs(snap)) mask_ofl_test |= mask; - } else { - snap = rcu_dynticks_snap(rdtp); - if (rcu_dynticks_in_eqs(snap)) - mask_ofl_test |= mask; - else - rdp->exp_dynticks_snap = snap; - } + else + rdp->exp_dynticks_snap = snap; } - mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; - - /* - * Need to wait for any blocked tasks as well. Note that - * additional blocking tasks will also block the expedited - * GP until such time as the ->expmask bits are cleared. - */ - if (rcu_preempt_has_tasks(rnp)) - rnp->exp_tasks = rnp->blkd_tasks.next; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; - /* IPI the remaining CPUs for expedited quiescent state. */ - for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { - unsigned long mask = leaf_node_cpu_bit(rnp, cpu); - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + /* + * Need to wait for any blocked tasks as well. Note that + * additional blocking tasks will also block the expedited GP + * until such time as the ->expmask bits are cleared. + */ + if (rcu_preempt_has_tasks(rnp)) + rnp->exp_tasks = rnp->blkd_tasks.next; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (!(mask_ofl_ipi & mask)) - continue; + /* IPI the remaining CPUs for expedited quiescent state. */ + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { + unsigned long mask = leaf_node_cpu_bit(rnp, cpu); + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + + if (!(mask_ofl_ipi & mask)) + continue; retry_ipi: - if (rcu_dynticks_in_eqs_since(rdp->dynticks, - rdp->exp_dynticks_snap)) { - mask_ofl_test |= mask; - continue; - } - ret = smp_call_function_single(cpu, func, rsp, 0); - if (!ret) { - mask_ofl_ipi &= ~mask; - continue; - } - /* Failed, raced with CPU hotplug operation. */ - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if ((rnp->qsmaskinitnext & mask) && - (rnp->expmask & mask)) { - /* Online, so delay for a bit and try again. */ - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl")); - schedule_timeout_uninterruptible(1); - goto retry_ipi; - } - /* CPU really is offline, so we can ignore it. */ - if (!(rnp->expmask & mask)) - mask_ofl_ipi &= ~mask; + if (rcu_dynticks_in_eqs_since(rdp->dynticks, + rdp->exp_dynticks_snap)) { + mask_ofl_test |= mask; + continue; + } + ret = smp_call_function_single(cpu, func, rsp, 0); + if (!ret) { + mask_ofl_ipi &= ~mask; + continue; + } + /* Failed, raced with CPU hotplug operation. */ + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if ((rnp->qsmaskinitnext & mask) && + (rnp->expmask & mask)) { + /* Online, so delay for a bit and try again. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl")); + schedule_timeout_uninterruptible(1); + goto retry_ipi; + } + /* CPU really is offline, so we can ignore it. */ + if (!(rnp->expmask & mask)) + mask_ofl_ipi &= ~mask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + /* Report quiescent states for those that went offline. */ + mask_ofl_test |= mask_ofl_ipi; + if (mask_ofl_test) + rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); +} + +/* + * Select the nodes that the upcoming expedited grace period needs + * to wait for. + */ +static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, + smp_call_func_t func) +{ + struct rcu_node *rnp; + + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset")); + sync_exp_reset_tree(rsp); + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select")); + + /* Schedule work for each leaf rcu_node structure. */ + rcu_for_each_leaf_node(rsp, rnp) { + rnp->exp_need_flush = false; + if (!READ_ONCE(rnp->expmask)) + continue; /* Avoid early boot non-existent wq. */ + rnp->rew.rew_func = func; + rnp->rew.rew_rsp = rsp; + if (!READ_ONCE(rcu_par_gp_wq) || + rcu_scheduler_active != RCU_SCHEDULER_RUNNING) { + /* No workqueues yet. */ + sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work); + continue; } - /* Report quiescent states for those that went offline. */ - mask_ofl_test |= mask_ofl_ipi; - if (mask_ofl_test) - rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); + INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); + queue_work_on(rnp->grplo, rcu_par_gp_wq, &rnp->rew.rew_work); + rnp->exp_need_flush = true; } + + /* Wait for workqueue jobs (if any) to complete. */ + rcu_for_each_leaf_node(rsp, rnp) + if (rnp->exp_need_flush) + flush_work(&rnp->rew.rew_work); } static void synchronize_sched_expedited_wait(struct rcu_state *rsp) @@ -466,9 +520,9 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) for (;;) { ret = swait_event_timeout( rsp->expedited_wq, - sync_rcu_preempt_exp_done(rnp_root), + sync_rcu_preempt_exp_done_unlocked(rnp_root), jiffies_stall); - if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) + if (ret > 0 || sync_rcu_preempt_exp_done_unlocked(rnp_root)) return; WARN_ON(ret < 0); /* workqueues should not be signaled. */ if (rcu_cpu_stall_suppress) @@ -501,7 +555,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) rcu_for_each_node_breadth_first(rsp, rnp) { if (rnp == rnp_root) continue; /* printed unconditionally */ - if (sync_rcu_preempt_exp_done(rnp)) + if (sync_rcu_preempt_exp_done_unlocked(rnp)) continue; pr_cont(" l=%u:%d-%d:%#lx/%c", rnp->level, rnp->grplo, rnp->grphi, @@ -557,14 +611,6 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) mutex_unlock(&rsp->exp_wake_mutex); } -/* Let the workqueue handler know what it is supposed to do. */ -struct rcu_exp_work { - smp_call_func_t rew_func; - struct rcu_state *rew_rsp; - unsigned long rew_s; - struct work_struct rew_work; -}; - /* * Common code to drive an expedited grace period forward, used by * workqueues and mid-boot-time tasks. diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index b999032e9466..7fd12039e512 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1815,19 +1815,6 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) swake_up_all(sq); } -/* - * Set the root rcu_node structure's ->need_future_gp field - * based on the sum of those of all rcu_node structures. This does - * double-count the root rcu_node structure's requests, but this - * is necessary to handle the possibility of a rcu_nocb_kthread() - * having awakened during the time that the rcu_node structures - * were being updated for the end of the previous grace period. - */ -static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) -{ - rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; -} - static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) { return &rnp->nocb_gp_wq[rnp->completed & 0x1]; @@ -2083,7 +2070,8 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) struct rcu_node *rnp = rdp->mynode; raw_spin_lock_irqsave_rcu_node(rnp, flags); - needwake = rcu_start_future_gp(rnp, rdp, &c); + c = rcu_cbs_completed(rdp->rsp, rnp); + needwake = rcu_start_this_gp(rnp, rdp, c); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) rcu_gp_kthread_wake(rdp->rsp); @@ -2092,7 +2080,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) * Wait for the grace period. Do so interruptibly to avoid messing * up the load average. */ - trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); + trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait")); for (;;) { swait_event_interruptible( rnp->nocb_gp_wq[c & 0x1], @@ -2100,9 +2088,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) if (likely(d)) break; WARN_ON(signal_pending(current)); - trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait")); + trace_rcu_this_gp(rnp, rdp, c, TPS("ResumeWait")); } - trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait")); + trace_rcu_this_gp(rnp, rdp, c, TPS("EndWait")); smp_mb(); /* Ensure that CB invocation happens after GP end. */ } @@ -2530,10 +2518,6 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) { } -static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) -{ -} - static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) { return NULL; |