summaryrefslogtreecommitdiffstats
path: root/kernel/rcu
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/rcu')
-rw-r--r--kernel/rcu/rcu.h1
-rw-r--r--kernel/rcu/rcu_segcblist.c18
-rw-r--r--kernel/rcu/rcu_segcblist.h2
-rw-r--r--kernel/rcu/rcutorture.c8
-rw-r--r--kernel/rcu/tree.c331
-rw-r--r--kernel/rcu/tree.h30
-rw-r--r--kernel/rcu/tree_exp.h222
-rw-r--r--kernel/rcu/tree_plugin.h26
8 files changed, 289 insertions, 349 deletions
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 5b5bb9ee2e20..40cea6735c2d 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -491,6 +491,7 @@ void rcu_force_quiescent_state(void);
void rcu_bh_force_quiescent_state(void);
void rcu_sched_force_quiescent_state(void);
extern struct workqueue_struct *rcu_gp_wq;
+extern struct workqueue_struct *rcu_par_gp_wq;
#endif /* #else #ifdef CONFIG_TINY_RCU */
#ifdef CONFIG_RCU_NOCB_CPU
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 88cba7c2956c..5aff271adf1e 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -404,24 +404,6 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
}
/*
- * Scan the specified rcu_segcblist structure for callbacks that need
- * a grace period later than the one specified by "seq". We don't look
- * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't
- * have a grace-period sequence number.
- */
-bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
- unsigned long seq)
-{
- int i;
-
- for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
- if (rsclp->tails[i - 1] != rsclp->tails[i] &&
- ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
- return true;
- return false;
-}
-
-/*
* Merge the source rcu_segcblist structure into the destination
* rcu_segcblist structure, then initialize the source. Any pending
* callbacks from the source get to start over. It is best to
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 581c12b63544..948470cef385 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -134,7 +134,5 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
struct rcu_cblist *rclp);
void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
-bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
- unsigned long seq);
void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
struct rcu_segcblist *src_rsclp);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index f0e1d44459f8..e628fcfd1bde 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1614,6 +1614,9 @@ static enum cpuhp_state rcutor_hp;
static void
rcu_torture_cleanup(void)
{
+ int flags = 0;
+ unsigned long gpnum = 0;
+ unsigned long completed = 0;
int i;
rcutorture_record_test_transition();
@@ -1644,6 +1647,11 @@ rcu_torture_cleanup(void)
fakewriter_tasks = NULL;
}
+ rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed);
+ srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp,
+ &flags, &gpnum, &completed);
+ pr_alert("%s: End-test grace-period state: g%lu c%lu f%#x\n",
+ cur_ops->name, gpnum, completed, flags);
torture_stop_kthread(rcu_torture_stats, stats_task);
torture_stop_kthread(rcu_torture_fqs, fqs_task);
for (i = 0; i < ncbflooders; i++)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b22d2e1ca5c0..4fccdfa25716 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -524,8 +524,6 @@ module_param(rcu_kick_kthreads, bool, 0644);
static ulong jiffies_till_sched_qs = HZ / 10;
module_param(jiffies_till_sched_qs, ulong, 0444);
-static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
- struct rcu_data *rdp);
static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp));
static void force_quiescent_state(struct rcu_state *rsp);
static int rcu_pending(void);
@@ -711,44 +709,6 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
}
/*
- * Is there any need for future grace periods?
- * Interrupts must be disabled. If the caller does not hold the root
- * rnp_node structure's ->lock, the results are advisory only.
- */
-static int rcu_future_needs_gp(struct rcu_state *rsp)
-{
- struct rcu_node *rnp = rcu_get_root(rsp);
- int idx = (READ_ONCE(rnp->completed) + 1) & 0x1;
- int *fp = &rnp->need_future_gp[idx];
-
- lockdep_assert_irqs_disabled();
- return READ_ONCE(*fp);
-}
-
-/*
- * Does the current CPU require a not-yet-started grace period?
- * The caller must have disabled interrupts to prevent races with
- * normal callback registry.
- */
-static bool
-cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
-{
- lockdep_assert_irqs_disabled();
- if (rcu_gp_in_progress(rsp))
- return false; /* No, a grace period is already in progress. */
- if (rcu_future_needs_gp(rsp))
- return true; /* Yes, a no-CBs CPU needs one. */
- if (!rcu_segcblist_is_enabled(&rdp->cblist))
- return false; /* No, this is a no-CBs (or offline) CPU. */
- if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
- return true; /* Yes, CPU has newly registered callbacks. */
- if (rcu_segcblist_future_gp_needed(&rdp->cblist,
- READ_ONCE(rsp->completed)))
- return true; /* Yes, CBs for future grace period. */
- return false; /* No grace period needed. */
-}
-
-/*
* Enter an RCU extended quiescent state, which can be either the
* idle loop or adaptive-tickless usermode execution.
*
@@ -1642,18 +1602,30 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
return rnp->completed + 1;
/*
+ * If the current rcu_node structure believes that RCU is
+ * idle, and if the rcu_state structure does not yet reflect
+ * the start of a new grace period, then the next grace period
+ * will suffice. The memory barrier is needed to accurately
+ * sample the rsp->gpnum, and pairs with the second lock
+ * acquisition in rcu_gp_init(), which is augmented with
+ * smp_mb__after_unlock_lock() for this purpose.
+ */
+ if (rnp->gpnum == rnp->completed) {
+ smp_mb(); /* See above block comment. */
+ if (READ_ONCE(rsp->gpnum) == rnp->completed)
+ return rnp->completed + 1;
+ }
+
+ /*
* Otherwise, wait for a possible partial grace period and
* then the subsequent full grace period.
*/
return rnp->completed + 2;
}
-/*
- * Trace-event helper function for rcu_start_future_gp() and
- * rcu_nocb_wait_gp().
- */
-static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
- unsigned long c, const char *s)
+/* Trace-event wrapper function for trace_rcu_future_grace_period. */
+static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
+ unsigned long c, const char *s)
{
trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum,
rnp->completed, c, rnp->level,
@@ -1661,96 +1633,67 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
}
/*
- * Start some future grace period, as needed to handle newly arrived
+ * Start the specified grace period, as needed to handle newly arrived
* callbacks. The required future grace periods are recorded in each
- * rcu_node structure's ->need_future_gp field. Returns true if there
+ * rcu_node structure's ->need_future_gp[] field. Returns true if there
* is reason to awaken the grace-period kthread.
*
- * The caller must hold the specified rcu_node structure's ->lock.
+ * The caller must hold the specified rcu_node structure's ->lock, which
+ * is why the caller is responsible for waking the grace-period kthread.
*/
-static bool __maybe_unused
-rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
- unsigned long *c_out)
+static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
+ unsigned long c)
{
- unsigned long c;
bool ret = false;
- struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
-
- raw_lockdep_assert_held_rcu_node(rnp);
-
- /*
- * Pick up grace-period number for new callbacks. If this
- * grace period is already marked as needed, return to the caller.
- */
- c = rcu_cbs_completed(rdp->rsp, rnp);
- trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf"));
- if (rnp->need_future_gp[c & 0x1]) {
- trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf"));
- goto out;
- }
+ struct rcu_state *rsp = rdp->rsp;
+ struct rcu_node *rnp_root;
/*
- * If either this rcu_node structure or the root rcu_node structure
- * believe that a grace period is in progress, then we must wait
- * for the one following, which is in "c". Because our request
- * will be noticed at the end of the current grace period, we don't
- * need to explicitly start one. We only do the lockless check
- * of rnp_root's fields if the current rcu_node structure thinks
- * there is no grace period in flight, and because we hold rnp->lock,
- * the only possible change is when rnp_root's two fields are
- * equal, in which case rnp_root->gpnum might be concurrently
- * incremented. But that is OK, as it will just result in our
- * doing some extra useless work.
+ * Use funnel locking to either acquire the root rcu_node
+ * structure's lock or bail out if the need for this grace period
+ * has already been recorded -- or has already started. If there
+ * is already a grace period in progress in a non-leaf node, no
+ * recording is needed because the end of the grace period will
+ * scan the leaf rcu_node structures. Note that rnp->lock must
+ * not be released.
*/
- if (rnp->gpnum != rnp->completed ||
- READ_ONCE(rnp_root->gpnum) != READ_ONCE(rnp_root->completed)) {
- rnp->need_future_gp[c & 0x1]++;
- trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
- goto out;
+ raw_lockdep_assert_held_rcu_node(rnp);
+ trace_rcu_this_gp(rnp, rdp, c, TPS("Startleaf"));
+ for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) {
+ if (rnp_root != rnp)
+ raw_spin_lock_rcu_node(rnp_root);
+ WARN_ON_ONCE(ULONG_CMP_LT(rnp_root->gpnum +
+ need_future_gp_mask(), c));
+ if (need_future_gp_element(rnp_root, c) ||
+ ULONG_CMP_GE(rnp_root->gpnum, c) ||
+ (rnp != rnp_root &&
+ rnp_root->gpnum != rnp_root->completed)) {
+ trace_rcu_this_gp(rnp_root, rdp, c, TPS("Prestarted"));
+ goto unlock_out;
+ }
+ need_future_gp_element(rnp_root, c) = true;
+ if (rnp_root != rnp && rnp_root->parent != NULL)
+ raw_spin_unlock_rcu_node(rnp_root);
+ if (!rnp_root->parent)
+ break; /* At root, and perhaps also leaf. */
}
- /*
- * There might be no grace period in progress. If we don't already
- * hold it, acquire the root rcu_node structure's lock in order to
- * start one (if needed).
- */
- if (rnp != rnp_root)
- raw_spin_lock_rcu_node(rnp_root);
-
- /*
- * Get a new grace-period number. If there really is no grace
- * period in progress, it will be smaller than the one we obtained
- * earlier. Adjust callbacks as needed.
- */
- c = rcu_cbs_completed(rdp->rsp, rnp_root);
- if (!rcu_is_nocb_cpu(rdp->cpu))
- (void)rcu_segcblist_accelerate(&rdp->cblist, c);
-
- /*
- * If the needed for the required grace period is already
- * recorded, trace and leave.
- */
- if (rnp_root->need_future_gp[c & 0x1]) {
- trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot"));
+ /* If GP already in progress, just leave, otherwise start one. */
+ if (rnp_root->gpnum != rnp_root->completed) {
+ trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedleafroot"));
goto unlock_out;
}
-
- /* Record the need for the future grace period. */
- rnp_root->need_future_gp[c & 0x1]++;
-
- /* If a grace period is not already in progress, start one. */
- if (rnp_root->gpnum != rnp_root->completed) {
- trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot"));
- } else {
- trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot"));
- ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
+ trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedroot"));
+ WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT);
+ if (!rsp->gp_kthread) {
+ trace_rcu_this_gp(rnp_root, rdp, c, TPS("NoGPkthread"));
+ goto unlock_out;
}
+ trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq"));
+ ret = true; /* Caller must wake GP kthread. */
unlock_out:
if (rnp != rnp_root)
raw_spin_unlock_rcu_node(rnp_root);
-out:
- if (c_out != NULL)
- *c_out = c;
return ret;
}
@@ -1758,16 +1701,16 @@ out:
* Clean up any old requests for the just-ended grace period. Also return
* whether any additional grace periods have been requested.
*/
-static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
{
- int c = rnp->completed;
- int needmore;
+ unsigned long c = rnp->completed;
+ bool needmore;
struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
- rnp->need_future_gp[c & 0x1] = 0;
- needmore = rnp->need_future_gp[(c + 1) & 0x1];
- trace_rcu_future_gp(rnp, rdp, c,
- needmore ? TPS("CleanupMore") : TPS("Cleanup"));
+ need_future_gp_element(rnp, c) = false;
+ needmore = need_any_future_gp(rnp);
+ trace_rcu_this_gp(rnp, rdp, c,
+ needmore ? TPS("CleanupMore") : TPS("Cleanup"));
return needmore;
}
@@ -1802,6 +1745,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp)
{
+ unsigned long c;
bool ret = false;
raw_lockdep_assert_held_rcu_node(rnp);
@@ -1820,8 +1764,9 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
* accelerating callback invocation to an earlier grace-period
* number.
*/
- if (rcu_segcblist_accelerate(&rdp->cblist, rcu_cbs_completed(rsp, rnp)))
- ret = rcu_start_future_gp(rnp, rdp, NULL);
+ c = rcu_cbs_completed(rsp, rnp);
+ if (rcu_segcblist_accelerate(&rdp->cblist, c))
+ ret = rcu_start_this_gp(rnp, rdp, c);
/* Trace depending on how much we were able to accelerate. */
if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL))
@@ -2108,7 +2053,6 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
{
unsigned long gp_duration;
bool needgp = false;
- int nocb = 0;
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
struct swait_queue_head *sq;
@@ -2147,7 +2091,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
if (rnp == rdp->mynode)
needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
/* smp_mb() provided by prior unlock-lock pair. */
- nocb += rcu_future_gp_cleanup(rsp, rnp);
+ needgp = rcu_future_gp_cleanup(rsp, rnp) || needgp;
sq = rcu_nocb_gp_get(rnp);
raw_spin_unlock_irq_rcu_node(rnp);
rcu_nocb_gp_cleanup(sq);
@@ -2157,21 +2101,25 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
}
rnp = rcu_get_root(rsp);
raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */
- rcu_nocb_gp_set(rnp, nocb);
/* Declare grace period done. */
WRITE_ONCE(rsp->completed, rsp->gpnum);
trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
rsp->gp_state = RCU_GP_IDLE;
+ /* Check for GP requests since above loop. */
rdp = this_cpu_ptr(rsp->rda);
+ if (need_any_future_gp(rnp)) {
+ trace_rcu_this_gp(rnp, rdp, rsp->completed - 1,
+ TPS("CleanupMore"));
+ needgp = true;
+ }
/* Advance CBs to reduce false positives below. */
- needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp;
- if (needgp || cpu_needs_another_gp(rsp, rdp)) {
+ if (!rcu_accelerate_cbs(rsp, rnp, rdp) && needgp) {
WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT);
- trace_rcu_grace_period(rsp->name,
- READ_ONCE(rsp->gpnum),
+ trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum),
TPS("newreq"));
}
+ WRITE_ONCE(rsp->gp_flags, rsp->gp_flags & RCU_GP_FLAG_INIT);
raw_spin_unlock_irq_rcu_node(rnp);
}
@@ -2283,71 +2231,6 @@ static int __noreturn rcu_gp_kthread(void *arg)
}
/*
- * Start a new RCU grace period if warranted, re-initializing the hierarchy
- * in preparation for detecting the next grace period. The caller must hold
- * the root node's ->lock and hard irqs must be disabled.
- *
- * Note that it is legal for a dying CPU (which is marked as offline) to
- * invoke this function. This can happen when the dying CPU reports its
- * quiescent state.
- *
- * Returns true if the grace-period kthread must be awakened.
- */
-static bool
-rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
- struct rcu_data *rdp)
-{
- raw_lockdep_assert_held_rcu_node(rnp);
- if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {
- /*
- * Either we have not yet spawned the grace-period
- * task, this CPU does not need another grace period,
- * or a grace period is already in progress.
- * Either way, don't start a new grace period.
- */
- return false;
- }
- WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT);
- trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum),
- TPS("newreq"));
-
- /*
- * We can't do wakeups while holding the rnp->lock, as that
- * could cause possible deadlocks with the rq->lock. Defer
- * the wakeup to our caller.
- */
- return true;
-}
-
-/*
- * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's
- * callbacks. Note that rcu_start_gp_advanced() cannot do this because it
- * is invoked indirectly from rcu_advance_cbs(), which would result in
- * endless recursion -- or would do so if it wasn't for the self-deadlock
- * that is encountered beforehand.
- *
- * Returns true if the grace-period kthread needs to be awakened.
- */
-static bool rcu_start_gp(struct rcu_state *rsp)
-{
- struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
- struct rcu_node *rnp = rcu_get_root(rsp);
- bool ret = false;
-
- /*
- * If there is no grace period in progress right now, any
- * callbacks we have up to this point will be satisfied by the
- * next grace period. Also, advancing the callbacks reduces the
- * probability of false positives from cpu_needs_another_gp()
- * resulting in pointless grace periods. So, advance callbacks
- * then start the grace period!
- */
- ret = rcu_advance_cbs(rsp, rnp, rdp) || ret;
- ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret;
- return ret;
-}
-
-/*
* Report a full set of quiescent states to the specified rcu_state data
* structure. Invoke rcu_gp_kthread_wake() to awaken the grace-period
* kthread if another grace period is required. Whether we wake
@@ -2874,22 +2757,27 @@ __rcu_process_callbacks(struct rcu_state *rsp)
unsigned long flags;
bool needwake;
struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
+ struct rcu_node *rnp;
WARN_ON_ONCE(!rdp->beenonline);
/* Update RCU state based on any recent quiescent states. */
rcu_check_quiescent_state(rsp, rdp);
- /* Does this CPU require a not-yet-started grace period? */
- local_irq_save(flags);
- if (cpu_needs_another_gp(rsp, rdp)) {
- raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */
- needwake = rcu_start_gp(rsp);
- raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags);
- if (needwake)
- rcu_gp_kthread_wake(rsp);
- } else {
- local_irq_restore(flags);
+ /* No grace period and unregistered callbacks? */
+ if (!rcu_gp_in_progress(rsp) &&
+ rcu_segcblist_is_enabled(&rdp->cblist)) {
+ local_irq_save(flags);
+ if (rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) {
+ local_irq_restore(flags);
+ } else {
+ rnp = rdp->mynode;
+ raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
+ needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
+ }
}
/* If there are callbacks ready, invoke them. */
@@ -2973,11 +2861,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
/* Start a new grace period if one not already started. */
if (!rcu_gp_in_progress(rsp)) {
- struct rcu_node *rnp_root = rcu_get_root(rsp);
+ struct rcu_node *rnp = rdp->mynode;
- raw_spin_lock_rcu_node(rnp_root);
- needwake = rcu_start_gp(rsp);
- raw_spin_unlock_rcu_node(rnp_root);
+ raw_spin_lock_rcu_node(rnp);
+ needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
+ raw_spin_unlock_rcu_node(rnp);
if (needwake)
rcu_gp_kthread_wake(rsp);
} else {
@@ -3368,7 +3256,9 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
return 1;
/* Has RCU gone idle with this CPU needing another grace period? */
- if (cpu_needs_another_gp(rsp, rdp))
+ if (!rcu_gp_in_progress(rsp) &&
+ rcu_segcblist_is_enabled(&rdp->cblist) &&
+ !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
return 1;
/* Has another RCU grace period completed? */
@@ -3861,6 +3751,7 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp)
struct rcu_data *my_rdp;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
+ bool needwake;
if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist))
return; /* No callbacks to migrate. */
@@ -3872,12 +3763,15 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp)
return;
}
raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
- rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */
- rcu_advance_cbs(rsp, rnp_root, my_rdp); /* Assign GP to pending CBs. */
+ /* Leverage recent GPs and set GP for new callbacks. */
+ needwake = rcu_advance_cbs(rsp, rnp_root, rdp) ||
+ rcu_advance_cbs(rsp, rnp_root, my_rdp);
rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
!rcu_segcblist_n_cbs(&my_rdp->cblist));
raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags);
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
!rcu_segcblist_empty(&rdp->cblist),
"rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
@@ -4168,6 +4062,7 @@ static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp)
}
struct workqueue_struct *rcu_gp_wq;
+struct workqueue_struct *rcu_par_gp_wq;
void __init rcu_init(void)
{
@@ -4199,6 +4094,8 @@ void __init rcu_init(void)
/* Create workqueue for expedited GPs and for Tree SRCU. */
rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
WARN_ON(!rcu_gp_wq);
+ rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
+ WARN_ON(!rcu_par_gp_wq);
}
#include "tree_exp.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 0b3a90ebe225..78e051dffc5b 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -58,6 +58,14 @@ struct rcu_dynticks {
#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
};
+/* Communicate arguments to a workqueue handler. */
+struct rcu_exp_work {
+ smp_call_func_t rew_func;
+ struct rcu_state *rew_rsp;
+ unsigned long rew_s;
+ struct work_struct rew_work;
+};
+
/* RCU's kthread states for tracing. */
#define RCU_KTHREAD_STOPPED 0
#define RCU_KTHREAD_RUNNING 1
@@ -150,15 +158,32 @@ struct rcu_node {
struct swait_queue_head nocb_gp_wq[2];
/* Place for rcu_nocb_kthread() to wait GP. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
- int need_future_gp[2];
- /* Counts of upcoming no-CB GP requests. */
+ u8 need_future_gp[4]; /* Counts of upcoming GP requests. */
raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
spinlock_t exp_lock ____cacheline_internodealigned_in_smp;
unsigned long exp_seq_rq;
wait_queue_head_t exp_wq[4];
+ struct rcu_exp_work rew;
+ bool exp_need_flush; /* Need to flush workitem? */
} ____cacheline_internodealigned_in_smp;
+/* Accessors for ->need_future_gp[] array. */
+#define need_future_gp_mask() \
+ (ARRAY_SIZE(((struct rcu_node *)NULL)->need_future_gp) - 1)
+#define need_future_gp_element(rnp, c) \
+ ((rnp)->need_future_gp[(c) & need_future_gp_mask()])
+#define need_any_future_gp(rnp) \
+({ \
+ int __i; \
+ bool __nonzero = false; \
+ \
+ for (__i = 0; __i < ARRAY_SIZE((rnp)->need_future_gp); __i++) \
+ __nonzero = __nonzero || \
+ READ_ONCE((rnp)->need_future_gp[__i]); \
+ __nonzero; \
+})
+
/*
* Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and
* are indexed relative to this interval rather than the global CPU ID space.
@@ -448,7 +473,6 @@ static void print_cpu_stall_info_end(void);
static void zero_cpu_stall_ticks(struct rcu_data *rdp);
static void increment_cpu_stall_ticks(void);
static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
-static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
static void rcu_init_one_nocb(struct rcu_node *rnp);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index f512dd4e57a8..d40708e8c5d6 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -20,6 +20,8 @@
* Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
*/
+#include <linux/lockdep.h>
+
/*
* Record the start of an expedited grace period.
*/
@@ -154,15 +156,35 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
* for the current expedited grace period. Works only for preemptible
* RCU -- other RCU implementation use other means.
*
- * Caller must hold the rcu_state's exp_mutex.
+ * Caller must hold the specificed rcu_node structure's ->lock
*/
static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp)
{
+ raw_lockdep_assert_held_rcu_node(rnp);
+
return rnp->exp_tasks == NULL &&
READ_ONCE(rnp->expmask) == 0;
}
/*
+ * Like sync_rcu_preempt_exp_done(), but this function assumes the caller
+ * doesn't hold the rcu_node's ->lock, and will acquire and release the lock
+ * itself
+ */
+static bool sync_rcu_preempt_exp_done_unlocked(struct rcu_node *rnp)
+{
+ unsigned long flags;
+ bool ret;
+
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ ret = sync_rcu_preempt_exp_done(rnp);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+ return ret;
+}
+
+
+/*
* Report the exit from RCU read-side critical section for the last task
* that queued itself during or before the current expedited preemptible-RCU
* grace period. This event is reported either to the rcu_node structure on
@@ -170,8 +192,7 @@ static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp)
* recursively up the tree. (Calm down, calm down, we do the recursion
* iteratively!)
*
- * Caller must hold the rcu_state's exp_mutex and the specified rcu_node
- * structure's ->lock.
+ * Caller must hold the specified rcu_node structure's ->lock.
*/
static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
bool wake, unsigned long flags)
@@ -207,8 +228,6 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
/*
* Report expedited quiescent state for specified node. This is a
* lock-acquisition wrapper function for __rcu_report_exp_rnp().
- *
- * Caller must hold the rcu_state's exp_mutex.
*/
static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
struct rcu_node *rnp, bool wake)
@@ -221,8 +240,7 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
/*
* Report expedited quiescent state for multiple CPUs, all covered by the
- * specified leaf rcu_node structure. Caller must hold the rcu_state's
- * exp_mutex.
+ * specified leaf rcu_node structure.
*/
static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
unsigned long mask, bool wake)
@@ -359,93 +377,129 @@ static void sync_sched_exp_online_cleanup(int cpu)
}
/*
- * Select the nodes that the upcoming expedited grace period needs
- * to wait for.
+ * Select the CPUs within the specified rcu_node that the upcoming
+ * expedited grace period needs to wait for.
*/
-static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
- smp_call_func_t func)
+static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
{
int cpu;
unsigned long flags;
+ smp_call_func_t func;
unsigned long mask_ofl_test;
unsigned long mask_ofl_ipi;
int ret;
- struct rcu_node *rnp;
-
- trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset"));
- sync_exp_reset_tree(rsp);
- trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select"));
- rcu_for_each_leaf_node(rsp, rnp) {
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ struct rcu_exp_work *rewp =
+ container_of(wp, struct rcu_exp_work, rew_work);
+ struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew);
+ struct rcu_state *rsp = rewp->rew_rsp;
- /* Each pass checks a CPU for identity, offline, and idle. */
- mask_ofl_test = 0;
- for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
- unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
- struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
- struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu);
- int snap;
+ func = rewp->rew_func;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
- if (raw_smp_processor_id() == cpu ||
- !(rnp->qsmaskinitnext & mask)) {
+ /* Each pass checks a CPU for identity, offline, and idle. */
+ mask_ofl_test = 0;
+ for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+ unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu);
+ int snap;
+
+ if (raw_smp_processor_id() == cpu ||
+ !(rnp->qsmaskinitnext & mask)) {
+ mask_ofl_test |= mask;
+ } else {
+ snap = rcu_dynticks_snap(rdtp);
+ if (rcu_dynticks_in_eqs(snap))
mask_ofl_test |= mask;
- } else {
- snap = rcu_dynticks_snap(rdtp);
- if (rcu_dynticks_in_eqs(snap))
- mask_ofl_test |= mask;
- else
- rdp->exp_dynticks_snap = snap;
- }
+ else
+ rdp->exp_dynticks_snap = snap;
}
- mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
-
- /*
- * Need to wait for any blocked tasks as well. Note that
- * additional blocking tasks will also block the expedited
- * GP until such time as the ->expmask bits are cleared.
- */
- if (rcu_preempt_has_tasks(rnp))
- rnp->exp_tasks = rnp->blkd_tasks.next;
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+ mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
- /* IPI the remaining CPUs for expedited quiescent state. */
- for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
- unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
- struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ /*
+ * Need to wait for any blocked tasks as well. Note that
+ * additional blocking tasks will also block the expedited GP
+ * until such time as the ->expmask bits are cleared.
+ */
+ if (rcu_preempt_has_tasks(rnp))
+ rnp->exp_tasks = rnp->blkd_tasks.next;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- if (!(mask_ofl_ipi & mask))
- continue;
+ /* IPI the remaining CPUs for expedited quiescent state. */
+ for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+ unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+
+ if (!(mask_ofl_ipi & mask))
+ continue;
retry_ipi:
- if (rcu_dynticks_in_eqs_since(rdp->dynticks,
- rdp->exp_dynticks_snap)) {
- mask_ofl_test |= mask;
- continue;
- }
- ret = smp_call_function_single(cpu, func, rsp, 0);
- if (!ret) {
- mask_ofl_ipi &= ~mask;
- continue;
- }
- /* Failed, raced with CPU hotplug operation. */
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- if ((rnp->qsmaskinitnext & mask) &&
- (rnp->expmask & mask)) {
- /* Online, so delay for a bit and try again. */
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl"));
- schedule_timeout_uninterruptible(1);
- goto retry_ipi;
- }
- /* CPU really is offline, so we can ignore it. */
- if (!(rnp->expmask & mask))
- mask_ofl_ipi &= ~mask;
+ if (rcu_dynticks_in_eqs_since(rdp->dynticks,
+ rdp->exp_dynticks_snap)) {
+ mask_ofl_test |= mask;
+ continue;
+ }
+ ret = smp_call_function_single(cpu, func, rsp, 0);
+ if (!ret) {
+ mask_ofl_ipi &= ~mask;
+ continue;
+ }
+ /* Failed, raced with CPU hotplug operation. */
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if ((rnp->qsmaskinitnext & mask) &&
+ (rnp->expmask & mask)) {
+ /* Online, so delay for a bit and try again. */
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl"));
+ schedule_timeout_uninterruptible(1);
+ goto retry_ipi;
+ }
+ /* CPU really is offline, so we can ignore it. */
+ if (!(rnp->expmask & mask))
+ mask_ofl_ipi &= ~mask;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+ /* Report quiescent states for those that went offline. */
+ mask_ofl_test |= mask_ofl_ipi;
+ if (mask_ofl_test)
+ rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
+}
+
+/*
+ * Select the nodes that the upcoming expedited grace period needs
+ * to wait for.
+ */
+static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
+ smp_call_func_t func)
+{
+ struct rcu_node *rnp;
+
+ trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset"));
+ sync_exp_reset_tree(rsp);
+ trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select"));
+
+ /* Schedule work for each leaf rcu_node structure. */
+ rcu_for_each_leaf_node(rsp, rnp) {
+ rnp->exp_need_flush = false;
+ if (!READ_ONCE(rnp->expmask))
+ continue; /* Avoid early boot non-existent wq. */
+ rnp->rew.rew_func = func;
+ rnp->rew.rew_rsp = rsp;
+ if (!READ_ONCE(rcu_par_gp_wq) ||
+ rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
+ /* No workqueues yet. */
+ sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work);
+ continue;
}
- /* Report quiescent states for those that went offline. */
- mask_ofl_test |= mask_ofl_ipi;
- if (mask_ofl_test)
- rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
+ INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
+ queue_work_on(rnp->grplo, rcu_par_gp_wq, &rnp->rew.rew_work);
+ rnp->exp_need_flush = true;
}
+
+ /* Wait for workqueue jobs (if any) to complete. */
+ rcu_for_each_leaf_node(rsp, rnp)
+ if (rnp->exp_need_flush)
+ flush_work(&rnp->rew.rew_work);
}
static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
@@ -466,9 +520,9 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
for (;;) {
ret = swait_event_timeout(
rsp->expedited_wq,
- sync_rcu_preempt_exp_done(rnp_root),
+ sync_rcu_preempt_exp_done_unlocked(rnp_root),
jiffies_stall);
- if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root))
+ if (ret > 0 || sync_rcu_preempt_exp_done_unlocked(rnp_root))
return;
WARN_ON(ret < 0); /* workqueues should not be signaled. */
if (rcu_cpu_stall_suppress)
@@ -501,7 +555,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
rcu_for_each_node_breadth_first(rsp, rnp) {
if (rnp == rnp_root)
continue; /* printed unconditionally */
- if (sync_rcu_preempt_exp_done(rnp))
+ if (sync_rcu_preempt_exp_done_unlocked(rnp))
continue;
pr_cont(" l=%u:%d-%d:%#lx/%c",
rnp->level, rnp->grplo, rnp->grphi,
@@ -557,14 +611,6 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
mutex_unlock(&rsp->exp_wake_mutex);
}
-/* Let the workqueue handler know what it is supposed to do. */
-struct rcu_exp_work {
- smp_call_func_t rew_func;
- struct rcu_state *rew_rsp;
- unsigned long rew_s;
- struct work_struct rew_work;
-};
-
/*
* Common code to drive an expedited grace period forward, used by
* workqueues and mid-boot-time tasks.
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index b999032e9466..7fd12039e512 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1815,19 +1815,6 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
swake_up_all(sq);
}
-/*
- * Set the root rcu_node structure's ->need_future_gp field
- * based on the sum of those of all rcu_node structures. This does
- * double-count the root rcu_node structure's requests, but this
- * is necessary to handle the possibility of a rcu_nocb_kthread()
- * having awakened during the time that the rcu_node structures
- * were being updated for the end of the previous grace period.
- */
-static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
-{
- rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
-}
-
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
{
return &rnp->nocb_gp_wq[rnp->completed & 0x1];
@@ -2083,7 +2070,8 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
struct rcu_node *rnp = rdp->mynode;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
- needwake = rcu_start_future_gp(rnp, rdp, &c);
+ c = rcu_cbs_completed(rdp->rsp, rnp);
+ needwake = rcu_start_this_gp(rnp, rdp, c);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
if (needwake)
rcu_gp_kthread_wake(rdp->rsp);
@@ -2092,7 +2080,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
* Wait for the grace period. Do so interruptibly to avoid messing
* up the load average.
*/
- trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
+ trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait"));
for (;;) {
swait_event_interruptible(
rnp->nocb_gp_wq[c & 0x1],
@@ -2100,9 +2088,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
if (likely(d))
break;
WARN_ON(signal_pending(current));
- trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait"));
+ trace_rcu_this_gp(rnp, rdp, c, TPS("ResumeWait"));
}
- trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait"));
+ trace_rcu_this_gp(rnp, rdp, c, TPS("EndWait"));
smp_mb(); /* Ensure that CB invocation happens after GP end. */
}
@@ -2530,10 +2518,6 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
{
}
-static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
-{
-}
-
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
{
return NULL;