diff options
author | Paul E. McKenney <paulmck@linux.vnet.ibm.com> | 2015-08-02 22:53:17 +0200 |
---|---|---|
committer | Paul E. McKenney <paulmck@linux.vnet.ibm.com> | 2015-09-21 06:16:19 +0200 |
commit | 8203d6d0ee784cfb2ebf89053f7fe399abc867d7 (patch) | |
tree | cb0598f473834f117a06b1eeab32576fcddd0e52 /kernel/rcu/tree.c | |
parent | rcu: Consolidate tree setup for synchronize_rcu_expedited() (diff) | |
download | linux-8203d6d0ee784cfb2ebf89053f7fe399abc867d7.tar.xz linux-8203d6d0ee784cfb2ebf89053f7fe399abc867d7.zip |
rcu: Use single-stage IPI algorithm for RCU expedited grace period
The current preemptible-RCU expedited grace-period algorithm invokes
synchronize_sched_expedited() to enqueue all tasks currently running
in a preemptible-RCU read-side critical section, then waits for all the
->blkd_tasks lists to drain. This works, but results in both an IPI and
a double context switch even on CPUs that do not happen to be running
in a preemptible RCU read-side critical section.
This commit implements a new algorithm that causes less OS jitter.
This new algorithm IPIs all online CPUs that are not idle (from an
RCU perspective), but refrains from self-IPIs. If a CPU receiving
this IPI is not in a preemptible RCU read-side critical section (or
is just now exiting one), it pushes quiescence up the rcu_node tree,
otherwise, it sets a flag that will be handled by the upcoming outermost
rcu_read_unlock(), which will then push quiescence up the tree.
The expedited grace period must of course wait on any pre-existing blocked
readers, and newly blocked readers must be queued carefully based on
the state of both the normal and the expedited grace periods. This
new queueing approach also avoids the need to update boost state,
courtesy of the fact that blocked tasks are no longer ever migrated to
the root rcu_node structure.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Diffstat (limited to 'kernel/rcu/tree.c')
-rw-r--r-- | kernel/rcu/tree.c | 75 |
1 files changed, 55 insertions, 20 deletions
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8526896afea7..6e92bf4337bd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3461,18 +3461,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) } /* - * Return non-zero if there are any tasks in RCU read-side critical - * sections blocking the current preemptible-RCU expedited grace period. - * If there is no preemptible-RCU expedited grace period currently in - * progress, returns zero unconditionally. - */ -static int rcu_preempted_readers_exp(struct rcu_node *rnp) -{ - return rnp->exp_tasks != NULL; -} - -/* - * return non-zero if there is no RCU expedited grace period in progress + * Return non-zero if there is no RCU expedited grace period in progress * for the specified rcu_node structure, in other words, if all CPUs and * tasks covered by the specified rcu_node structure have done their bit * for the current expedited grace period. Works only for preemptible @@ -3482,7 +3471,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp) */ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) { - return !rcu_preempted_readers_exp(rnp) && + return rnp->exp_tasks == NULL && READ_ONCE(rnp->expmask) == 0; } @@ -3494,19 +3483,21 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) * - * Caller must hold the root rcu_node's exp_funnel_mutex. + * Caller must hold the root rcu_node's exp_funnel_mutex and the + * specified rcu_node structure's ->lock. */ -static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, - struct rcu_node *rnp, bool wake) +static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake, unsigned long flags) + __releases(rnp->lock) { - unsigned long flags; unsigned long mask; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); for (;;) { if (!sync_rcu_preempt_exp_done(rnp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + if (!rnp->expmask) + rcu_initiate_boost(rnp, flags); + else + raw_spin_unlock_irqrestore(&rnp->lock, flags); break; } if (rnp->parent == NULL) { @@ -3522,10 +3513,54 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, rnp = rnp->parent; raw_spin_lock(&rnp->lock); /* irqs already disabled */ smp_mb__after_unlock_lock(); + WARN_ON_ONCE(!(rnp->expmask & mask)); rnp->expmask &= ~mask; } } +/* + * Report expedited quiescent state for specified node. This is a + * lock-acquisition wrapper function for __rcu_report_exp_rnp(). + * + * Caller must hold the root rcu_node's exp_funnel_mutex. + */ +static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, + struct rcu_node *rnp, bool wake) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + __rcu_report_exp_rnp(rsp, rnp, wake, flags); +} + +/* + * Report expedited quiescent state for multiple CPUs, all covered by the + * specified leaf rcu_node structure. Caller must hold the root + * rcu_node's exp_funnel_mutex. + */ +static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, + unsigned long mask, bool wake) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + WARN_ON_ONCE((rnp->expmask & mask) != mask); + rnp->expmask &= ~mask; + __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ +} + +/* + * Report expedited quiescent state for specified rcu_data (CPU). + * Caller must hold the root rcu_node's exp_funnel_mutex. + */ +static void __maybe_unused rcu_report_exp_rdp(struct rcu_state *rsp, + struct rcu_data *rdp, bool wake) +{ + rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); +} + /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp, |