summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/rcupdate.c28
-rw-r--r--kernel/rcutiny_plugin.h16
-rw-r--r--kernel/rcutree.c332
-rw-r--r--kernel/rcutree.h23
-rw-r--r--kernel/rcutree_plugin.h154
-rw-r--r--kernel/rcutree_trace.c4
-rw-r--r--kernel/sched/core.c1
-rw-r--r--kernel/timer.c8
8 files changed, 396 insertions, 170 deletions
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a86f1741cc27..95cba41ce1e9 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -51,6 +51,34 @@
#include "rcu.h"
+#ifdef CONFIG_PREEMPT_RCU
+
+/*
+ * Check for a task exiting while in a preemptible-RCU read-side
+ * critical section, clean up if so. No need to issue warnings,
+ * as debug_check_no_locks_held() already does this if lockdep
+ * is enabled.
+ */
+void exit_rcu(void)
+{
+ struct task_struct *t = current;
+
+ if (likely(list_empty(&current->rcu_node_entry)))
+ return;
+ t->rcu_read_lock_nesting = 1;
+ barrier();
+ t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
+ __rcu_read_unlock();
+}
+
+#else /* #ifdef CONFIG_PREEMPT_RCU */
+
+void exit_rcu(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key rcu_lock_key;
struct lockdep_map rcu_lock_map =
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 22ecea0dfb62..fc31a2d65100 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -851,22 +851,6 @@ int rcu_preempt_needs_cpu(void)
return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
}
-/*
- * Check for a task exiting while in a preemptible -RCU read-side
- * critical section, clean up if so. No need to issue warnings,
- * as debug_check_no_locks_held() already does this if lockdep
- * is enabled.
- */
-void exit_rcu(void)
-{
- struct task_struct *t = current;
-
- if (t->rcu_read_lock_nesting == 0)
- return;
- t->rcu_read_lock_nesting = 1;
- __rcu_read_unlock();
-}
-
#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
#ifdef CONFIG_RCU_TRACE
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 1050d6d3922c..b3ea3ac3a2b5 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -75,6 +75,8 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
.gpnum = -300, \
.completed = -300, \
.onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
+ .orphan_nxttail = &structname##_state.orphan_nxtlist, \
+ .orphan_donetail = &structname##_state.orphan_donelist, \
.fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \
.n_force_qs = 0, \
.n_force_qs_ngp = 0, \
@@ -145,6 +147,13 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
unsigned long rcutorture_testseq;
unsigned long rcutorture_vernum;
+/* State information for rcu_barrier() and friends. */
+
+static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
+static atomic_t rcu_barrier_cpu_count;
+static DEFINE_MUTEX(rcu_barrier_mutex);
+static struct completion rcu_barrier_completion;
+
/*
* Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
* permit this function to be invoked without holding the root rcu_node
@@ -192,7 +201,6 @@ void rcu_note_context_switch(int cpu)
{
trace_rcu_utilization("Start context switch");
rcu_sched_qs(cpu);
- rcu_preempt_note_context_switch(cpu);
trace_rcu_utilization("End context switch");
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -1311,95 +1319,133 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
#ifdef CONFIG_HOTPLUG_CPU
/*
- * Move a dying CPU's RCU callbacks to online CPU's callback list.
- * Also record a quiescent state for this CPU for the current grace period.
- * Synchronization and interrupt disabling are not required because
- * this function executes in stop_machine() context. Therefore, cleanup
- * operations that might block must be done later from the CPU_DEAD
- * notifier.
- *
- * Note that the outgoing CPU's bit has already been cleared in the
- * cpu_online_mask. This allows us to randomly pick a callback
- * destination from the bits set in that mask.
+ * Send the specified CPU's RCU callbacks to the orphanage. The
+ * specified CPU must be offline, and the caller must hold the
+ * ->onofflock.
*/
-static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
+static void
+rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
+ struct rcu_node *rnp, struct rcu_data *rdp)
{
int i;
- unsigned long mask;
- int receive_cpu = cpumask_any(cpu_online_mask);
- struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
- struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
- RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */
- /* First, adjust the counts. */
+ /*
+ * Orphan the callbacks. First adjust the counts. This is safe
+ * because ->onofflock excludes _rcu_barrier()'s adoption of
+ * the callbacks, thus no memory barrier is required.
+ */
if (rdp->nxtlist != NULL) {
- receive_rdp->qlen_lazy += rdp->qlen_lazy;
- receive_rdp->qlen += rdp->qlen;
+ rsp->qlen_lazy += rdp->qlen_lazy;
+ rsp->qlen += rdp->qlen;
+ rdp->n_cbs_orphaned += rdp->qlen;
rdp->qlen_lazy = 0;
rdp->qlen = 0;
}
/*
- * Next, move ready-to-invoke callbacks to be invoked on some
- * other CPU. These will not be required to pass through another
- * grace period: They are done, regardless of CPU.
+ * Next, move those callbacks still needing a grace period to
+ * the orphanage, where some other CPU will pick them up.
+ * Some of the callbacks might have gone partway through a grace
+ * period, but that is too bad. They get to start over because we
+ * cannot assume that grace periods are synchronized across CPUs.
+ * We don't bother updating the ->nxttail[] array yet, instead
+ * we just reset the whole thing later on.
*/
- if (rdp->nxtlist != NULL &&
- rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) {
- struct rcu_head *oldhead;
- struct rcu_head **oldtail;
- struct rcu_head **newtail;
-
- oldhead = rdp->nxtlist;
- oldtail = receive_rdp->nxttail[RCU_DONE_TAIL];
- rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
- *rdp->nxttail[RCU_DONE_TAIL] = *oldtail;
- *receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead;
- newtail = rdp->nxttail[RCU_DONE_TAIL];
- for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) {
- if (receive_rdp->nxttail[i] == oldtail)
- receive_rdp->nxttail[i] = newtail;
- if (rdp->nxttail[i] == newtail)
- rdp->nxttail[i] = &rdp->nxtlist;
- }
+ if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) {
+ *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL];
+ rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL];
+ *rdp->nxttail[RCU_DONE_TAIL] = NULL;
}
/*
- * Finally, put the rest of the callbacks at the end of the list.
- * The ones that made it partway through get to start over: We
- * cannot assume that grace periods are synchronized across CPUs.
- * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but
- * this does not seem compelling. Not yet, anyway.)
+ * Then move the ready-to-invoke callbacks to the orphanage,
+ * where some other CPU will pick them up. These will not be
+ * required to pass though another grace period: They are done.
*/
if (rdp->nxtlist != NULL) {
- *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
- receive_rdp->nxttail[RCU_NEXT_TAIL] =
- rdp->nxttail[RCU_NEXT_TAIL];
- receive_rdp->n_cbs_adopted += rdp->qlen;
- rdp->n_cbs_orphaned += rdp->qlen;
-
- rdp->nxtlist = NULL;
- for (i = 0; i < RCU_NEXT_SIZE; i++)
- rdp->nxttail[i] = &rdp->nxtlist;
+ *rsp->orphan_donetail = rdp->nxtlist;
+ rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
}
+ /* Finally, initialize the rcu_data structure's list to empty. */
+ rdp->nxtlist = NULL;
+ for (i = 0; i < RCU_NEXT_SIZE; i++)
+ rdp->nxttail[i] = &rdp->nxtlist;
+}
+
+/*
+ * Adopt the RCU callbacks from the specified rcu_state structure's
+ * orphanage. The caller must hold the ->onofflock.
+ */
+static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
+{
+ int i;
+ struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
+
/*
- * Record a quiescent state for the dying CPU. This is safe
- * only because we have already cleared out the callbacks.
- * (Otherwise, the RCU core might try to schedule the invocation
- * of callbacks on this now-offline CPU, which would be bad.)
+ * If there is an rcu_barrier() operation in progress, then
+ * only the task doing that operation is permitted to adopt
+ * callbacks. To do otherwise breaks rcu_barrier() and friends
+ * by causing them to fail to wait for the callbacks in the
+ * orphanage.
*/
- mask = rdp->grpmask; /* rnp->grplo is constant. */
+ if (rsp->rcu_barrier_in_progress &&
+ rsp->rcu_barrier_in_progress != current)
+ return;
+
+ /* Do the accounting first. */
+ rdp->qlen_lazy += rsp->qlen_lazy;
+ rdp->qlen += rsp->qlen;
+ rdp->n_cbs_adopted += rsp->qlen;
+ rsp->qlen_lazy = 0;
+ rsp->qlen = 0;
+
+ /*
+ * We do not need a memory barrier here because the only way we
+ * can get here if there is an rcu_barrier() in flight is if
+ * we are the task doing the rcu_barrier().
+ */
+
+ /* First adopt the ready-to-invoke callbacks. */
+ if (rsp->orphan_donelist != NULL) {
+ *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL];
+ *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist;
+ for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--)
+ if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
+ rdp->nxttail[i] = rsp->orphan_donetail;
+ rsp->orphan_donelist = NULL;
+ rsp->orphan_donetail = &rsp->orphan_donelist;
+ }
+
+ /* And then adopt the callbacks that still need a grace period. */
+ if (rsp->orphan_nxtlist != NULL) {
+ *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist;
+ rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail;
+ rsp->orphan_nxtlist = NULL;
+ rsp->orphan_nxttail = &rsp->orphan_nxtlist;
+ }
+}
+
+/*
+ * Trace the fact that this CPU is going offline.
+ */
+static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
+{
+ RCU_TRACE(unsigned long mask);
+ RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda));
+ RCU_TRACE(struct rcu_node *rnp = rdp->mynode);
+
+ RCU_TRACE(mask = rdp->grpmask);
trace_rcu_grace_period(rsp->name,
rnp->gpnum + 1 - !!(rnp->qsmask & mask),
"cpuofl");
- rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum);
- /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */
}
/*
* The CPU has been completely removed, and some other CPU is reporting
- * this fact from process context. Do the remainder of the cleanup.
+ * this fact from process context. Do the remainder of the cleanup,
+ * including orphaning the outgoing CPU's RCU callbacks, and also
+ * adopting them, if there is no _rcu_barrier() instance running.
* There can only be one CPU hotplug operation at a time, so no other
* CPU can be attempting to update rcu_cpu_kthread_task.
*/
@@ -1409,17 +1455,21 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
unsigned long mask;
int need_report = 0;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
- struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rnp. */
+ struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
/* Adjust any no-longer-needed kthreads. */
rcu_stop_cpu_kthread(cpu);
rcu_node_kthread_setaffinity(rnp, -1);
- /* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */
+ /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
/* Exclude any attempts to start a new grace period. */
raw_spin_lock_irqsave(&rsp->onofflock, flags);
+ /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
+ rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
+ rcu_adopt_orphan_cbs(rsp);
+
/* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
mask = rdp->grpmask; /* rnp->grplo is constant. */
do {
@@ -1456,6 +1506,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
#else /* #ifdef CONFIG_HOTPLUG_CPU */
+static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
+{
+}
+
static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
{
}
@@ -1524,9 +1578,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
rcu_is_callbacks_kthread());
/* Update count, and requeue any remaining callbacks. */
- rdp->qlen_lazy -= count_lazy;
- rdp->qlen -= count;
- rdp->n_cbs_invoked += count;
if (list != NULL) {
*tail = rdp->nxtlist;
rdp->nxtlist = list;
@@ -1536,6 +1587,10 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
else
break;
}
+ smp_mb(); /* List handling before counting for rcu_barrier(). */
+ rdp->qlen_lazy -= count_lazy;
+ rdp->qlen -= count;
+ rdp->n_cbs_invoked += count;
/* Reinstate batch limit if we have worked down the excess. */
if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
@@ -1824,11 +1879,14 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
rdp = this_cpu_ptr(rsp->rda);
/* Add the callback to our list. */
- *rdp->nxttail[RCU_NEXT_TAIL] = head;
- rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
rdp->qlen++;
if (lazy)
rdp->qlen_lazy++;
+ else
+ rcu_idle_count_callbacks_posted();
+ smp_mb(); /* Count before adding callback for rcu_barrier(). */
+ *rdp->nxttail[RCU_NEXT_TAIL] = head;
+ rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
if (__is_kfree_rcu_offset((unsigned long)func))
trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
@@ -1894,6 +1952,38 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
}
EXPORT_SYMBOL_GPL(call_rcu_bh);
+/*
+ * Because a context switch is a grace period for RCU-sched and RCU-bh,
+ * any blocking grace-period wait automatically implies a grace period
+ * if there is only one CPU online at any point time during execution
+ * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to
+ * occasionally incorrectly indicate that there are multiple CPUs online
+ * when there was in fact only one the whole time, as this just adds
+ * some overhead: RCU still operates correctly.
+ *
+ * Of course, sampling num_online_cpus() with preemption enabled can
+ * give erroneous results if there are concurrent CPU-hotplug operations.
+ * For example, given a demonic sequence of preemptions in num_online_cpus()
+ * and CPU-hotplug operations, there could be two or more CPUs online at
+ * all times, but num_online_cpus() might well return one (or even zero).
+ *
+ * However, all such demonic sequences require at least one CPU-offline
+ * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer
+ * is only a problem if there is an RCU read-side critical section executing
+ * throughout. But RCU-sched and RCU-bh read-side critical sections
+ * disable either preemption or bh, which prevents a CPU from going offline.
+ * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return
+ * that there is only one CPU when in fact there was more than one throughout
+ * is when there were no RCU readers in the system. If there are no
+ * RCU readers, the grace period by definition can be of zero length,
+ * regardless of the number of online CPUs.
+ */
+static inline int rcu_blocking_is_gp(void)
+{
+ might_sleep(); /* Check for RCU read-side critical section. */
+ return num_online_cpus() <= 1;
+}
+
/**
* synchronize_sched - wait until an rcu-sched grace period has elapsed.
*
@@ -2167,11 +2257,10 @@ static int rcu_cpu_has_callbacks(int cpu)
rcu_preempt_cpu_has_callbacks(cpu);
}
-static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
-static atomic_t rcu_barrier_cpu_count;
-static DEFINE_MUTEX(rcu_barrier_mutex);
-static struct completion rcu_barrier_completion;
-
+/*
+ * RCU callback function for _rcu_barrier(). If we are last, wake
+ * up the task executing _rcu_barrier().
+ */
static void rcu_barrier_callback(struct rcu_head *notused)
{
if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -2201,27 +2290,94 @@ static void _rcu_barrier(struct rcu_state *rsp,
void (*call_rcu_func)(struct rcu_head *head,
void (*func)(struct rcu_head *head)))
{
- BUG_ON(in_interrupt());
+ int cpu;
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_head rh;
+
+ init_rcu_head_on_stack(&rh);
+
/* Take mutex to serialize concurrent rcu_barrier() requests. */
mutex_lock(&rcu_barrier_mutex);
- init_completion(&rcu_barrier_completion);
+
+ smp_mb(); /* Prevent any prior operations from leaking in. */
+
/*
- * Initialize rcu_barrier_cpu_count to 1, then invoke
- * rcu_barrier_func() on each CPU, so that each CPU also has
- * incremented rcu_barrier_cpu_count. Only then is it safe to
- * decrement rcu_barrier_cpu_count -- otherwise the first CPU
- * might complete its grace period before all of the other CPUs
- * did their increment, causing this function to return too
- * early. Note that on_each_cpu() disables irqs, which prevents
- * any CPUs from coming online or going offline until each online
- * CPU has queued its RCU-barrier callback.
+ * Initialize the count to one rather than to zero in order to
+ * avoid a too-soon return to zero in case of a short grace period
+ * (or preemption of this task). Also flag this task as doing
+ * an rcu_barrier(). This will prevent anyone else from adopting
+ * orphaned callbacks, which could cause otherwise failure if a
+ * CPU went offline and quickly came back online. To see this,
+ * consider the following sequence of events:
+ *
+ * 1. We cause CPU 0 to post an rcu_barrier_callback() callback.
+ * 2. CPU 1 goes offline, orphaning its callbacks.
+ * 3. CPU 0 adopts CPU 1's orphaned callbacks.
+ * 4. CPU 1 comes back online.
+ * 5. We cause CPU 1 to post an rcu_barrier_callback() callback.
+ * 6. Both rcu_barrier_callback() callbacks are invoked, awakening
+ * us -- but before CPU 1's orphaned callbacks are invoked!!!
*/
+ init_completion(&rcu_barrier_completion);
atomic_set(&rcu_barrier_cpu_count, 1);
- on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
+ raw_spin_lock_irqsave(&rsp->onofflock, flags);
+ rsp->rcu_barrier_in_progress = current;
+ raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
+
+ /*
+ * Force every CPU with callbacks to register a new callback
+ * that will tell us when all the preceding callbacks have
+ * been invoked. If an offline CPU has callbacks, wait for
+ * it to either come back online or to finish orphaning those
+ * callbacks.
+ */
+ for_each_possible_cpu(cpu) {
+ preempt_disable();
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ if (cpu_is_offline(cpu)) {
+ preempt_enable();
+ while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen))
+ schedule_timeout_interruptible(1);
+ } else if (ACCESS_ONCE(rdp->qlen)) {
+ smp_call_function_single(cpu, rcu_barrier_func,
+ (void *)call_rcu_func, 1);
+ preempt_enable();
+ } else {
+ preempt_enable();
+ }
+ }
+
+ /*
+ * Now that all online CPUs have rcu_barrier_callback() callbacks
+ * posted, we can adopt all of the orphaned callbacks and place
+ * an rcu_barrier_callback() callback after them. When that is done,
+ * we are guaranteed to have an rcu_barrier_callback() callback
+ * following every callback that could possibly have been
+ * registered before _rcu_barrier() was called.
+ */
+ raw_spin_lock_irqsave(&rsp->onofflock, flags);
+ rcu_adopt_orphan_cbs(rsp);
+ rsp->rcu_barrier_in_progress = NULL;
+ raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
+ atomic_inc(&rcu_barrier_cpu_count);
+ smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */
+ call_rcu_func(&rh, rcu_barrier_callback);
+
+ /*
+ * Now that we have an rcu_barrier_callback() callback on each
+ * CPU, and thus each counted, remove the initial count.
+ */
if (atomic_dec_and_test(&rcu_barrier_cpu_count))
complete(&rcu_barrier_completion);
+
+ /* Wait for all rcu_barrier_callback() callbacks to be invoked. */
wait_for_completion(&rcu_barrier_completion);
+
+ /* Other rcu_barrier() invocations can now safely proceed. */
mutex_unlock(&rcu_barrier_mutex);
+
+ destroy_rcu_head_on_stack(&rh);
}
/**
@@ -2418,7 +2574,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
for (i = NUM_RCU_LVLS - 1; i > 0; i--)
rsp->levelspread[i] = CONFIG_RCU_FANOUT;
- rsp->levelspread[0] = RCU_FANOUT_LEAF;
+ rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF;
}
#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
static void __init rcu_init_levelspread(struct rcu_state *rsp)
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index cdd1be0a4072..7f5d138dedf5 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -29,18 +29,14 @@
#include <linux/seqlock.h>
/*
- * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
+ * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
+ * CONFIG_RCU_FANOUT_LEAF.
* In theory, it should be possible to add more levels straightforwardly.
* In practice, this did work well going from three levels to four.
* Of course, your mileage may vary.
*/
#define MAX_RCU_LVLS 4
-#if CONFIG_RCU_FANOUT > 16
-#define RCU_FANOUT_LEAF 16
-#else /* #if CONFIG_RCU_FANOUT > 16 */
-#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
-#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
-#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
+#define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF)
#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
@@ -371,6 +367,17 @@ struct rcu_state {
raw_spinlock_t onofflock; /* exclude on/offline and */
/* starting new GP. */
+ struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */
+ /* need a grace period. */
+ struct rcu_head **orphan_nxttail; /* Tail of above. */
+ struct rcu_head *orphan_donelist; /* Orphaned callbacks that */
+ /* are ready to invoke. */
+ struct rcu_head **orphan_donetail; /* Tail of above. */
+ long qlen_lazy; /* Number of lazy callbacks. */
+ long qlen; /* Total number of callbacks. */
+ struct task_struct *rcu_barrier_in_progress;
+ /* Task doing rcu_barrier(), */
+ /* or NULL if no barrier. */
raw_spinlock_t fqslock; /* Only one task forcing */
/* quiescent states. */
unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -423,7 +430,6 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
/* Forward declarations for rcutree_plugin.h */
static void rcu_bootup_announce(void);
long rcu_batches_completed(void);
-static void rcu_preempt_note_context_switch(int cpu);
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
@@ -471,6 +477,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu);
static void rcu_prepare_for_idle_init(int cpu);
static void rcu_cleanup_after_idle(int cpu);
static void rcu_prepare_for_idle(int cpu);
+static void rcu_idle_count_callbacks_posted(void);
static void print_cpu_stall_info_begin(void);
static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
static void print_cpu_stall_info_end(void);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c023464816be..2411000d9869 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu)
*
* Caller must disable preemption.
*/
-static void rcu_preempt_note_context_switch(int cpu)
+void rcu_preempt_note_context_switch(void)
{
struct task_struct *t = current;
unsigned long flags;
@@ -164,7 +164,7 @@ static void rcu_preempt_note_context_switch(int cpu)
(t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
/* Possibly blocking in an RCU read-side critical section. */
- rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
+ rdp = __this_cpu_ptr(rcu_preempt_state.rda);
rnp = rdp->mynode;
raw_spin_lock_irqsave(&rnp->lock, flags);
t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -228,7 +228,7 @@ static void rcu_preempt_note_context_switch(int cpu)
* means that we continue to block the current grace period.
*/
local_irq_save(flags);
- rcu_preempt_qs(cpu);
+ rcu_preempt_qs(smp_processor_id());
local_irq_restore(flags);
}
@@ -969,22 +969,6 @@ static void __init __rcu_init_preempt(void)
rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
}
-/*
- * Check for a task exiting while in a preemptible-RCU read-side
- * critical section, clean up if so. No need to issue warnings,
- * as debug_check_no_locks_held() already does this if lockdep
- * is enabled.
- */
-void exit_rcu(void)
-{
- struct task_struct *t = current;
-
- if (t->rcu_read_lock_nesting == 0)
- return;
- t->rcu_read_lock_nesting = 1;
- __rcu_read_unlock();
-}
-
#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
static struct rcu_state *rcu_state = &rcu_sched_state;
@@ -1018,14 +1002,6 @@ void rcu_force_quiescent_state(void)
EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
/*
- * Because preemptible RCU does not exist, we never have to check for
- * CPUs being in quiescent states.
- */
-static void rcu_preempt_note_context_switch(int cpu)
-{
-}
-
-/*
* Because preemptible RCU does not exist, there are never any preempted
* RCU readers.
*/
@@ -1938,6 +1914,14 @@ static void rcu_prepare_for_idle(int cpu)
{
}
+/*
+ * Don't bother keeping a running count of the number of RCU callbacks
+ * posted because CONFIG_RCU_FAST_NO_HZ=n.
+ */
+static void rcu_idle_count_callbacks_posted(void)
+{
+}
+
#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
/*
@@ -1978,11 +1962,20 @@ static void rcu_prepare_for_idle(int cpu)
#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
+/* Loop counter for rcu_prepare_for_idle(). */
static DEFINE_PER_CPU(int, rcu_dyntick_drain);
+/* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */
static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
-static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer);
-static ktime_t rcu_idle_gp_wait; /* If some non-lazy callbacks. */
-static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */
+/* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */
+static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer);
+/* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */
+static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires);
+/* Enable special processing on first attempt to enter dyntick-idle mode. */
+static DEFINE_PER_CPU(bool, rcu_idle_first_pass);
+/* Running count of non-lazy callbacks posted, never decremented. */
+static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted);
+/* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */
+static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap);
/*
* Allow the CPU to enter dyntick-idle mode if either: (1) There are no
@@ -1995,6 +1988,8 @@ static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */
*/
int rcu_needs_cpu(int cpu)
{
+ /* Flag a new idle sojourn to the idle-entry state machine. */
+ per_cpu(rcu_idle_first_pass, cpu) = 1;
/* If no callbacks, RCU doesn't need the CPU. */
if (!rcu_cpu_has_callbacks(cpu))
return 0;
@@ -2045,16 +2040,34 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
}
/*
+ * Handler for smp_call_function_single(). The only point of this
+ * handler is to wake the CPU up, so the handler does only tracing.
+ */
+void rcu_idle_demigrate(void *unused)
+{
+ trace_rcu_prep_idle("Demigrate");
+}
+
+/*
* Timer handler used to force CPU to start pushing its remaining RCU
* callbacks in the case where it entered dyntick-idle mode with callbacks
* pending. The hander doesn't really need to do anything because the
* real work is done upon re-entry to idle, or by the next scheduling-clock
* interrupt should idle not be re-entered.
+ *
+ * One special case: the timer gets migrated without awakening the CPU
+ * on which the timer was scheduled on. In this case, we must wake up
+ * that CPU. We do so with smp_call_function_single().
*/
-static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp)
+static void rcu_idle_gp_timer_func(unsigned long cpu_in)
{
+ int cpu = (int)cpu_in;
+
trace_rcu_prep_idle("Timer");
- return HRTIMER_NORESTART;
+ if (cpu != smp_processor_id())
+ smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0);
+ else
+ WARN_ON_ONCE(1); /* Getting here can hang the system... */
}
/*
@@ -2062,19 +2075,11 @@ static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp)
*/
static void rcu_prepare_for_idle_init(int cpu)
{
- static int firsttime = 1;
- struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu);
-
- hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- hrtp->function = rcu_idle_gp_timer_func;
- if (firsttime) {
- unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY);
-
- rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000);
- upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY);
- rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000);
- firsttime = 0;
- }
+ per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
+ setup_timer(&per_cpu(rcu_idle_gp_timer, cpu),
+ rcu_idle_gp_timer_func, cpu);
+ per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1;
+ per_cpu(rcu_idle_first_pass, cpu) = 1;
}
/*
@@ -2084,7 +2089,8 @@ static void rcu_prepare_for_idle_init(int cpu)
*/
static void rcu_cleanup_after_idle(int cpu)
{
- hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu));
+ del_timer(&per_cpu(rcu_idle_gp_timer, cpu));
+ trace_rcu_prep_idle("Cleanup after idle");
}
/*
@@ -2108,6 +2114,29 @@ static void rcu_cleanup_after_idle(int cpu)
*/
static void rcu_prepare_for_idle(int cpu)
{
+ struct timer_list *tp;
+
+ /*
+ * If this is an idle re-entry, for example, due to use of
+ * RCU_NONIDLE() or the new idle-loop tracing API within the idle
+ * loop, then don't take any state-machine actions, unless the
+ * momentary exit from idle queued additional non-lazy callbacks.
+ * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks
+ * pending.
+ */
+ if (!per_cpu(rcu_idle_first_pass, cpu) &&
+ (per_cpu(rcu_nonlazy_posted, cpu) ==
+ per_cpu(rcu_nonlazy_posted_snap, cpu))) {
+ if (rcu_cpu_has_callbacks(cpu)) {
+ tp = &per_cpu(rcu_idle_gp_timer, cpu);
+ mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu));
+ }
+ return;
+ }
+ per_cpu(rcu_idle_first_pass, cpu) = 0;
+ per_cpu(rcu_nonlazy_posted_snap, cpu) =
+ per_cpu(rcu_nonlazy_posted, cpu) - 1;
+
/*
* If there are no callbacks on this CPU, enter dyntick-idle mode.
* Also reset state to avoid prejudicing later attempts.
@@ -2140,11 +2169,15 @@ static void rcu_prepare_for_idle(int cpu)
per_cpu(rcu_dyntick_drain, cpu) = 0;
per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
if (rcu_cpu_has_nonlazy_callbacks(cpu))
- hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu),
- rcu_idle_gp_wait, HRTIMER_MODE_REL);
+ per_cpu(rcu_idle_gp_timer_expires, cpu) =
+ jiffies + RCU_IDLE_GP_DELAY;
else
- hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu),
- rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL);
+ per_cpu(rcu_idle_gp_timer_expires, cpu) =
+ jiffies + RCU_IDLE_LAZY_GP_DELAY;
+ tp = &per_cpu(rcu_idle_gp_timer, cpu);
+ mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu));
+ per_cpu(rcu_nonlazy_posted_snap, cpu) =
+ per_cpu(rcu_nonlazy_posted, cpu);
return; /* Nothing more to do immediately. */
} else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
/* We have hit the limit, so time to give up. */
@@ -2184,6 +2217,19 @@ static void rcu_prepare_for_idle(int cpu)
trace_rcu_prep_idle("Callbacks drained");
}
+/*
+ * Keep a running count of the number of non-lazy callbacks posted
+ * on this CPU. This running counter (which is never decremented) allows
+ * rcu_prepare_for_idle() to detect when something out of the idle loop
+ * posts a callback, even if an equal number of callbacks are invoked.
+ * Of course, callbacks should only be posted from within a trace event
+ * designed to be called from idle or from within RCU_NONIDLE().
+ */
+static void rcu_idle_count_callbacks_posted(void)
+{
+ __this_cpu_add(rcu_nonlazy_posted, 1);
+}
+
#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
#ifdef CONFIG_RCU_CPU_STALL_INFO
@@ -2192,14 +2238,12 @@ static void rcu_prepare_for_idle(int cpu)
static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
{
- struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu);
+ struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu);
- sprintf(cp, "drain=%d %c timer=%lld",
+ sprintf(cp, "drain=%d %c timer=%lu",
per_cpu(rcu_dyntick_drain, cpu),
per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.',
- hrtimer_active(hrtp)
- ? ktime_to_us(hrtimer_get_remaining(hrtp))
- : -1);
+ timer_pending(tltp) ? tltp->expires - jiffies : -1);
}
#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index ed459edeff43..d4bc16ddd1d4 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -271,13 +271,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
gpnum = rsp->gpnum;
seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
- "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
+ "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
rsp->completed, gpnum, rsp->fqs_state,
(long)(rsp->jiffies_force_qs - jiffies),
(int)(jiffies & 0xffff),
rsp->n_force_qs, rsp->n_force_qs_ngp,
rsp->n_force_qs - rsp->n_force_qs_ngp,
- rsp->n_force_qs_lh);
+ rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen);
for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
if (rnp->level != level) {
seq_puts(m, "\n");
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4603b9d8f30a..5d89eb93f7e4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2083,6 +2083,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
#endif
/* Here we just switch the register state and the stack. */
+ rcu_switch_from(prev);
switch_to(prev, next, prev);
barrier();
diff --git a/kernel/timer.c b/kernel/timer.c
index a297ffcf888e..837c552fe838 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -861,7 +861,13 @@ EXPORT_SYMBOL(mod_timer);
*
* mod_timer_pinned() is a way to update the expire field of an
* active timer (if the timer is inactive it will be activated)
- * and not allow the timer to be migrated to a different CPU.
+ * and to ensure that the timer is scheduled on the current CPU.
+ *
+ * Note that this does not prevent the timer from being migrated
+ * when the current CPU goes offline. If this is a problem for
+ * you, use CPU-hotplug notifiers to handle it correctly, for
+ * example, cancelling the timer when the corresponding CPU goes
+ * offline.
*
* mod_timer_pinned(timer, expires) is equivalent to:
*