summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2011-05-20 03:14:34 +0200
committerLinus Torvalds <torvalds@linux-foundation.org>2011-05-20 03:14:34 +0200
commiteb04f2f04ed1227c266b3219c0aaeda525639718 (patch)
tree7f224483a3cd0e439cd64a8666ec9dc5ed178a3d /kernel
parentMerge branch 'x86-smep-for-linus' of git://git.kernel.org/pub/scm/linux/kerne... (diff)
parentRevert "rcu: Decrease memory-barrier usage based on semi-formal proof" (diff)
downloadlinux-eb04f2f04ed1227c266b3219c0aaeda525639718.tar.xz
linux-eb04f2f04ed1227c266b3219c0aaeda525639718.zip
Merge branch 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (78 commits) Revert "rcu: Decrease memory-barrier usage based on semi-formal proof" net,rcu: convert call_rcu(prl_entry_destroy_rcu) to kfree batman,rcu: convert call_rcu(softif_neigh_free_rcu) to kfree_rcu batman,rcu: convert call_rcu(neigh_node_free_rcu) to kfree() batman,rcu: convert call_rcu(gw_node_free_rcu) to kfree_rcu net,rcu: convert call_rcu(kfree_tid_tx) to kfree_rcu() net,rcu: convert call_rcu(xt_osf_finger_free_rcu) to kfree_rcu() net/mac80211,rcu: convert call_rcu(work_free_rcu) to kfree_rcu() net,rcu: convert call_rcu(wq_free_rcu) to kfree_rcu() net,rcu: convert call_rcu(phonet_device_rcu_free) to kfree_rcu() perf,rcu: convert call_rcu(swevent_hlist_release_rcu) to kfree_rcu() perf,rcu: convert call_rcu(free_ctx) to kfree_rcu() net,rcu: convert call_rcu(__nf_ct_ext_free_rcu) to kfree_rcu() net,rcu: convert call_rcu(net_generic_release) to kfree_rcu() net,rcu: convert call_rcu(netlbl_unlhsh_free_addr6) to kfree_rcu() net,rcu: convert call_rcu(netlbl_unlhsh_free_addr4) to kfree_rcu() security,rcu: convert call_rcu(sel_netif_free) to kfree_rcu() net,rcu: convert call_rcu(xps_dev_maps_release) to kfree_rcu() net,rcu: convert call_rcu(xps_map_release) to kfree_rcu() net,rcu: convert call_rcu(rps_map_release) to kfree_rcu() ...
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cgroup.c27
-rw-r--r--kernel/events/core.c20
-rw-r--r--kernel/rcupdate.c32
-rw-r--r--kernel/rcutiny.c45
-rw-r--r--kernel/rcutiny_plugin.h203
-rw-r--r--kernel/rcutorture.c26
-rw-r--r--kernel/rcutree.c526
-rw-r--r--kernel/rcutree.h104
-rw-r--r--kernel/rcutree_plugin.h568
-rw-r--r--kernel/rcutree_trace.c180
-rw-r--r--kernel/softirq.c2
11 files changed, 1333 insertions, 400 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 25c7eb52de1a..909a35510af5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -326,12 +326,6 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
return &css_set_table[index];
}
-static void free_css_set_rcu(struct rcu_head *obj)
-{
- struct css_set *cg = container_of(obj, struct css_set, rcu_head);
- kfree(cg);
-}
-
/* We don't maintain the lists running through each css_set to its
* task until after the first call to cgroup_iter_start(). This
* reduces the fork()/exit() overhead for people who have cgroups
@@ -375,7 +369,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
}
write_unlock(&css_set_lock);
- call_rcu(&cg->rcu_head, free_css_set_rcu);
+ kfree_rcu(cg, rcu_head);
}
/*
@@ -812,13 +806,6 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
return ret;
}
-static void free_cgroup_rcu(struct rcu_head *obj)
-{
- struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
-
- kfree(cgrp);
-}
-
static void cgroup_diput(struct dentry *dentry, struct inode *inode)
{
/* is dentry a directory ? if so, kfree() associated cgroup */
@@ -856,7 +843,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
*/
BUG_ON(!list_empty(&cgrp->pidlists));
- call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
+ kfree_rcu(cgrp, rcu_head);
}
iput(inode);
}
@@ -4623,14 +4610,6 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
return ret;
}
-static void __free_css_id_cb(struct rcu_head *head)
-{
- struct css_id *id;
-
- id = container_of(head, struct css_id, rcu_head);
- kfree(id);
-}
-
void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
{
struct css_id *id = css->id;
@@ -4645,7 +4624,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
spin_lock(&ss->id_lock);
idr_remove(&ss->idr, id->id);
spin_unlock(&ss->id_lock);
- call_rcu(&id->rcu_head, __free_css_id_cb);
+ kfree_rcu(id, rcu_head);
}
EXPORT_SYMBOL_GPL(free_css_id);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0fc34a370ba4..c09767f7db3e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -586,14 +586,6 @@ static void get_ctx(struct perf_event_context *ctx)
WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
}
-static void free_ctx(struct rcu_head *head)
-{
- struct perf_event_context *ctx;
-
- ctx = container_of(head, struct perf_event_context, rcu_head);
- kfree(ctx);
-}
-
static void put_ctx(struct perf_event_context *ctx)
{
if (atomic_dec_and_test(&ctx->refcount)) {
@@ -601,7 +593,7 @@ static void put_ctx(struct perf_event_context *ctx)
put_ctx(ctx->parent_ctx);
if (ctx->task)
put_task_struct(ctx->task);
- call_rcu(&ctx->rcu_head, free_ctx);
+ kfree_rcu(ctx, rcu_head);
}
}
@@ -5331,14 +5323,6 @@ swevent_hlist_deref(struct swevent_htable *swhash)
lockdep_is_held(&swhash->hlist_mutex));
}
-static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
-{
- struct swevent_hlist *hlist;
-
- hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
- kfree(hlist);
-}
-
static void swevent_hlist_release(struct swevent_htable *swhash)
{
struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
@@ -5347,7 +5331,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash)
return;
rcu_assign_pointer(swhash->swevent_hlist, NULL);
- call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
+ kfree_rcu(hlist, rcu_head);
}
static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f3240e987928..7784bd216b6a 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -142,10 +142,17 @@ static int rcuhead_fixup_init(void *addr, enum debug_obj_state state)
* Ensure that queued callbacks are all executed.
* If we detect that we are nested in a RCU read-side critical
* section, we should simply fail, otherwise we would deadlock.
+ * In !PREEMPT configurations, there is no way to tell if we are
+ * in a RCU read-side critical section or not, so we never
+ * attempt any fixup and just print a warning.
*/
+#ifndef CONFIG_PREEMPT
+ WARN_ON_ONCE(1);
+ return 0;
+#endif
if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
irqs_disabled()) {
- WARN_ON(1);
+ WARN_ON_ONCE(1);
return 0;
}
rcu_barrier();
@@ -184,10 +191,17 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
* Ensure that queued callbacks are all executed.
* If we detect that we are nested in a RCU read-side critical
* section, we should simply fail, otherwise we would deadlock.
+ * In !PREEMPT configurations, there is no way to tell if we are
+ * in a RCU read-side critical section or not, so we never
+ * attempt any fixup and just print a warning.
*/
+#ifndef CONFIG_PREEMPT
+ WARN_ON_ONCE(1);
+ return 0;
+#endif
if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
irqs_disabled()) {
- WARN_ON(1);
+ WARN_ON_ONCE(1);
return 0;
}
rcu_barrier();
@@ -214,15 +228,17 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
* Ensure that queued callbacks are all executed.
* If we detect that we are nested in a RCU read-side critical
* section, we should simply fail, otherwise we would deadlock.
- * Note that the machinery to reliably determine whether
- * or not we are in an RCU read-side critical section
- * exists only in the preemptible RCU implementations
- * (TINY_PREEMPT_RCU and TREE_PREEMPT_RCU), which is why
- * DEBUG_OBJECTS_RCU_HEAD is disallowed if !PREEMPT.
+ * In !PREEMPT configurations, there is no way to tell if we are
+ * in a RCU read-side critical section or not, so we never
+ * attempt any fixup and just print a warning.
*/
+#ifndef CONFIG_PREEMPT
+ WARN_ON_ONCE(1);
+ return 0;
+#endif
if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
irqs_disabled()) {
- WARN_ON(1);
+ WARN_ON_ONCE(1);
return 0;
}
rcu_barrier();
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 0c343b9a46d5..421abfd3641d 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -40,10 +40,10 @@
static struct task_struct *rcu_kthread_task;
static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
static unsigned long have_rcu_kthread_work;
-static void invoke_rcu_kthread(void);
/* Forward declarations for rcutiny_plugin.h. */
struct rcu_ctrlblk;
+static void invoke_rcu_kthread(void);
static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
static int rcu_kthread(void *arg);
static void __call_rcu(struct rcu_head *head,
@@ -79,36 +79,45 @@ void rcu_exit_nohz(void)
#endif /* #ifdef CONFIG_NO_HZ */
/*
- * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc().
- * Also disable irqs to avoid confusion due to interrupt handlers
+ * Helper function for rcu_sched_qs() and rcu_bh_qs().
+ * Also irqs are disabled to avoid confusion due to interrupt handlers
* invoking call_rcu().
*/
static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
{
- unsigned long flags;
-
- local_irq_save(flags);
if (rcp->rcucblist != NULL &&
rcp->donetail != rcp->curtail) {
rcp->donetail = rcp->curtail;
- local_irq_restore(flags);
return 1;
}
- local_irq_restore(flags);
return 0;
}
/*
+ * Wake up rcu_kthread() to process callbacks now eligible for invocation
+ * or to boost readers.
+ */
+static void invoke_rcu_kthread(void)
+{
+ have_rcu_kthread_work = 1;
+ wake_up(&rcu_kthread_wq);
+}
+
+/*
* Record an rcu quiescent state. And an rcu_bh quiescent state while we
* are at it, given that any rcu quiescent state is also an rcu_bh
* quiescent state. Use "+" instead of "||" to defeat short circuiting.
*/
void rcu_sched_qs(int cpu)
{
+ unsigned long flags;
+
+ local_irq_save(flags);
if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
rcu_qsctr_help(&rcu_bh_ctrlblk))
invoke_rcu_kthread();
+ local_irq_restore(flags);
}
/*
@@ -116,8 +125,12 @@ void rcu_sched_qs(int cpu)
*/
void rcu_bh_qs(int cpu)
{
+ unsigned long flags;
+
+ local_irq_save(flags);
if (rcu_qsctr_help(&rcu_bh_ctrlblk))
invoke_rcu_kthread();
+ local_irq_restore(flags);
}
/*
@@ -167,7 +180,7 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
prefetch(next);
debug_rcu_head_unqueue(list);
local_bh_disable();
- list->func(list);
+ __rcu_reclaim(list);
local_bh_enable();
list = next;
RCU_TRACE(cb_count++);
@@ -208,20 +221,6 @@ static int rcu_kthread(void *arg)
}
/*
- * Wake up rcu_kthread() to process callbacks now eligible for invocation
- * or to boost readers.
- */
-static void invoke_rcu_kthread(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- have_rcu_kthread_work = 1;
- wake_up(&rcu_kthread_wq);
- local_irq_restore(flags);
-}
-
-/*
* Wait for a grace period to elapse. But it is illegal to invoke
* synchronize_sched() from within an RCU read-side critical section.
* Therefore, any legal call to synchronize_sched() is a quiescent
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 3cb8e362e883..f259c676195f 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -100,23 +100,28 @@ struct rcu_preempt_ctrlblk {
u8 completed; /* Last grace period completed. */
/* If all three are equal, RCU is idle. */
#ifdef CONFIG_RCU_BOOST
- s8 boosted_this_gp; /* Has boosting already happened? */
unsigned long boost_time; /* When to start boosting (jiffies) */
#endif /* #ifdef CONFIG_RCU_BOOST */
#ifdef CONFIG_RCU_TRACE
unsigned long n_grace_periods;
#ifdef CONFIG_RCU_BOOST
unsigned long n_tasks_boosted;
+ /* Total number of tasks boosted. */
unsigned long n_exp_boosts;
+ /* Number of tasks boosted for expedited GP. */
unsigned long n_normal_boosts;
- unsigned long n_normal_balk_blkd_tasks;
- unsigned long n_normal_balk_gp_tasks;
- unsigned long n_normal_balk_boost_tasks;
- unsigned long n_normal_balk_boosted;
- unsigned long n_normal_balk_notyet;
- unsigned long n_normal_balk_nos;
- unsigned long n_exp_balk_blkd_tasks;
- unsigned long n_exp_balk_nos;
+ /* Number of tasks boosted for normal GP. */
+ unsigned long n_balk_blkd_tasks;
+ /* Refused to boost: no blocked tasks. */
+ unsigned long n_balk_exp_gp_tasks;
+ /* Refused to boost: nothing blocking GP. */
+ unsigned long n_balk_boost_tasks;
+ /* Refused to boost: already boosting. */
+ unsigned long n_balk_notyet;
+ /* Refused to boost: not yet time. */
+ unsigned long n_balk_nos;
+ /* Refused to boost: not sure why, though. */
+ /* This can happen due to race conditions. */
#endif /* #ifdef CONFIG_RCU_BOOST */
#endif /* #ifdef CONFIG_RCU_TRACE */
};
@@ -201,7 +206,6 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t)
#ifdef CONFIG_RCU_BOOST
static void rcu_initiate_boost_trace(void);
-static void rcu_initiate_exp_boost_trace(void);
#endif /* #ifdef CONFIG_RCU_BOOST */
/*
@@ -219,41 +223,21 @@ static void show_tiny_preempt_stats(struct seq_file *m)
"N."[!rcu_preempt_ctrlblk.gp_tasks],
"E."[!rcu_preempt_ctrlblk.exp_tasks]);
#ifdef CONFIG_RCU_BOOST
- seq_printf(m, " ttb=%c btg=",
- "B."[!rcu_preempt_ctrlblk.boost_tasks]);
- switch (rcu_preempt_ctrlblk.boosted_this_gp) {
- case -1:
- seq_puts(m, "exp");
- break;
- case 0:
- seq_puts(m, "no");
- break;
- case 1:
- seq_puts(m, "begun");
- break;
- case 2:
- seq_puts(m, "done");
- break;
- default:
- seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp);
- }
- seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
+ seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
+ " ",
+ "B."[!rcu_preempt_ctrlblk.boost_tasks],
rcu_preempt_ctrlblk.n_tasks_boosted,
rcu_preempt_ctrlblk.n_exp_boosts,
rcu_preempt_ctrlblk.n_normal_boosts,
(int)(jiffies & 0xffff),
(int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
- seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n",
- "normal balk",
- rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks,
- rcu_preempt_ctrlblk.n_normal_balk_gp_tasks,
- rcu_preempt_ctrlblk.n_normal_balk_boost_tasks,
- rcu_preempt_ctrlblk.n_normal_balk_boosted,
- rcu_preempt_ctrlblk.n_normal_balk_notyet,
- rcu_preempt_ctrlblk.n_normal_balk_nos);
- seq_printf(m, " exp balk: bt=%lu nos=%lu\n",
- rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks,
- rcu_preempt_ctrlblk.n_exp_balk_nos);
+ seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n",
+ " balk",
+ rcu_preempt_ctrlblk.n_balk_blkd_tasks,
+ rcu_preempt_ctrlblk.n_balk_exp_gp_tasks,
+ rcu_preempt_ctrlblk.n_balk_boost_tasks,
+ rcu_preempt_ctrlblk.n_balk_notyet,
+ rcu_preempt_ctrlblk.n_balk_nos);
#endif /* #ifdef CONFIG_RCU_BOOST */
}
@@ -271,25 +255,59 @@ static int rcu_boost(void)
{
unsigned long flags;
struct rt_mutex mtx;
- struct list_head *np;
struct task_struct *t;
+ struct list_head *tb;
- if (rcu_preempt_ctrlblk.boost_tasks == NULL)
+ if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
+ rcu_preempt_ctrlblk.exp_tasks == NULL)
return 0; /* Nothing to boost. */
+
raw_local_irq_save(flags);
- rcu_preempt_ctrlblk.boosted_this_gp++;
- t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct,
- rcu_node_entry);
- np = rcu_next_node_entry(t);
+
+ /*
+ * Recheck with irqs disabled: all tasks in need of boosting
+ * might exit their RCU read-side critical sections on their own
+ * if we are preempted just before disabling irqs.
+ */
+ if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
+ rcu_preempt_ctrlblk.exp_tasks == NULL) {
+ raw_local_irq_restore(flags);
+ return 0;
+ }
+
+ /*
+ * Preferentially boost tasks blocking expedited grace periods.
+ * This cannot starve the normal grace periods because a second
+ * expedited grace period must boost all blocked tasks, including
+ * those blocking the pre-existing normal grace period.
+ */
+ if (rcu_preempt_ctrlblk.exp_tasks != NULL) {
+ tb = rcu_preempt_ctrlblk.exp_tasks;
+ RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
+ } else {
+ tb = rcu_preempt_ctrlblk.boost_tasks;
+ RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
+ }
+ RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
+
+ /*
+ * We boost task t by manufacturing an rt_mutex that appears to
+ * be held by task t. We leave a pointer to that rt_mutex where
+ * task t can find it, and task t will release the mutex when it
+ * exits its outermost RCU read-side critical section. Then
+ * simply acquiring this artificial rt_mutex will boost task
+ * t's priority. (Thanks to tglx for suggesting this approach!)
+ */
+ t = container_of(tb, struct task_struct, rcu_node_entry);
rt_mutex_init_proxy_locked(&mtx, t);
t->rcu_boost_mutex = &mtx;
t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
raw_local_irq_restore(flags);
rt_mutex_lock(&mtx);
- RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
- rcu_preempt_ctrlblk.boosted_this_gp++;
- rt_mutex_unlock(&mtx);
- return rcu_preempt_ctrlblk.boost_tasks != NULL;
+ rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
+
+ return rcu_preempt_ctrlblk.boost_tasks != NULL ||
+ rcu_preempt_ctrlblk.exp_tasks != NULL;
}
/*
@@ -304,42 +322,25 @@ static int rcu_boost(void)
*/
static int rcu_initiate_boost(void)
{
- if (!rcu_preempt_blocked_readers_cgp()) {
- RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++);
+ if (!rcu_preempt_blocked_readers_cgp() &&
+ rcu_preempt_ctrlblk.exp_tasks == NULL) {
+ RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++);
return 0;
}
- if (rcu_preempt_ctrlblk.gp_tasks != NULL &&
- rcu_preempt_ctrlblk.boost_tasks == NULL &&
- rcu_preempt_ctrlblk.boosted_this_gp == 0 &&
- ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) {
- rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks;
+ if (rcu_preempt_ctrlblk.exp_tasks != NULL ||
+ (rcu_preempt_ctrlblk.gp_tasks != NULL &&
+ rcu_preempt_ctrlblk.boost_tasks == NULL &&
+ ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) {
+ if (rcu_preempt_ctrlblk.exp_tasks == NULL)
+ rcu_preempt_ctrlblk.boost_tasks =
+ rcu_preempt_ctrlblk.gp_tasks;
invoke_rcu_kthread();
- RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
} else
RCU_TRACE(rcu_initiate_boost_trace());
return 1;
}
-/*
- * Initiate boosting for an expedited grace period.
- */
-static void rcu_initiate_expedited_boost(void)
-{
- unsigned long flags;
-
- raw_local_irq_save(flags);
- if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) {
- rcu_preempt_ctrlblk.boost_tasks =
- rcu_preempt_ctrlblk.blkd_tasks.next;
- rcu_preempt_ctrlblk.boosted_this_gp = -1;
- invoke_rcu_kthread();
- RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
- } else
- RCU_TRACE(rcu_initiate_exp_boost_trace());
- raw_local_irq_restore(flags);
-}
-
-#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000);
+#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
/*
* Do priority-boost accounting for the start of a new grace period.
@@ -347,8 +348,6 @@ static void rcu_initiate_expedited_boost(void)
static void rcu_preempt_boost_start_gp(void)
{
rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
- if (rcu_preempt_ctrlblk.boosted_this_gp > 0)
- rcu_preempt_ctrlblk.boosted_this_gp = 0;
}
#else /* #ifdef CONFIG_RCU_BOOST */
@@ -372,13 +371,6 @@ static int rcu_initiate_boost(void)
}
/*
- * If there is no RCU priority boosting, we don't initiate expedited boosting.
- */
-static void rcu_initiate_expedited_boost(void)
-{
-}
-
-/*
* If there is no RCU priority boosting, nothing to do at grace-period start.
*/
static void rcu_preempt_boost_start_gp(void)
@@ -418,7 +410,7 @@ static void rcu_preempt_cpu_qs(void)
if (!rcu_preempt_gp_in_progress())
return;
/*
- * Check up on boosting. If there are no readers blocking the
+ * Check up on boosting. If there are readers blocking the
* current grace period, leave.
*/
if (rcu_initiate_boost())
@@ -578,7 +570,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
empty = !rcu_preempt_blocked_readers_cgp();
empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
np = rcu_next_node_entry(t);
- list_del(&t->rcu_node_entry);
+ list_del_init(&t->rcu_node_entry);
if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
rcu_preempt_ctrlblk.gp_tasks = np;
if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
@@ -587,7 +579,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
rcu_preempt_ctrlblk.boost_tasks = np;
#endif /* #ifdef CONFIG_RCU_BOOST */
- INIT_LIST_HEAD(&t->rcu_node_entry);
/*
* If this was the last task on the current list, and if
@@ -812,13 +803,16 @@ void synchronize_rcu_expedited(void)
rpcp->exp_tasks = rpcp->blkd_tasks.next;
if (rpcp->exp_tasks == &rpcp->blkd_tasks)
rpcp->exp_tasks = NULL;
- local_irq_restore(flags);
/* Wait for tail of ->blkd_tasks list to drain. */
- if (rcu_preempted_readers_exp())
- rcu_initiate_expedited_boost();
+ if (!rcu_preempted_readers_exp())
+ local_irq_restore(flags);
+ else {
+ rcu_initiate_boost();
+ local_irq_restore(flags);
wait_event(sync_rcu_preempt_exp_wq,
!rcu_preempted_readers_exp());
+ }
/* Clean up and exit. */
barrier(); /* ensure expedited GP seen before counter increment. */
@@ -931,24 +925,17 @@ void __init rcu_scheduler_starting(void)
static void rcu_initiate_boost_trace(void)
{
- if (rcu_preempt_ctrlblk.gp_tasks == NULL)
- rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++;
+ if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
+ rcu_preempt_ctrlblk.n_balk_blkd_tasks++;
+ else if (rcu_preempt_ctrlblk.gp_tasks == NULL &&
+ rcu_preempt_ctrlblk.exp_tasks == NULL)
+ rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++;
else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
- rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++;
- else if (rcu_preempt_ctrlblk.boosted_this_gp != 0)
- rcu_preempt_ctrlblk.n_normal_balk_boosted++;
+ rcu_preempt_ctrlblk.n_balk_boost_tasks++;
else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
- rcu_preempt_ctrlblk.n_normal_balk_notyet++;
- else
- rcu_preempt_ctrlblk.n_normal_balk_nos++;
-}
-
-static void rcu_initiate_exp_boost_trace(void)
-{
- if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
- rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++;
+ rcu_preempt_ctrlblk.n_balk_notyet++;
else
- rcu_preempt_ctrlblk.n_exp_balk_nos++;
+ rcu_preempt_ctrlblk.n_balk_nos++;
}
#endif /* #ifdef CONFIG_RCU_BOOST */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index c224da41890c..2e138db03382 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -131,7 +131,7 @@ struct rcu_torture {
static LIST_HEAD(rcu_torture_freelist);
static struct rcu_torture __rcu *rcu_torture_current;
-static long rcu_torture_current_version;
+static unsigned long rcu_torture_current_version;
static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
static DEFINE_SPINLOCK(rcu_torture_lock);
static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
@@ -146,8 +146,6 @@ static atomic_t n_rcu_torture_mberror;
static atomic_t n_rcu_torture_error;
static long n_rcu_torture_boost_ktrerror;
static long n_rcu_torture_boost_rterror;
-static long n_rcu_torture_boost_allocerror;
-static long n_rcu_torture_boost_afferror;
static long n_rcu_torture_boost_failure;
static long n_rcu_torture_boosts;
static long n_rcu_torture_timers;
@@ -163,11 +161,11 @@ static int stutter_pause_test;
#endif
int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
-#ifdef CONFIG_RCU_BOOST
+#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
#define rcu_can_boost() 1
-#else /* #ifdef CONFIG_RCU_BOOST */
+#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
#define rcu_can_boost() 0
-#endif /* #else #ifdef CONFIG_RCU_BOOST */
+#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
static unsigned long boost_starttime; /* jiffies of next boost test start. */
DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
@@ -751,6 +749,7 @@ static int rcu_torture_boost(void *arg)
n_rcu_torture_boost_rterror++;
}
+ init_rcu_head_on_stack(&rbi.rcu);
/* Each pass through the following loop does one boost-test cycle. */
do {
/* Wait for the next test interval. */
@@ -810,6 +809,7 @@ checkwait: rcu_stutter_wait("rcu_torture_boost");
/* Clean up and exit. */
VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
+ destroy_rcu_head_on_stack(&rbi.rcu);
rcutorture_shutdown_absorb("rcu_torture_boost");
while (!kthread_should_stop() || rbi.inflight)
schedule_timeout_uninterruptible(1);
@@ -886,7 +886,7 @@ rcu_torture_writer(void *arg)
old_rp->rtort_pipe_count++;
cur_ops->deferred_free(old_rp);
}
- rcu_torture_current_version++;
+ rcutorture_record_progress(++rcu_torture_current_version);
oldbatch = cur_ops->completed();
rcu_stutter_wait("rcu_torture_writer");
} while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
@@ -1066,8 +1066,8 @@ rcu_torture_printk(char *page)
}
cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
cnt += sprintf(&page[cnt],
- "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
- "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld "
+ "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
+ "rtmbe: %d rtbke: %ld rtbre: %ld "
"rtbf: %ld rtb: %ld nt: %ld",
rcu_torture_current,
rcu_torture_current_version,
@@ -1078,16 +1078,12 @@ rcu_torture_printk(char *page)
atomic_read(&n_rcu_torture_mberror),
n_rcu_torture_boost_ktrerror,
n_rcu_torture_boost_rterror,
- n_rcu_torture_boost_allocerror,
- n_rcu_torture_boost_afferror,
n_rcu_torture_boost_failure,
n_rcu_torture_boosts,
n_rcu_torture_timers);
if (atomic_read(&n_rcu_torture_mberror) != 0 ||
n_rcu_torture_boost_ktrerror != 0 ||
n_rcu_torture_boost_rterror != 0 ||
- n_rcu_torture_boost_allocerror != 0 ||
- n_rcu_torture_boost_afferror != 0 ||
n_rcu_torture_boost_failure != 0)
cnt += sprintf(&page[cnt], " !!!");
cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
@@ -1331,6 +1327,7 @@ rcu_torture_cleanup(void)
int i;
mutex_lock(&fullstop_mutex);
+ rcutorture_record_test_transition();
if (fullstop == FULLSTOP_SHUTDOWN) {
printk(KERN_WARNING /* but going down anyway, so... */
"Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
@@ -1486,8 +1483,6 @@ rcu_torture_init(void)
atomic_set(&n_rcu_torture_error, 0);
n_rcu_torture_boost_ktrerror = 0;
n_rcu_torture_boost_rterror = 0;
- n_rcu_torture_boost_allocerror = 0;
- n_rcu_torture_boost_afferror = 0;
n_rcu_torture_boost_failure = 0;
n_rcu_torture_boosts = 0;
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
@@ -1624,6 +1619,7 @@ rcu_torture_init(void)
}
}
register_reboot_notifier(&rcutorture_shutdown_nb);
+ rcutorture_record_test_transition();
mutex_unlock(&fullstop_mutex);
return 0;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index dd4aea806f8e..e486f7c3ffb8 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -47,6 +47,8 @@
#include <linux/mutex.h>
#include <linux/time.h>
#include <linux/kernel_stat.h>
+#include <linux/wait.h>
+#include <linux/kthread.h>
#include "rcutree.h"
@@ -79,10 +81,41 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
+static struct rcu_state *rcu_state;
+
int rcu_scheduler_active __read_mostly;
EXPORT_SYMBOL_GPL(rcu_scheduler_active);
/*
+ * Control variables for per-CPU and per-rcu_node kthreads. These
+ * handle all flavors of RCU.
+ */
+static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
+DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
+DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
+DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
+static DEFINE_PER_CPU(wait_queue_head_t, rcu_cpu_wq);
+DEFINE_PER_CPU(char, rcu_cpu_has_work);
+static char rcu_kthreads_spawnable;
+
+static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
+static void invoke_rcu_cpu_kthread(void);
+
+#define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */
+
+/*
+ * Track the rcutorture test sequence number and the update version
+ * number within a given test. The rcutorture_testseq is incremented
+ * on every rcutorture module load and unload, so has an odd value
+ * when a test is running. The rcutorture_vernum is set to zero
+ * when rcutorture starts and is incremented on each rcutorture update.
+ * These variables enable correlating rcutorture output with the
+ * RCU tracing information.
+ */
+unsigned long rcutorture_testseq;
+unsigned long rcutorture_vernum;
+
+/*
* Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
* permit this function to be invoked without holding the root rcu_node
* structure's ->lock, but of course results can be subject to change.
@@ -124,6 +157,7 @@ void rcu_note_context_switch(int cpu)
rcu_sched_qs(cpu);
rcu_preempt_note_context_switch(cpu);
}
+EXPORT_SYMBOL_GPL(rcu_note_context_switch);
#ifdef CONFIG_NO_HZ
DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
@@ -140,10 +174,8 @@ module_param(blimit, int, 0);
module_param(qhimark, int, 0);
module_param(qlowmark, int, 0);
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
-int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT;
+int rcu_cpu_stall_suppress __read_mostly;
module_param(rcu_cpu_stall_suppress, int, 0644);
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
static int rcu_pending(int cpu);
@@ -176,6 +208,31 @@ void rcu_bh_force_quiescent_state(void)
EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
/*
+ * Record the number of times rcutorture tests have been initiated and
+ * terminated. This information allows the debugfs tracing stats to be
+ * correlated to the rcutorture messages, even when the rcutorture module
+ * is being repeatedly loaded and unloaded. In other words, we cannot
+ * store this state in rcutorture itself.
+ */
+void rcutorture_record_test_transition(void)
+{
+ rcutorture_testseq++;
+ rcutorture_vernum = 0;
+}
+EXPORT_SYMBOL_GPL(rcutorture_record_test_transition);
+
+/*
+ * Record the number of writer passes through the current rcutorture test.
+ * This is also used to correlate debugfs tracing stats with the rcutorture
+ * messages.
+ */
+void rcutorture_record_progress(unsigned long vernum)
+{
+ rcutorture_vernum++;
+}
+EXPORT_SYMBOL_GPL(rcutorture_record_progress);
+
+/*
* Force a quiescent state for RCU-sched.
*/
void rcu_sched_force_quiescent_state(void)
@@ -234,8 +291,8 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
return 1;
}
- /* If preemptable RCU, no point in sending reschedule IPI. */
- if (rdp->preemptable)
+ /* If preemptible RCU, no point in sending reschedule IPI. */
+ if (rdp->preemptible)
return 0;
/* The CPU is online, so send it a reschedule IPI. */
@@ -450,8 +507,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
#endif /* #else #ifdef CONFIG_NO_HZ */
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
-
int rcu_cpu_stall_suppress __read_mostly;
static void record_gp_stall_check_time(struct rcu_state *rsp)
@@ -537,21 +592,24 @@ static void print_cpu_stall(struct rcu_state *rsp)
static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
{
- long delta;
+ unsigned long j;
+ unsigned long js;
struct rcu_node *rnp;
if (rcu_cpu_stall_suppress)
return;
- delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
+ j = ACCESS_ONCE(jiffies);
+ js = ACCESS_ONCE(rsp->jiffies_stall);
rnp = rdp->mynode;
- if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) {
+ if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {
/* We haven't checked in, so go dump stack. */
print_cpu_stall(rsp);
- } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) {
+ } else if (rcu_gp_in_progress(rsp) &&
+ ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
- /* They had two time units to dump stack, so complain. */
+ /* They had a few time units to dump stack, so complain. */
print_other_cpu_stall(rsp);
}
}
@@ -587,26 +645,6 @@ static void __init check_cpu_stall_init(void)
atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
}
-#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-
-static void record_gp_stall_check_time(struct rcu_state *rsp)
-{
-}
-
-static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
-{
-}
-
-void rcu_cpu_stall_reset(void)
-{
-}
-
-static void __init check_cpu_stall_init(void)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-
/*
* Update CPU-local rcu_data state to record the newly noticed grace period.
* This is used both when we started the grace period and when we notice
@@ -809,6 +847,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
rnp->completed = rsp->completed;
rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
rcu_start_gp_per_cpu(rsp, rnp, rdp);
+ rcu_preempt_boost_start_gp(rnp);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
@@ -844,6 +883,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
rnp->completed = rsp->completed;
if (rnp == rdp->mynode)
rcu_start_gp_per_cpu(rsp, rnp, rdp);
+ rcu_preempt_boost_start_gp(rnp);
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
}
@@ -864,7 +904,12 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
__releases(rcu_get_root(rsp)->lock)
{
+ unsigned long gp_duration;
+
WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
+ gp_duration = jiffies - rsp->gp_start;
+ if (gp_duration > rsp->gp_max)
+ rsp->gp_max = gp_duration;
rsp->completed = rsp->gpnum;
rsp->signaled = RCU_GP_IDLE;
rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
@@ -894,7 +939,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
return;
}
rnp->qsmask &= ~mask;
- if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
+ if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
/* Other bits still set at this level, so done. */
raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1037,6 +1082,8 @@ static void rcu_send_cbs_to_online(struct rcu_state *rsp)
/*
* Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
* and move all callbacks from the outgoing CPU to the current one.
+ * There can only be one CPU hotplug operation at a time, so no other
+ * CPU can be attempting to update rcu_cpu_kthread_task.
*/
static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
{
@@ -1045,6 +1092,14 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
int need_report = 0;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp;
+ struct task_struct *t;
+
+ /* Stop the CPU's kthread. */
+ t = per_cpu(rcu_cpu_kthread_task, cpu);
+ if (t != NULL) {
+ per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
+ kthread_stop(t);
+ }
/* Exclude any attempts to start a new grace period. */
raw_spin_lock_irqsave(&rsp->onofflock, flags);
@@ -1082,6 +1137,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
raw_spin_unlock_irqrestore(&rnp->lock, flags);
if (need_report & RCU_OFL_TASKS_EXP_GP)
rcu_report_exp_rnp(rsp, rnp);
+ rcu_node_kthread_setaffinity(rnp, -1);
}
/*
@@ -1143,7 +1199,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
next = list->next;
prefetch(next);
debug_rcu_head_unqueue(list);
- list->func(list);
+ __rcu_reclaim(list);
list = next;
if (++count >= rdp->blimit)
break;
@@ -1179,7 +1235,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
/* Re-raise the RCU softirq if there are callbacks remaining. */
if (cpu_has_callbacks_ready_to_invoke(rdp))
- raise_softirq(RCU_SOFTIRQ);
+ invoke_rcu_cpu_kthread();
}
/*
@@ -1225,7 +1281,7 @@ void rcu_check_callbacks(int cpu, int user)
}
rcu_preempt_check_callbacks(cpu);
if (rcu_pending(cpu))
- raise_softirq(RCU_SOFTIRQ);
+ invoke_rcu_cpu_kthread();
}
#ifdef CONFIG_SMP
@@ -1233,6 +1289,8 @@ void rcu_check_callbacks(int cpu, int user)
/*
* Scan the leaf rcu_node structures, processing dyntick state for any that
* have not yet encountered a quiescent state, using the function specified.
+ * Also initiate boosting for any threads blocked on the root rcu_node.
+ *
* The caller must have suppressed start of new grace periods.
*/
static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
@@ -1251,7 +1309,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
return;
}
if (rnp->qsmask == 0) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
continue;
}
cpu = rnp->grplo;
@@ -1269,6 +1327,11 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
}
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
+ rnp = rcu_get_root(rsp);
+ if (rnp->qsmask == 0) {
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
+ }
}
/*
@@ -1389,7 +1452,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
/*
* Do softirq processing for the current CPU.
*/
-static void rcu_process_callbacks(struct softirq_action *unused)
+static void rcu_process_callbacks(void)
{
/*
* Memory references from any prior RCU read-side critical sections
@@ -1414,6 +1477,347 @@ static void rcu_process_callbacks(struct softirq_action *unused)
rcu_needs_cpu_flush();
}
+/*
+ * Wake up the current CPU's kthread. This replaces raise_softirq()
+ * in earlier versions of RCU. Note that because we are running on
+ * the current CPU with interrupts disabled, the rcu_cpu_kthread_task
+ * cannot disappear out from under us.
+ */
+static void invoke_rcu_cpu_kthread(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __this_cpu_write(rcu_cpu_has_work, 1);
+ if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) {
+ local_irq_restore(flags);
+ return;
+ }
+ wake_up(&__get_cpu_var(rcu_cpu_wq));
+ local_irq_restore(flags);
+}
+
+/*
+ * Wake up the specified per-rcu_node-structure kthread.
+ * Because the per-rcu_node kthreads are immortal, we don't need
+ * to do anything to keep them alive.
+ */
+static void invoke_rcu_node_kthread(struct rcu_node *rnp)
+{
+ struct task_struct *t;
+
+ t = rnp->node_kthread_task;
+ if (t != NULL)
+ wake_up_process(t);
+}
+
+/*
+ * Set the specified CPU's kthread to run RT or not, as specified by
+ * the to_rt argument. The CPU-hotplug locks are held, so the task
+ * is not going away.
+ */
+static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
+{
+ int policy;
+ struct sched_param sp;
+ struct task_struct *t;
+
+ t = per_cpu(rcu_cpu_kthread_task, cpu);
+ if (t == NULL)
+ return;
+ if (to_rt) {
+ policy = SCHED_FIFO;
+ sp.sched_priority = RCU_KTHREAD_PRIO;
+ } else {
+ policy = SCHED_NORMAL;
+ sp.sched_priority = 0;
+ }
+ sched_setscheduler_nocheck(t, policy, &sp);
+}
+
+/*
+ * Timer handler to initiate the waking up of per-CPU kthreads that
+ * have yielded the CPU due to excess numbers of RCU callbacks.
+ * We wake up the per-rcu_node kthread, which in turn will wake up
+ * the booster kthread.
+ */
+static void rcu_cpu_kthread_timer(unsigned long arg)
+{
+ unsigned long flags;
+ struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
+ struct rcu_node *rnp = rdp->mynode;
+
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ rnp->wakemask |= rdp->grpmask;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ invoke_rcu_node_kthread(rnp);
+}
+
+/*
+ * Drop to non-real-time priority and yield, but only after posting a
+ * timer that will cause us to regain our real-time priority if we
+ * remain preempted. Either way, we restore our real-time priority
+ * before returning.
+ */
+static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
+{
+ struct sched_param sp;
+ struct timer_list yield_timer;
+
+ setup_timer_on_stack(&yield_timer, f, arg);
+ mod_timer(&yield_timer, jiffies + 2);
+ sp.sched_priority = 0;
+ sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
+ set_user_nice(current, 19);
+ schedule();
+ sp.sched_priority = RCU_KTHREAD_PRIO;
+ sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
+ del_timer(&yield_timer);
+}
+
+/*
+ * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
+ * This can happen while the corresponding CPU is either coming online
+ * or going offline. We cannot wait until the CPU is fully online
+ * before starting the kthread, because the various notifier functions
+ * can wait for RCU grace periods. So we park rcu_cpu_kthread() until
+ * the corresponding CPU is online.
+ *
+ * Return 1 if the kthread needs to stop, 0 otherwise.
+ *
+ * Caller must disable bh. This function can momentarily enable it.
+ */
+static int rcu_cpu_kthread_should_stop(int cpu)
+{
+ while (cpu_is_offline(cpu) ||
+ !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
+ smp_processor_id() != cpu) {
+ if (kthread_should_stop())
+ return 1;
+ per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
+ per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
+ local_bh_enable();
+ schedule_timeout_uninterruptible(1);
+ if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
+ set_cpus_allowed_ptr(current, cpumask_of(cpu));
+ local_bh_disable();
+ }
+ per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
+ return 0;
+}
+
+/*
+ * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
+ * earlier RCU softirq.
+ */
+static int rcu_cpu_kthread(void *arg)
+{
+ int cpu = (int)(long)arg;
+ unsigned long flags;
+ int spincnt = 0;
+ unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
+ wait_queue_head_t *wqp = &per_cpu(rcu_cpu_wq, cpu);
+ char work;
+ char *workp = &per_cpu(rcu_cpu_has_work, cpu);
+
+ for (;;) {
+ *statusp = RCU_KTHREAD_WAITING;
+ wait_event_interruptible(*wqp,
+ *workp != 0 || kthread_should_stop());
+ local_bh_disable();
+ if (rcu_cpu_kthread_should_stop(cpu)) {
+ local_bh_enable();
+ break;
+ }
+ *statusp = RCU_KTHREAD_RUNNING;
+ per_cpu(rcu_cpu_kthread_loops, cpu)++;
+ local_irq_save(flags);
+ work = *workp;
+ *workp = 0;
+ local_irq_restore(flags);
+ if (work)
+ rcu_process_callbacks();
+ local_bh_enable();
+ if (*workp != 0)
+ spincnt++;
+ else
+ spincnt = 0;
+ if (spincnt > 10) {
+ *statusp = RCU_KTHREAD_YIELDING;
+ rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
+ spincnt = 0;
+ }
+ }
+ *statusp = RCU_KTHREAD_STOPPED;
+ return 0;
+}
+
+/*
+ * Spawn a per-CPU kthread, setting up affinity and priority.
+ * Because the CPU hotplug lock is held, no other CPU will be attempting
+ * to manipulate rcu_cpu_kthread_task. There might be another CPU
+ * attempting to access it during boot, but the locking in kthread_bind()
+ * will enforce sufficient ordering.
+ */
+static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
+{
+ struct sched_param sp;
+ struct task_struct *t;
+
+ if (!rcu_kthreads_spawnable ||
+ per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
+ return 0;
+ t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu);
+ if (IS_ERR(t))
+ return PTR_ERR(t);
+ kthread_bind(t, cpu);
+ per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
+ WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
+ per_cpu(rcu_cpu_kthread_task, cpu) = t;
+ wake_up_process(t);
+ sp.sched_priority = RCU_KTHREAD_PRIO;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ return 0;
+}
+
+/*
+ * Per-rcu_node kthread, which is in charge of waking up the per-CPU
+ * kthreads when needed. We ignore requests to wake up kthreads
+ * for offline CPUs, which is OK because force_quiescent_state()
+ * takes care of this case.
+ */
+static int rcu_node_kthread(void *arg)
+{
+ int cpu;
+ unsigned long flags;
+ unsigned long mask;
+ struct rcu_node *rnp = (struct rcu_node *)arg;
+ struct sched_param sp;
+ struct task_struct *t;
+
+ for (;;) {
+ rnp->node_kthread_status = RCU_KTHREAD_WAITING;
+ wait_event_interruptible(rnp->node_wq, rnp->wakemask != 0);
+ rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ mask = rnp->wakemask;
+ rnp->wakemask = 0;
+ rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
+ for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
+ if ((mask & 0x1) == 0)
+ continue;
+ preempt_disable();
+ t = per_cpu(rcu_cpu_kthread_task, cpu);
+ if (!cpu_online(cpu) || t == NULL) {
+ preempt_enable();
+ continue;
+ }
+ per_cpu(rcu_cpu_has_work, cpu) = 1;
+ sp.sched_priority = RCU_KTHREAD_PRIO;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ preempt_enable();
+ }
+ }
+ /* NOTREACHED */
+ rnp->node_kthread_status = RCU_KTHREAD_STOPPED;
+ return 0;
+}
+
+/*
+ * Set the per-rcu_node kthread's affinity to cover all CPUs that are
+ * served by the rcu_node in question. The CPU hotplug lock is still
+ * held, so the value of rnp->qsmaskinit will be stable.
+ *
+ * We don't include outgoingcpu in the affinity set, use -1 if there is
+ * no outgoing CPU. If there are no CPUs left in the affinity set,
+ * this function allows the kthread to execute on any CPU.
+ */
+static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
+{
+ cpumask_var_t cm;
+ int cpu;
+ unsigned long mask = rnp->qsmaskinit;
+
+ if (rnp->node_kthread_task == NULL)
+ return;
+ if (!alloc_cpumask_var(&cm, GFP_KERNEL))
+ return;
+ cpumask_clear(cm);
+ for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
+ if ((mask & 0x1) && cpu != outgoingcpu)
+ cpumask_set_cpu(cpu, cm);
+ if (cpumask_weight(cm) == 0) {
+ cpumask_setall(cm);
+ for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
+ cpumask_clear_cpu(cpu, cm);
+ WARN_ON_ONCE(cpumask_weight(cm) == 0);
+ }
+ set_cpus_allowed_ptr(rnp->node_kthread_task, cm);
+ rcu_boost_kthread_setaffinity(rnp, cm);
+ free_cpumask_var(cm);
+}
+
+/*
+ * Spawn a per-rcu_node kthread, setting priority and affinity.
+ * Called during boot before online/offline can happen, or, if
+ * during runtime, with the main CPU-hotplug locks held. So only
+ * one of these can be executing at a time.
+ */
+static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
+ struct rcu_node *rnp)
+{
+ unsigned long flags;
+ int rnp_index = rnp - &rsp->node[0];
+ struct sched_param sp;
+ struct task_struct *t;
+
+ if (!rcu_kthreads_spawnable ||
+ rnp->qsmaskinit == 0)
+ return 0;
+ if (rnp->node_kthread_task == NULL) {
+ t = kthread_create(rcu_node_kthread, (void *)rnp,
+ "rcun%d", rnp_index);
+ if (IS_ERR(t))
+ return PTR_ERR(t);
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ rnp->node_kthread_task = t;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ wake_up_process(t);
+ sp.sched_priority = 99;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ }
+ return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
+}
+
+/*
+ * Spawn all kthreads -- called as soon as the scheduler is running.
+ */
+static int __init rcu_spawn_kthreads(void)
+{
+ int cpu;
+ struct rcu_node *rnp;
+
+ rcu_kthreads_spawnable = 1;
+ for_each_possible_cpu(cpu) {
+ init_waitqueue_head(&per_cpu(rcu_cpu_wq, cpu));
+ per_cpu(rcu_cpu_has_work, cpu) = 0;
+ if (cpu_online(cpu))
+ (void)rcu_spawn_one_cpu_kthread(cpu);
+ }
+ rnp = rcu_get_root(rcu_state);
+ init_waitqueue_head(&rnp->node_wq);
+ rcu_init_boost_waitqueue(rnp);
+ (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
+ if (NUM_RCU_NODES > 1)
+ rcu_for_each_leaf_node(rcu_state, rnp) {
+ init_waitqueue_head(&rnp->node_wq);
+ rcu_init_boost_waitqueue(rnp);
+ (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
+ }
+ return 0;
+}
+early_initcall(rcu_spawn_kthreads);
+
static void
__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
struct rcu_state *rsp)
@@ -1439,6 +1843,13 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
/* Add the callback to our list. */
*rdp->nxttail[RCU_NEXT_TAIL] = head;
rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
+ rdp->qlen++;
+
+ /* If interrupts were disabled, don't dive into RCU core. */
+ if (irqs_disabled_flags(flags)) {
+ local_irq_restore(flags);
+ return;
+ }
/*
* Force the grace period if too many callbacks or too long waiting.
@@ -1447,7 +1858,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
* invoking force_quiescent_state() if the newly enqueued callback
* is the only one waiting for a grace period to complete.
*/
- if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
+ if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
/* Are we ignoring a completed grace period? */
rcu_process_gp_end(rsp, rdp);
@@ -1583,7 +1994,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
* or RCU-bh, force a local reschedule.
*/
rdp->n_rp_qs_pending++;
- if (!rdp->preemptable &&
+ if (!rdp->preemptible &&
ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
jiffies))
set_need_resched();
@@ -1760,7 +2171,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
* that this CPU cannot possibly have any RCU callbacks in flight yet.
*/
static void __cpuinit
-rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
+rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
{
unsigned long flags;
unsigned long mask;
@@ -1772,7 +2183,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
rdp->passed_quiesc = 0; /* We could be racing with new GP, */
rdp->qs_pending = 1; /* so set up to respond to current GP. */
rdp->beenonline = 1; /* We have now been online. */
- rdp->preemptable = preemptable;
+ rdp->preemptible = preemptible;
rdp->qlen_last_fqs_check = 0;
rdp->n_force_qs_snap = rsp->n_force_qs;
rdp->blimit = blimit;
@@ -1813,6 +2224,19 @@ static void __cpuinit rcu_online_cpu(int cpu)
rcu_preempt_init_percpu_data(cpu);
}
+static void __cpuinit rcu_online_kthreads(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
+ struct rcu_node *rnp = rdp->mynode;
+
+ /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
+ if (rcu_kthreads_spawnable) {
+ (void)rcu_spawn_one_cpu_kthread(cpu);
+ if (rnp->node_kthread_task == NULL)
+ (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
+ }
+}
+
/*
* Handle CPU online/offline notification events.
*/
@@ -1820,11 +2244,23 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
+ struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
+ struct rcu_node *rnp = rdp->mynode;
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
rcu_online_cpu(cpu);
+ rcu_online_kthreads(cpu);
+ break;
+ case CPU_ONLINE:
+ case CPU_DOWN_FAILED:
+ rcu_node_kthread_setaffinity(rnp, -1);
+ rcu_cpu_kthread_setrt(cpu, 1);
+ break;
+ case CPU_DOWN_PREPARE:
+ rcu_node_kthread_setaffinity(rnp, cpu);
+ rcu_cpu_kthread_setrt(cpu, 0);
break;
case CPU_DYING:
case CPU_DYING_FROZEN:
@@ -1943,10 +2379,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
j / rsp->levelspread[i - 1];
}
rnp->level = i;
- INIT_LIST_HEAD(&rnp->blocked_tasks[0]);
- INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
- INIT_LIST_HEAD(&rnp->blocked_tasks[2]);
- INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
+ INIT_LIST_HEAD(&rnp->blkd_tasks);
}
}
@@ -1968,7 +2401,6 @@ void __init rcu_init(void)
rcu_init_one(&rcu_sched_state, &rcu_sched_data);
rcu_init_one(&rcu_bh_state, &rcu_bh_data);
__rcu_init_preempt();
- open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
/*
* We don't need protection against CPU-hotplug here because
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index e8f057e44e3e..257664815d5d 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -91,6 +91,14 @@ struct rcu_dynticks {
/* remains even for nmi from irq handler. */
};
+/* RCU's kthread states for tracing. */
+#define RCU_KTHREAD_STOPPED 0
+#define RCU_KTHREAD_RUNNING 1
+#define RCU_KTHREAD_WAITING 2
+#define RCU_KTHREAD_OFFCPU 3
+#define RCU_KTHREAD_YIELDING 4
+#define RCU_KTHREAD_MAX 4
+
/*
* Definition for node within the RCU grace-period-detection hierarchy.
*/
@@ -109,10 +117,11 @@ struct rcu_node {
/* an rcu_data structure, otherwise, each */
/* bit corresponds to a child rcu_node */
/* structure. */
- unsigned long expmask; /* Groups that have ->blocked_tasks[] */
+ unsigned long expmask; /* Groups that have ->blkd_tasks */
/* elements that need to drain to allow the */
/* current expedited grace period to */
/* complete (only for TREE_PREEMPT_RCU). */
+ unsigned long wakemask; /* CPUs whose kthread needs to be awakened. */
unsigned long qsmaskinit;
/* Per-GP initial value for qsmask & expmask. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -122,11 +131,68 @@ struct rcu_node {
u8 grpnum; /* CPU/group number for next level up. */
u8 level; /* root is at level 0. */
struct rcu_node *parent;
- struct list_head blocked_tasks[4];
- /* Tasks blocked in RCU read-side critsect. */
- /* Grace period number (->gpnum) x blocked */
- /* by tasks on the (x & 0x1) element of the */
- /* blocked_tasks[] array. */
+ struct list_head blkd_tasks;
+ /* Tasks blocked in RCU read-side critical */
+ /* section. Tasks are placed at the head */
+ /* of this list and age towards the tail. */
+ struct list_head *gp_tasks;
+ /* Pointer to the first task blocking the */
+ /* current grace period, or NULL if there */
+ /* is no such task. */
+ struct list_head *exp_tasks;
+ /* Pointer to the first task blocking the */
+ /* current expedited grace period, or NULL */
+ /* if there is no such task. If there */
+ /* is no current expedited grace period, */
+ /* then there can cannot be any such task. */
+#ifdef CONFIG_RCU_BOOST
+ struct list_head *boost_tasks;
+ /* Pointer to first task that needs to be */
+ /* priority boosted, or NULL if no priority */
+ /* boosting is needed for this rcu_node */
+ /* structure. If there are no tasks */
+ /* queued on this rcu_node structure that */
+ /* are blocking the current grace period, */
+ /* there can be no such task. */
+ unsigned long boost_time;
+ /* When to start boosting (jiffies). */
+ struct task_struct *boost_kthread_task;
+ /* kthread that takes care of priority */
+ /* boosting for this rcu_node structure. */
+ wait_queue_head_t boost_wq;
+ /* Wait queue on which to park the boost */
+ /* kthread. */
+ unsigned int boost_kthread_status;
+ /* State of boost_kthread_task for tracing. */
+ unsigned long n_tasks_boosted;
+ /* Total number of tasks boosted. */
+ unsigned long n_exp_boosts;
+ /* Number of tasks boosted for expedited GP. */
+ unsigned long n_normal_boosts;
+ /* Number of tasks boosted for normal GP. */
+ unsigned long n_balk_blkd_tasks;
+ /* Refused to boost: no blocked tasks. */
+ unsigned long n_balk_exp_gp_tasks;
+ /* Refused to boost: nothing blocking GP. */
+ unsigned long n_balk_boost_tasks;
+ /* Refused to boost: already boosting. */
+ unsigned long n_balk_notblocked;
+ /* Refused to boost: RCU RS CS still running. */
+ unsigned long n_balk_notyet;
+ /* Refused to boost: not yet time. */
+ unsigned long n_balk_nos;
+ /* Refused to boost: not sure why, though. */
+ /* This can happen due to race conditions. */
+#endif /* #ifdef CONFIG_RCU_BOOST */
+ struct task_struct *node_kthread_task;
+ /* kthread that takes care of this rcu_node */
+ /* structure, for example, awakening the */
+ /* per-CPU kthreads as needed. */
+ wait_queue_head_t node_wq;
+ /* Wait queue on which to park the per-node */
+ /* kthread. */
+ unsigned int node_kthread_status;
+ /* State of node_kthread_task for tracing. */
} ____cacheline_internodealigned_in_smp;
/*
@@ -175,7 +241,7 @@ struct rcu_data {
bool passed_quiesc; /* User-mode/idle loop etc. */
bool qs_pending; /* Core waits for quiesc state. */
bool beenonline; /* CPU online at least once. */
- bool preemptable; /* Preemptable RCU? */
+ bool preemptible; /* Preemptible RCU? */
struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
unsigned long grpmask; /* Mask to apply to leaf qsmask. */
@@ -254,7 +320,6 @@ struct rcu_data {
#endif /* #else #ifdef CONFIG_NO_HZ */
#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
#ifdef CONFIG_PROVE_RCU
#define RCU_STALL_DELAY_DELTA (5 * HZ)
@@ -272,13 +337,6 @@ struct rcu_data {
/* scheduling clock irq */
/* before ratting on them. */
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE
-#define RCU_CPU_STALL_SUPPRESS_INIT 0
-#else
-#define RCU_CPU_STALL_SUPPRESS_INIT 1
-#endif
-
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
/*
* RCU global state, including node hierarchy. This hierarchy is
@@ -325,12 +383,12 @@ struct rcu_state {
/* due to lock unavailable. */
unsigned long n_force_qs_ngp; /* Number of calls leaving */
/* due to no GP active. */
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
unsigned long gp_start; /* Time at which GP started, */
/* but in jiffies. */
unsigned long jiffies_stall; /* Time at which to check */
/* for CPU stalls. */
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+ unsigned long gp_max; /* Maximum GP duration in */
+ /* jiffies. */
char *name; /* Name of structure. */
};
@@ -361,16 +419,14 @@ DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
static void rcu_bootup_announce(void);
long rcu_batches_completed(void);
static void rcu_preempt_note_context_switch(int cpu);
-static int rcu_preempted_readers(struct rcu_node *rnp);
+static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
unsigned long flags);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
static void rcu_print_detail_task_stall(struct rcu_state *rsp);
static void rcu_print_task_stall(struct rcu_node *rnp);
static void rcu_preempt_stall_reset(void);
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
@@ -390,5 +446,13 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
static void rcu_preempt_send_cbs_to_online(void);
static void __init __rcu_init_preempt(void);
static void rcu_needs_cpu_flush(void);
+static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp);
+static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
+ cpumask_var_t cm);
+static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
+static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
+ struct rcu_node *rnp,
+ int rnp_index);
#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index a3638710dc67..3f6559a5f5cd 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1,7 +1,7 @@
/*
* Read-Copy Update mechanism for mutual exclusion (tree-based version)
* Internal non-public definitions that provide either classic
- * or preemptable semantics.
+ * or preemptible semantics.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -54,10 +54,6 @@ static void __init rcu_bootup_announce_oddness(void)
#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
#endif
-#ifndef CONFIG_RCU_CPU_STALL_DETECTOR
- printk(KERN_INFO
- "\tRCU-based detection of stalled CPUs is disabled.\n");
-#endif
#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
#endif
@@ -70,6 +66,7 @@ static void __init rcu_bootup_announce_oddness(void)
struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
+static struct rcu_state *rcu_state = &rcu_preempt_state;
static int rcu_preempted_readers_exp(struct rcu_node *rnp);
@@ -78,7 +75,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp);
*/
static void __init rcu_bootup_announce(void)
{
- printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n");
+ printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n");
rcu_bootup_announce_oddness();
}
@@ -111,7 +108,7 @@ void rcu_force_quiescent_state(void)
EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
/*
- * Record a preemptable-RCU quiescent state for the specified CPU. Note
+ * Record a preemptible-RCU quiescent state for the specified CPU. Note
* that this just means that the task currently running on the CPU is
* not in a quiescent state. There might be any number of tasks blocked
* while in an RCU read-side critical section.
@@ -134,12 +131,12 @@ static void rcu_preempt_qs(int cpu)
* We have entered the scheduler, and the current task might soon be
* context-switched away from. If this task is in an RCU read-side
* critical section, we will no longer be able to rely on the CPU to
- * record that fact, so we enqueue the task on the appropriate entry
- * of the blocked_tasks[] array. The task will dequeue itself when
- * it exits the outermost enclosing RCU read-side critical section.
- * Therefore, the current grace period cannot be permitted to complete
- * until the blocked_tasks[] entry indexed by the low-order bit of
- * rnp->gpnum empties.
+ * record that fact, so we enqueue the task on the blkd_tasks list.
+ * The task will dequeue itself when it exits the outermost enclosing
+ * RCU read-side critical section. Therefore, the current grace period
+ * cannot be permitted to complete until the blkd_tasks list entries
+ * predating the current grace period drain, in other words, until
+ * rnp->gp_tasks becomes NULL.
*
* Caller must disable preemption.
*/
@@ -147,7 +144,6 @@ static void rcu_preempt_note_context_switch(int cpu)
{
struct task_struct *t = current;
unsigned long flags;
- int phase;
struct rcu_data *rdp;
struct rcu_node *rnp;
@@ -169,15 +165,30 @@ static void rcu_preempt_note_context_switch(int cpu)
* (i.e., this CPU has not yet passed through a quiescent
* state for the current grace period), then as long
* as that task remains queued, the current grace period
- * cannot end.
+ * cannot end. Note that there is some uncertainty as
+ * to exactly when the current grace period started.
+ * We take a conservative approach, which can result
+ * in unnecessarily waiting on tasks that started very
+ * slightly after the current grace period began. C'est
+ * la vie!!!
*
* But first, note that the current CPU must still be
* on line!
*/
WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
- phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1;
- list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
+ if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
+ list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
+ rnp->gp_tasks = &t->rcu_node_entry;
+#ifdef CONFIG_RCU_BOOST
+ if (rnp->boost_tasks != NULL)
+ rnp->boost_tasks = rnp->gp_tasks;
+#endif /* #ifdef CONFIG_RCU_BOOST */
+ } else {
+ list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
+ if (rnp->qsmask & rdp->grpmask)
+ rnp->gp_tasks = &t->rcu_node_entry;
+ }
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
@@ -196,7 +207,7 @@ static void rcu_preempt_note_context_switch(int cpu)
}
/*
- * Tree-preemptable RCU implementation for rcu_read_lock().
+ * Tree-preemptible RCU implementation for rcu_read_lock().
* Just increment ->rcu_read_lock_nesting, shared state will be updated
* if we block.
*/
@@ -212,12 +223,9 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock);
* for the specified rcu_node structure. If the caller needs a reliable
* answer, it must hold the rcu_node's ->lock.
*/
-static int rcu_preempted_readers(struct rcu_node *rnp)
+static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
{
- int phase = rnp->gpnum & 0x1;
-
- return !list_empty(&rnp->blocked_tasks[phase]) ||
- !list_empty(&rnp->blocked_tasks[phase + 2]);
+ return rnp->gp_tasks != NULL;
}
/*
@@ -233,7 +241,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
unsigned long mask;
struct rcu_node *rnp_p;
- if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
+ if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return; /* Still need more quiescent states! */
}
@@ -257,6 +265,21 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
}
/*
+ * Advance a ->blkd_tasks-list pointer to the next entry, instead
+ * returning NULL if at the end of the list.
+ */
+static struct list_head *rcu_next_node_entry(struct task_struct *t,
+ struct rcu_node *rnp)
+{
+ struct list_head *np;
+
+ np = t->rcu_node_entry.next;
+ if (np == &rnp->blkd_tasks)
+ np = NULL;
+ return np;
+}
+
+/*
* Handle special cases during rcu_read_unlock(), such as needing to
* notify RCU core processing or task having blocked during the RCU
* read-side critical section.
@@ -266,6 +289,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
int empty;
int empty_exp;
unsigned long flags;
+ struct list_head *np;
struct rcu_node *rnp;
int special;
@@ -306,10 +330,19 @@ static void rcu_read_unlock_special(struct task_struct *t)
break;
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
}
- empty = !rcu_preempted_readers(rnp);
+ empty = !rcu_preempt_blocked_readers_cgp(rnp);
empty_exp = !rcu_preempted_readers_exp(rnp);
smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
+ np = rcu_next_node_entry(t, rnp);
list_del_init(&t->rcu_node_entry);
+ if (&t->rcu_node_entry == rnp->gp_tasks)
+ rnp->gp_tasks = np;
+ if (&t->rcu_node_entry == rnp->exp_tasks)
+ rnp->exp_tasks = np;
+#ifdef CONFIG_RCU_BOOST
+ if (&t->rcu_node_entry == rnp->boost_tasks)
+ rnp->boost_tasks = np;
+#endif /* #ifdef CONFIG_RCU_BOOST */
t->rcu_blocked_node = NULL;
/*
@@ -322,6 +355,15 @@ static void rcu_read_unlock_special(struct task_struct *t)
else
rcu_report_unblock_qs_rnp(rnp, flags);
+#ifdef CONFIG_RCU_BOOST
+ /* Unboost if we were boosted. */
+ if (special & RCU_READ_UNLOCK_BOOSTED) {
+ t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
+ rt_mutex_unlock(t->rcu_boost_mutex);
+ t->rcu_boost_mutex = NULL;
+ }
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
/*
* If this was the last task on the expedited lists,
* then we need to report up the rcu_node hierarchy.
@@ -334,7 +376,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
}
/*
- * Tree-preemptable RCU implementation for rcu_read_unlock().
+ * Tree-preemptible RCU implementation for rcu_read_unlock().
* Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
* rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
* invoke rcu_read_unlock_special() to clean up after a context switch
@@ -356,8 +398,6 @@ void __rcu_read_unlock(void)
}
EXPORT_SYMBOL_GPL(__rcu_read_unlock);
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
-
#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
/*
@@ -367,18 +407,16 @@ EXPORT_SYMBOL_GPL(__rcu_read_unlock);
static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
{
unsigned long flags;
- struct list_head *lp;
- int phase;
struct task_struct *t;
- if (rcu_preempted_readers(rnp)) {
- raw_spin_lock_irqsave(&rnp->lock, flags);
- phase = rnp->gpnum & 0x1;
- lp = &rnp->blocked_tasks[phase];
- list_for_each_entry(t, lp, rcu_node_entry)
- sched_show_task(t);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- }
+ if (!rcu_preempt_blocked_readers_cgp(rnp))
+ return;
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ t = list_entry(rnp->gp_tasks,
+ struct task_struct, rcu_node_entry);
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
+ sched_show_task(t);
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
/*
@@ -408,16 +446,14 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
*/
static void rcu_print_task_stall(struct rcu_node *rnp)
{
- struct list_head *lp;
- int phase;
struct task_struct *t;
- if (rcu_preempted_readers(rnp)) {
- phase = rnp->gpnum & 0x1;
- lp = &rnp->blocked_tasks[phase];
- list_for_each_entry(t, lp, rcu_node_entry)
- printk(" P%d", t->pid);
- }
+ if (!rcu_preempt_blocked_readers_cgp(rnp))
+ return;
+ t = list_entry(rnp->gp_tasks,
+ struct task_struct, rcu_node_entry);
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
+ printk(" P%d", t->pid);
}
/*
@@ -430,18 +466,21 @@ static void rcu_preempt_stall_reset(void)
rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
}
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-
/*
* Check that the list of blocked tasks for the newly completed grace
* period is in fact empty. It is a serious bug to complete a grace
* period that still has RCU readers blocked! This function must be
* invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
* must be held by the caller.
+ *
+ * Also, if there are blocked tasks on the list, they automatically
+ * block the newly created grace period, so set up ->gp_tasks accordingly.
*/
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
- WARN_ON_ONCE(rcu_preempted_readers(rnp));
+ WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
+ if (!list_empty(&rnp->blkd_tasks))
+ rnp->gp_tasks = rnp->blkd_tasks.next;
WARN_ON_ONCE(rnp->qsmask);
}
@@ -465,50 +504,68 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
struct rcu_node *rnp,
struct rcu_data *rdp)
{
- int i;
struct list_head *lp;
struct list_head *lp_root;
int retval = 0;
struct rcu_node *rnp_root = rcu_get_root(rsp);
- struct task_struct *tp;
+ struct task_struct *t;
if (rnp == rnp_root) {
WARN_ONCE(1, "Last CPU thought to be offlined?");
return 0; /* Shouldn't happen: at least one CPU online. */
}
- WARN_ON_ONCE(rnp != rdp->mynode &&
- (!list_empty(&rnp->blocked_tasks[0]) ||
- !list_empty(&rnp->blocked_tasks[1]) ||
- !list_empty(&rnp->blocked_tasks[2]) ||
- !list_empty(&rnp->blocked_tasks[3])));
+
+ /* If we are on an internal node, complain bitterly. */
+ WARN_ON_ONCE(rnp != rdp->mynode);
/*
- * Move tasks up to root rcu_node. Rely on the fact that the
- * root rcu_node can be at most one ahead of the rest of the
- * rcu_nodes in terms of gp_num value. This fact allows us to
- * move the blocked_tasks[] array directly, element by element.
+ * Move tasks up to root rcu_node. Don't try to get fancy for
+ * this corner-case operation -- just put this node's tasks
+ * at the head of the root node's list, and update the root node's
+ * ->gp_tasks and ->exp_tasks pointers to those of this node's,
+ * if non-NULL. This might result in waiting for more tasks than
+ * absolutely necessary, but this is a good performance/complexity
+ * tradeoff.
*/
- if (rcu_preempted_readers(rnp))
+ if (rcu_preempt_blocked_readers_cgp(rnp))
retval |= RCU_OFL_TASKS_NORM_GP;
if (rcu_preempted_readers_exp(rnp))
retval |= RCU_OFL_TASKS_EXP_GP;
- for (i = 0; i < 4; i++) {
- lp = &rnp->blocked_tasks[i];
- lp_root = &rnp_root->blocked_tasks[i];
- while (!list_empty(lp)) {
- tp = list_entry(lp->next, typeof(*tp), rcu_node_entry);
- raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
- list_del(&tp->rcu_node_entry);
- tp->rcu_blocked_node = rnp_root;
- list_add(&tp->rcu_node_entry, lp_root);
- raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */
- }
+ lp = &rnp->blkd_tasks;
+ lp_root = &rnp_root->blkd_tasks;
+ while (!list_empty(lp)) {
+ t = list_entry(lp->next, typeof(*t), rcu_node_entry);
+ raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
+ list_del(&t->rcu_node_entry);
+ t->rcu_blocked_node = rnp_root;
+ list_add(&t->rcu_node_entry, lp_root);
+ if (&t->rcu_node_entry == rnp->gp_tasks)
+ rnp_root->gp_tasks = rnp->gp_tasks;
+ if (&t->rcu_node_entry == rnp->exp_tasks)
+ rnp_root->exp_tasks = rnp->exp_tasks;
+#ifdef CONFIG_RCU_BOOST
+ if (&t->rcu_node_entry == rnp->boost_tasks)
+ rnp_root->boost_tasks = rnp->boost_tasks;
+#endif /* #ifdef CONFIG_RCU_BOOST */
+ raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
}
+
+#ifdef CONFIG_RCU_BOOST
+ /* In case root is being boosted and leaf is not. */
+ raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
+ if (rnp_root->boost_tasks != NULL &&
+ rnp_root->boost_tasks != rnp_root->gp_tasks)
+ rnp_root->boost_tasks = rnp_root->gp_tasks;
+ raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+ rnp->gp_tasks = NULL;
+ rnp->exp_tasks = NULL;
return retval;
}
/*
- * Do CPU-offline processing for preemptable RCU.
+ * Do CPU-offline processing for preemptible RCU.
*/
static void rcu_preempt_offline_cpu(int cpu)
{
@@ -537,7 +594,7 @@ static void rcu_preempt_check_callbacks(int cpu)
}
/*
- * Process callbacks for preemptable RCU.
+ * Process callbacks for preemptible RCU.
*/
static void rcu_preempt_process_callbacks(void)
{
@@ -546,7 +603,7 @@ static void rcu_preempt_process_callbacks(void)
}
/*
- * Queue a preemptable-RCU callback for invocation after a grace period.
+ * Queue a preemptible-RCU callback for invocation after a grace period.
*/
void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
{
@@ -594,8 +651,7 @@ static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
*/
static int rcu_preempted_readers_exp(struct rcu_node *rnp)
{
- return !list_empty(&rnp->blocked_tasks[2]) ||
- !list_empty(&rnp->blocked_tasks[3]);
+ return rnp->exp_tasks != NULL;
}
/*
@@ -655,13 +711,17 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
static void
sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
{
- int must_wait;
+ unsigned long flags;
+ int must_wait = 0;
- raw_spin_lock(&rnp->lock); /* irqs already disabled */
- list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]);
- list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]);
- must_wait = rcu_preempted_readers_exp(rnp);
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ if (list_empty(&rnp->blkd_tasks))
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ else {
+ rnp->exp_tasks = rnp->blkd_tasks.next;
+ rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
+ must_wait = 1;
+ }
if (!must_wait)
rcu_report_exp_rnp(rsp, rnp);
}
@@ -669,9 +729,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
/*
* Wait for an rcu-preempt grace period, but expedite it. The basic idea
* is to invoke synchronize_sched_expedited() to push all the tasks to
- * the ->blocked_tasks[] lists, move all entries from the first set of
- * ->blocked_tasks[] lists to the second set, and finally wait for this
- * second set to drain.
+ * the ->blkd_tasks lists and wait for this list to drain.
*/
void synchronize_rcu_expedited(void)
{
@@ -703,7 +761,7 @@ void synchronize_rcu_expedited(void)
if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
goto unlock_mb_ret; /* Others did our work for us. */
- /* force all RCU readers onto blocked_tasks[]. */
+ /* force all RCU readers onto ->blkd_tasks lists. */
synchronize_sched_expedited();
raw_spin_lock_irqsave(&rsp->onofflock, flags);
@@ -715,7 +773,7 @@ void synchronize_rcu_expedited(void)
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
}
- /* Snapshot current state of ->blocked_tasks[] lists. */
+ /* Snapshot current state of ->blkd_tasks lists. */
rcu_for_each_leaf_node(rsp, rnp)
sync_rcu_preempt_exp_init(rsp, rnp);
if (NUM_RCU_NODES > 1)
@@ -723,7 +781,7 @@ void synchronize_rcu_expedited(void)
raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
- /* Wait for snapshotted ->blocked_tasks[] lists to drain. */
+ /* Wait for snapshotted ->blkd_tasks lists to drain. */
rnp = rcu_get_root(rsp);
wait_event(sync_rcu_preempt_exp_wq,
sync_rcu_preempt_exp_done(rnp));
@@ -739,7 +797,7 @@ mb_ret:
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
/*
- * Check to see if there is any immediate preemptable-RCU-related work
+ * Check to see if there is any immediate preemptible-RCU-related work
* to be done.
*/
static int rcu_preempt_pending(int cpu)
@@ -749,7 +807,7 @@ static int rcu_preempt_pending(int cpu)
}
/*
- * Does preemptable RCU need the CPU to stay out of dynticks mode?
+ * Does preemptible RCU need the CPU to stay out of dynticks mode?
*/
static int rcu_preempt_needs_cpu(int cpu)
{
@@ -766,7 +824,7 @@ void rcu_barrier(void)
EXPORT_SYMBOL_GPL(rcu_barrier);
/*
- * Initialize preemptable RCU's per-CPU data.
+ * Initialize preemptible RCU's per-CPU data.
*/
static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
{
@@ -774,7 +832,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
}
/*
- * Move preemptable RCU's callbacks from dying CPU to other online CPU.
+ * Move preemptible RCU's callbacks from dying CPU to other online CPU.
*/
static void rcu_preempt_send_cbs_to_online(void)
{
@@ -782,7 +840,7 @@ static void rcu_preempt_send_cbs_to_online(void)
}
/*
- * Initialize preemptable RCU's state structures.
+ * Initialize preemptible RCU's state structures.
*/
static void __init __rcu_init_preempt(void)
{
@@ -790,7 +848,7 @@ static void __init __rcu_init_preempt(void)
}
/*
- * Check for a task exiting while in a preemptable-RCU read-side
+ * Check for a task exiting while in a preemptible-RCU read-side
* critical section, clean up if so. No need to issue warnings,
* as debug_check_no_locks_held() already does this if lockdep
* is enabled.
@@ -802,11 +860,13 @@ void exit_rcu(void)
if (t->rcu_read_lock_nesting == 0)
return;
t->rcu_read_lock_nesting = 1;
- rcu_read_unlock();
+ __rcu_read_unlock();
}
#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+static struct rcu_state *rcu_state = &rcu_sched_state;
+
/*
* Tell them what RCU they are running.
*/
@@ -836,7 +896,7 @@ void rcu_force_quiescent_state(void)
EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
/*
- * Because preemptable RCU does not exist, we never have to check for
+ * Because preemptible RCU does not exist, we never have to check for
* CPUs being in quiescent states.
*/
static void rcu_preempt_note_context_switch(int cpu)
@@ -844,10 +904,10 @@ static void rcu_preempt_note_context_switch(int cpu)
}
/*
- * Because preemptable RCU does not exist, there are never any preempted
+ * Because preemptible RCU does not exist, there are never any preempted
* RCU readers.
*/
-static int rcu_preempted_readers(struct rcu_node *rnp)
+static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
{
return 0;
}
@@ -862,10 +922,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
-
/*
- * Because preemptable RCU does not exist, we never have to check for
+ * Because preemptible RCU does not exist, we never have to check for
* tasks blocked within RCU read-side critical sections.
*/
static void rcu_print_detail_task_stall(struct rcu_state *rsp)
@@ -873,7 +931,7 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
}
/*
- * Because preemptable RCU does not exist, we never have to check for
+ * Because preemptible RCU does not exist, we never have to check for
* tasks blocked within RCU read-side critical sections.
*/
static void rcu_print_task_stall(struct rcu_node *rnp)
@@ -888,10 +946,8 @@ static void rcu_preempt_stall_reset(void)
{
}
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-
/*
- * Because there is no preemptable RCU, there can be no readers blocked,
+ * Because there is no preemptible RCU, there can be no readers blocked,
* so there is no need to check for blocked tasks. So check only for
* bogus qsmask values.
*/
@@ -903,7 +959,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
#ifdef CONFIG_HOTPLUG_CPU
/*
- * Because preemptable RCU does not exist, it never needs to migrate
+ * Because preemptible RCU does not exist, it never needs to migrate
* tasks that were blocked within RCU read-side critical sections, and
* such non-existent tasks cannot possibly have been blocking the current
* grace period.
@@ -916,7 +972,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
}
/*
- * Because preemptable RCU does not exist, it never needs CPU-offline
+ * Because preemptible RCU does not exist, it never needs CPU-offline
* processing.
*/
static void rcu_preempt_offline_cpu(int cpu)
@@ -926,7 +982,7 @@ static void rcu_preempt_offline_cpu(int cpu)
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
/*
- * Because preemptable RCU does not exist, it never has any callbacks
+ * Because preemptible RCU does not exist, it never has any callbacks
* to check.
*/
static void rcu_preempt_check_callbacks(int cpu)
@@ -934,7 +990,7 @@ static void rcu_preempt_check_callbacks(int cpu)
}
/*
- * Because preemptable RCU does not exist, it never has any callbacks
+ * Because preemptible RCU does not exist, it never has any callbacks
* to process.
*/
static void rcu_preempt_process_callbacks(void)
@@ -943,7 +999,7 @@ static void rcu_preempt_process_callbacks(void)
/*
* Wait for an rcu-preempt grace period, but make it happen quickly.
- * But because preemptable RCU does not exist, map to rcu-sched.
+ * But because preemptible RCU does not exist, map to rcu-sched.
*/
void synchronize_rcu_expedited(void)
{
@@ -954,7 +1010,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
#ifdef CONFIG_HOTPLUG_CPU
/*
- * Because preemptable RCU does not exist, there is never any need to
+ * Because preemptible RCU does not exist, there is never any need to
* report on tasks preempted in RCU read-side critical sections during
* expedited RCU grace periods.
*/
@@ -966,7 +1022,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
/*
- * Because preemptable RCU does not exist, it never has any work to do.
+ * Because preemptible RCU does not exist, it never has any work to do.
*/
static int rcu_preempt_pending(int cpu)
{
@@ -974,7 +1030,7 @@ static int rcu_preempt_pending(int cpu)
}
/*
- * Because preemptable RCU does not exist, it never needs any CPU.
+ * Because preemptible RCU does not exist, it never needs any CPU.
*/
static int rcu_preempt_needs_cpu(int cpu)
{
@@ -982,7 +1038,7 @@ static int rcu_preempt_needs_cpu(int cpu)
}
/*
- * Because preemptable RCU does not exist, rcu_barrier() is just
+ * Because preemptible RCU does not exist, rcu_barrier() is just
* another name for rcu_barrier_sched().
*/
void rcu_barrier(void)
@@ -992,7 +1048,7 @@ void rcu_barrier(void)
EXPORT_SYMBOL_GPL(rcu_barrier);
/*
- * Because preemptable RCU does not exist, there is no per-CPU
+ * Because preemptible RCU does not exist, there is no per-CPU
* data to initialize.
*/
static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
@@ -1000,14 +1056,14 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
}
/*
- * Because there is no preemptable RCU, there are no callbacks to move.
+ * Because there is no preemptible RCU, there are no callbacks to move.
*/
static void rcu_preempt_send_cbs_to_online(void)
{
}
/*
- * Because preemptable RCU does not exist, it need not be initialized.
+ * Because preemptible RCU does not exist, it need not be initialized.
*/
static void __init __rcu_init_preempt(void)
{
@@ -1015,6 +1071,276 @@ static void __init __rcu_init_preempt(void)
#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifdef CONFIG_RCU_BOOST
+
+#include "rtmutex_common.h"
+
+#ifdef CONFIG_RCU_TRACE
+
+static void rcu_initiate_boost_trace(struct rcu_node *rnp)
+{
+ if (list_empty(&rnp->blkd_tasks))
+ rnp->n_balk_blkd_tasks++;
+ else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
+ rnp->n_balk_exp_gp_tasks++;
+ else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL)
+ rnp->n_balk_boost_tasks++;
+ else if (rnp->gp_tasks != NULL && rnp->qsmask != 0)
+ rnp->n_balk_notblocked++;
+ else if (rnp->gp_tasks != NULL &&
+ ULONG_CMP_LT(jiffies, rnp->boost_time))
+ rnp->n_balk_notyet++;
+ else
+ rnp->n_balk_nos++;
+}
+
+#else /* #ifdef CONFIG_RCU_TRACE */
+
+static void rcu_initiate_boost_trace(struct rcu_node *rnp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_TRACE */
+
+/*
+ * Carry out RCU priority boosting on the task indicated by ->exp_tasks
+ * or ->boost_tasks, advancing the pointer to the next task in the
+ * ->blkd_tasks list.
+ *
+ * Note that irqs must be enabled: boosting the task can block.
+ * Returns 1 if there are more tasks needing to be boosted.
+ */
+static int rcu_boost(struct rcu_node *rnp)
+{
+ unsigned long flags;
+ struct rt_mutex mtx;
+ struct task_struct *t;
+ struct list_head *tb;
+
+ if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL)
+ return 0; /* Nothing left to boost. */
+
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+
+ /*
+ * Recheck under the lock: all tasks in need of boosting
+ * might exit their RCU read-side critical sections on their own.
+ */
+ if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return 0;
+ }
+
+ /*
+ * Preferentially boost tasks blocking expedited grace periods.
+ * This cannot starve the normal grace periods because a second
+ * expedited grace period must boost all blocked tasks, including
+ * those blocking the pre-existing normal grace period.
+ */
+ if (rnp->exp_tasks != NULL) {
+ tb = rnp->exp_tasks;
+ rnp->n_exp_boosts++;
+ } else {
+ tb = rnp->boost_tasks;
+ rnp->n_normal_boosts++;
+ }
+ rnp->n_tasks_boosted++;
+
+ /*
+ * We boost task t by manufacturing an rt_mutex that appears to
+ * be held by task t. We leave a pointer to that rt_mutex where
+ * task t can find it, and task t will release the mutex when it
+ * exits its outermost RCU read-side critical section. Then
+ * simply acquiring this artificial rt_mutex will boost task
+ * t's priority. (Thanks to tglx for suggesting this approach!)
+ *
+ * Note that task t must acquire rnp->lock to remove itself from
+ * the ->blkd_tasks list, which it will do from exit() if from
+ * nowhere else. We therefore are guaranteed that task t will
+ * stay around at least until we drop rnp->lock. Note that
+ * rnp->lock also resolves races between our priority boosting
+ * and task t's exiting its outermost RCU read-side critical
+ * section.
+ */
+ t = container_of(tb, struct task_struct, rcu_node_entry);
+ rt_mutex_init_proxy_locked(&mtx, t);
+ t->rcu_boost_mutex = &mtx;
+ t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
+ rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
+
+ return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL;
+}
+
+/*
+ * Timer handler to initiate waking up of boost kthreads that
+ * have yielded the CPU due to excessive numbers of tasks to
+ * boost. We wake up the per-rcu_node kthread, which in turn
+ * will wake up the booster kthread.
+ */
+static void rcu_boost_kthread_timer(unsigned long arg)
+{
+ invoke_rcu_node_kthread((struct rcu_node *)arg);
+}
+
+/*
+ * Priority-boosting kthread. One per leaf rcu_node and one for the
+ * root rcu_node.
+ */
+static int rcu_boost_kthread(void *arg)
+{
+ struct rcu_node *rnp = (struct rcu_node *)arg;
+ int spincnt = 0;
+ int more2boost;
+
+ for (;;) {
+ rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
+ wait_event_interruptible(rnp->boost_wq, rnp->boost_tasks ||
+ rnp->exp_tasks);
+ rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
+ more2boost = rcu_boost(rnp);
+ if (more2boost)
+ spincnt++;
+ else
+ spincnt = 0;
+ if (spincnt > 10) {
+ rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
+ spincnt = 0;
+ }
+ }
+ /* NOTREACHED */
+ return 0;
+}
+
+/*
+ * Check to see if it is time to start boosting RCU readers that are
+ * blocking the current grace period, and, if so, tell the per-rcu_node
+ * kthread to start boosting them. If there is an expedited grace
+ * period in progress, it is always time to boost.
+ *
+ * The caller must hold rnp->lock, which this function releases,
+ * but irqs remain disabled. The ->boost_kthread_task is immortal,
+ * so we don't need to worry about it going away.
+ */
+static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
+{
+ struct task_struct *t;
+
+ if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
+ rnp->n_balk_exp_gp_tasks++;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return;
+ }
+ if (rnp->exp_tasks != NULL ||
+ (rnp->gp_tasks != NULL &&
+ rnp->boost_tasks == NULL &&
+ rnp->qsmask == 0 &&
+ ULONG_CMP_GE(jiffies, rnp->boost_time))) {
+ if (rnp->exp_tasks == NULL)
+ rnp->boost_tasks = rnp->gp_tasks;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ t = rnp->boost_kthread_task;
+ if (t != NULL)
+ wake_up_process(t);
+ } else {
+ rcu_initiate_boost_trace(rnp);
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ }
+}
+
+/*
+ * Set the affinity of the boost kthread. The CPU-hotplug locks are
+ * held, so no one should be messing with the existence of the boost
+ * kthread.
+ */
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
+ cpumask_var_t cm)
+{
+ struct task_struct *t;
+
+ t = rnp->boost_kthread_task;
+ if (t != NULL)
+ set_cpus_allowed_ptr(rnp->boost_kthread_task, cm);
+}
+
+#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
+
+/*
+ * Do priority-boost accounting for the start of a new grace period.
+ */
+static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
+{
+ rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
+}
+
+/*
+ * Initialize the RCU-boost waitqueue.
+ */
+static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp)
+{
+ init_waitqueue_head(&rnp->boost_wq);
+}
+
+/*
+ * Create an RCU-boost kthread for the specified node if one does not
+ * already exist. We only create this kthread for preemptible RCU.
+ * Returns zero if all is well, a negated errno otherwise.
+ */
+static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
+ struct rcu_node *rnp,
+ int rnp_index)
+{
+ unsigned long flags;
+ struct sched_param sp;
+ struct task_struct *t;
+
+ if (&rcu_preempt_state != rsp)
+ return 0;
+ if (rnp->boost_kthread_task != NULL)
+ return 0;
+ t = kthread_create(rcu_boost_kthread, (void *)rnp,
+ "rcub%d", rnp_index);
+ if (IS_ERR(t))
+ return PTR_ERR(t);
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ rnp->boost_kthread_task = t;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ wake_up_process(t);
+ sp.sched_priority = RCU_KTHREAD_PRIO;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ return 0;
+}
+
+#else /* #ifdef CONFIG_RCU_BOOST */
+
+static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
+{
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
+ cpumask_var_t cm)
+{
+}
+
+static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
+{
+}
+
+static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp)
+{
+}
+
+static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
+ struct rcu_node *rnp,
+ int rnp_index)
+{
+ return 0;
+}
+
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
+
#ifndef CONFIG_SMP
void synchronize_sched_expedited(void)
@@ -1187,8 +1513,8 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
*
* Because it is not legal to invoke rcu_process_callbacks() with irqs
* disabled, we do one pass of force_quiescent_state(), then do a
- * raise_softirq() to cause rcu_process_callbacks() to be invoked later.
- * The per-cpu rcu_dyntick_drain variable controls the sequencing.
+ * invoke_rcu_cpu_kthread() to cause rcu_process_callbacks() to be invoked
+ * later. The per-cpu rcu_dyntick_drain variable controls the sequencing.
*/
int rcu_needs_cpu(int cpu)
{
@@ -1239,7 +1565,7 @@ int rcu_needs_cpu(int cpu)
/* If RCU callbacks are still pending, RCU still needs this CPU. */
if (c)
- raise_softirq(RCU_SOFTIRQ);
+ invoke_rcu_cpu_kthread();
return c;
}
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index c8e97853b970..aa0fd72b4bc7 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,6 +46,18 @@
#define RCU_TREE_NONCORE
#include "rcutree.h"
+DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
+DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu);
+DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
+DECLARE_PER_CPU(char, rcu_cpu_has_work);
+
+static char convert_kthread_status(unsigned int kthread_status)
+{
+ if (kthread_status > RCU_KTHREAD_MAX)
+ return '?';
+ return "SRWOY"[kthread_status];
+}
+
static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
{
if (!rdp->beenonline)
@@ -64,7 +76,21 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
rdp->dynticks_fqs);
#endif /* #ifdef CONFIG_NO_HZ */
seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
- seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit);
+ seq_printf(m, " ql=%ld qs=%c%c%c%c kt=%d/%c/%d ktl=%x b=%ld",
+ rdp->qlen,
+ ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
+ rdp->nxttail[RCU_NEXT_TAIL]],
+ ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
+ rdp->nxttail[RCU_NEXT_READY_TAIL]],
+ ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
+ rdp->nxttail[RCU_WAIT_TAIL]],
+ ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]],
+ per_cpu(rcu_cpu_has_work, rdp->cpu),
+ convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
+ rdp->cpu)),
+ per_cpu(rcu_cpu_kthread_cpu, rdp->cpu),
+ per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff,
+ rdp->blimit);
seq_printf(m, " ci=%lu co=%lu ca=%lu\n",
rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
}
@@ -121,7 +147,18 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
rdp->dynticks_fqs);
#endif /* #ifdef CONFIG_NO_HZ */
seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
- seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit);
+ seq_printf(m, ",%ld,\"%c%c%c%c\",%d,\"%c\",%ld", rdp->qlen,
+ ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
+ rdp->nxttail[RCU_NEXT_TAIL]],
+ ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
+ rdp->nxttail[RCU_NEXT_READY_TAIL]],
+ ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
+ rdp->nxttail[RCU_WAIT_TAIL]],
+ ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]],
+ per_cpu(rcu_cpu_has_work, rdp->cpu),
+ convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
+ rdp->cpu)),
+ rdp->blimit);
seq_printf(m, ",%lu,%lu,%lu\n",
rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
}
@@ -157,11 +194,76 @@ static const struct file_operations rcudata_csv_fops = {
.release = single_release,
};
+#ifdef CONFIG_RCU_BOOST
+
+static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
+{
+ seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu "
+ "j=%04x bt=%04x\n",
+ rnp->grplo, rnp->grphi,
+ "T."[list_empty(&rnp->blkd_tasks)],
+ "N."[!rnp->gp_tasks],
+ "E."[!rnp->exp_tasks],
+ "B."[!rnp->boost_tasks],
+ convert_kthread_status(rnp->boost_kthread_status),
+ rnp->n_tasks_boosted, rnp->n_exp_boosts,
+ rnp->n_normal_boosts,
+ (int)(jiffies & 0xffff),
+ (int)(rnp->boost_time & 0xffff));
+ seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
+ " balk",
+ rnp->n_balk_blkd_tasks,
+ rnp->n_balk_exp_gp_tasks,
+ rnp->n_balk_boost_tasks,
+ rnp->n_balk_notblocked,
+ rnp->n_balk_notyet,
+ rnp->n_balk_nos);
+}
+
+static int show_rcu_node_boost(struct seq_file *m, void *unused)
+{
+ struct rcu_node *rnp;
+
+ rcu_for_each_leaf_node(&rcu_preempt_state, rnp)
+ print_one_rcu_node_boost(m, rnp);
+ return 0;
+}
+
+static int rcu_node_boost_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, show_rcu_node_boost, NULL);
+}
+
+static const struct file_operations rcu_node_boost_fops = {
+ .owner = THIS_MODULE,
+ .open = rcu_node_boost_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/*
+ * Create the rcuboost debugfs entry. Standard error return.
+ */
+static int rcu_boost_trace_create_file(struct dentry *rcudir)
+{
+ return !debugfs_create_file("rcuboost", 0444, rcudir, NULL,
+ &rcu_node_boost_fops);
+}
+
+#else /* #ifdef CONFIG_RCU_BOOST */
+
+static int rcu_boost_trace_create_file(struct dentry *rcudir)
+{
+ return 0; /* There cannot be an error if we didn't create it! */
+}
+
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
+
static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
{
unsigned long gpnum;
int level = 0;
- int phase;
struct rcu_node *rnp;
gpnum = rsp->gpnum;
@@ -178,13 +280,11 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
seq_puts(m, "\n");
level = rnp->level;
}
- phase = gpnum & 0x1;
- seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d ",
+ seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ",
rnp->qsmask, rnp->qsmaskinit,
- "T."[list_empty(&rnp->blocked_tasks[phase])],
- "E."[list_empty(&rnp->blocked_tasks[phase + 2])],
- "T."[list_empty(&rnp->blocked_tasks[!phase])],
- "E."[list_empty(&rnp->blocked_tasks[!phase + 2])],
+ ".G"[rnp->gp_tasks != NULL],
+ ".E"[rnp->exp_tasks != NULL],
+ ".T"[!list_empty(&rnp->blkd_tasks)],
rnp->grplo, rnp->grphi, rnp->grpnum);
}
seq_puts(m, "\n");
@@ -216,16 +316,35 @@ static const struct file_operations rcuhier_fops = {
.release = single_release,
};
+static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
+{
+ unsigned long flags;
+ unsigned long completed;
+ unsigned long gpnum;
+ unsigned long gpage;
+ unsigned long gpmax;
+ struct rcu_node *rnp = &rsp->node[0];
+
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ completed = rsp->completed;
+ gpnum = rsp->gpnum;
+ if (rsp->completed == rsp->gpnum)
+ gpage = 0;
+ else
+ gpage = jiffies - rsp->gp_start;
+ gpmax = rsp->gp_max;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n",
+ rsp->name, completed, gpnum, gpage, gpmax);
+}
+
static int show_rcugp(struct seq_file *m, void *unused)
{
#ifdef CONFIG_TREE_PREEMPT_RCU
- seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n",
- rcu_preempt_state.completed, rcu_preempt_state.gpnum);
+ show_one_rcugp(m, &rcu_preempt_state);
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
- seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n",
- rcu_sched_state.completed, rcu_sched_state.gpnum);
- seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n",
- rcu_bh_state.completed, rcu_bh_state.gpnum);
+ show_one_rcugp(m, &rcu_sched_state);
+ show_one_rcugp(m, &rcu_bh_state);
return 0;
}
@@ -298,6 +417,29 @@ static const struct file_operations rcu_pending_fops = {
.release = single_release,
};
+static int show_rcutorture(struct seq_file *m, void *unused)
+{
+ seq_printf(m, "rcutorture test sequence: %lu %s\n",
+ rcutorture_testseq >> 1,
+ (rcutorture_testseq & 0x1) ? "(test in progress)" : "");
+ seq_printf(m, "rcutorture update version number: %lu\n",
+ rcutorture_vernum);
+ return 0;
+}
+
+static int rcutorture_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, show_rcutorture, NULL);
+}
+
+static const struct file_operations rcutorture_fops = {
+ .owner = THIS_MODULE,
+ .open = rcutorture_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
static struct dentry *rcudir;
static int __init rcutree_trace_init(void)
@@ -318,6 +460,9 @@ static int __init rcutree_trace_init(void)
if (!retval)
goto free_out;
+ if (rcu_boost_trace_create_file(rcudir))
+ goto free_out;
+
retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
if (!retval)
goto free_out;
@@ -331,6 +476,11 @@ static int __init rcutree_trace_init(void)
NULL, &rcu_pending_fops);
if (!retval)
goto free_out;
+
+ retval = debugfs_create_file("rcutorture", 0444, rcudir,
+ NULL, &rcutorture_fops);
+ if (!retval)
+ goto free_out;
return 0;
free_out:
debugfs_remove_recursive(rcudir);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 174f976c2874..13960170cad4 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -58,7 +58,7 @@ DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
char *softirq_to_name[NR_SOFTIRQS] = {
"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
- "TASKLET", "SCHED", "HRTIMER", "RCU"
+ "TASKLET", "SCHED", "HRTIMER"
};
/*