diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2011-05-20 03:14:34 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-05-20 03:14:34 +0200 |
commit | eb04f2f04ed1227c266b3219c0aaeda525639718 (patch) | |
tree | 7f224483a3cd0e439cd64a8666ec9dc5ed178a3d /kernel | |
parent | Merge branch 'x86-smep-for-linus' of git://git.kernel.org/pub/scm/linux/kerne... (diff) | |
parent | Revert "rcu: Decrease memory-barrier usage based on semi-formal proof" (diff) | |
download | linux-eb04f2f04ed1227c266b3219c0aaeda525639718.tar.xz linux-eb04f2f04ed1227c266b3219c0aaeda525639718.zip |
Merge branch 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (78 commits)
Revert "rcu: Decrease memory-barrier usage based on semi-formal proof"
net,rcu: convert call_rcu(prl_entry_destroy_rcu) to kfree
batman,rcu: convert call_rcu(softif_neigh_free_rcu) to kfree_rcu
batman,rcu: convert call_rcu(neigh_node_free_rcu) to kfree()
batman,rcu: convert call_rcu(gw_node_free_rcu) to kfree_rcu
net,rcu: convert call_rcu(kfree_tid_tx) to kfree_rcu()
net,rcu: convert call_rcu(xt_osf_finger_free_rcu) to kfree_rcu()
net/mac80211,rcu: convert call_rcu(work_free_rcu) to kfree_rcu()
net,rcu: convert call_rcu(wq_free_rcu) to kfree_rcu()
net,rcu: convert call_rcu(phonet_device_rcu_free) to kfree_rcu()
perf,rcu: convert call_rcu(swevent_hlist_release_rcu) to kfree_rcu()
perf,rcu: convert call_rcu(free_ctx) to kfree_rcu()
net,rcu: convert call_rcu(__nf_ct_ext_free_rcu) to kfree_rcu()
net,rcu: convert call_rcu(net_generic_release) to kfree_rcu()
net,rcu: convert call_rcu(netlbl_unlhsh_free_addr6) to kfree_rcu()
net,rcu: convert call_rcu(netlbl_unlhsh_free_addr4) to kfree_rcu()
security,rcu: convert call_rcu(sel_netif_free) to kfree_rcu()
net,rcu: convert call_rcu(xps_dev_maps_release) to kfree_rcu()
net,rcu: convert call_rcu(xps_map_release) to kfree_rcu()
net,rcu: convert call_rcu(rps_map_release) to kfree_rcu()
...
Diffstat (limited to 'kernel')
mode | file | lines changed |
---|---|---|
-rw-r--r-- | kernel/cgroup.c | 27 |
-rw-r--r-- | kernel/events/core.c | 20 |
-rw-r--r-- | kernel/rcupdate.c | 32 |
-rw-r--r-- | kernel/rcutiny.c | 45 |
-rw-r--r-- | kernel/rcutiny_plugin.h | 203 |
-rw-r--r-- | kernel/rcutorture.c | 26 |
-rw-r--r-- | kernel/rcutree.c | 526 |
-rw-r--r-- | kernel/rcutree.h | 104 |
-rw-r--r-- | kernel/rcutree_plugin.h | 568 |
-rw-r--r-- | kernel/rcutree_trace.c | 180 |
-rw-r--r-- | kernel/softirq.c | 2 |
11 files changed, 1333 insertions, 400 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 25c7eb52de1a..909a35510af5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -326,12 +326,6 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) return &css_set_table[index]; } -static void free_css_set_rcu(struct rcu_head *obj) -{ - struct css_set *cg = container_of(obj, struct css_set, rcu_head); - kfree(cg); -} - /* We don't maintain the lists running through each css_set to its * task until after the first call to cgroup_iter_start(). This * reduces the fork()/exit() overhead for people who have cgroups @@ -375,7 +369,7 @@ static void __put_css_set(struct css_set *cg, int taskexit) } write_unlock(&css_set_lock); - call_rcu(&cg->rcu_head, free_css_set_rcu); + kfree_rcu(cg, rcu_head); } /* @@ -812,13 +806,6 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp) return ret; } -static void free_cgroup_rcu(struct rcu_head *obj) -{ - struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head); - - kfree(cgrp); -} - static void cgroup_diput(struct dentry *dentry, struct inode *inode) { /* is dentry a directory ? 
if so, kfree() associated cgroup */ @@ -856,7 +843,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) */ BUG_ON(!list_empty(&cgrp->pidlists)); - call_rcu(&cgrp->rcu_head, free_cgroup_rcu); + kfree_rcu(cgrp, rcu_head); } iput(inode); } @@ -4623,14 +4610,6 @@ bool css_is_ancestor(struct cgroup_subsys_state *child, return ret; } -static void __free_css_id_cb(struct rcu_head *head) -{ - struct css_id *id; - - id = container_of(head, struct css_id, rcu_head); - kfree(id); -} - void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) { struct css_id *id = css->id; @@ -4645,7 +4624,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) spin_lock(&ss->id_lock); idr_remove(&ss->idr, id->id); spin_unlock(&ss->id_lock); - call_rcu(&id->rcu_head, __free_css_id_cb); + kfree_rcu(id, rcu_head); } EXPORT_SYMBOL_GPL(free_css_id); diff --git a/kernel/events/core.c b/kernel/events/core.c index 0fc34a370ba4..c09767f7db3e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -586,14 +586,6 @@ static void get_ctx(struct perf_event_context *ctx) WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); } -static void free_ctx(struct rcu_head *head) -{ - struct perf_event_context *ctx; - - ctx = container_of(head, struct perf_event_context, rcu_head); - kfree(ctx); -} - static void put_ctx(struct perf_event_context *ctx) { if (atomic_dec_and_test(&ctx->refcount)) { @@ -601,7 +593,7 @@ static void put_ctx(struct perf_event_context *ctx) put_ctx(ctx->parent_ctx); if (ctx->task) put_task_struct(ctx->task); - call_rcu(&ctx->rcu_head, free_ctx); + kfree_rcu(ctx, rcu_head); } } @@ -5331,14 +5323,6 @@ swevent_hlist_deref(struct swevent_htable *swhash) lockdep_is_held(&swhash->hlist_mutex)); } -static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) -{ - struct swevent_hlist *hlist; - - hlist = container_of(rcu_head, struct swevent_hlist, rcu_head); - kfree(hlist); -} - static void swevent_hlist_release(struct 
swevent_htable *swhash) { struct swevent_hlist *hlist = swevent_hlist_deref(swhash); @@ -5347,7 +5331,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash) return; rcu_assign_pointer(swhash->swevent_hlist, NULL); - call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); + kfree_rcu(hlist, rcu_head); } static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index f3240e987928..7784bd216b6a 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -142,10 +142,17 @@ static int rcuhead_fixup_init(void *addr, enum debug_obj_state state) * Ensure that queued callbacks are all executed. * If we detect that we are nested in a RCU read-side critical * section, we should simply fail, otherwise we would deadlock. + * In !PREEMPT configurations, there is no way to tell if we are + * in a RCU read-side critical section or not, so we never + * attempt any fixup and just print a warning. */ +#ifndef CONFIG_PREEMPT + WARN_ON_ONCE(1); + return 0; +#endif if (rcu_preempt_depth() != 0 || preempt_count() != 0 || irqs_disabled()) { - WARN_ON(1); + WARN_ON_ONCE(1); return 0; } rcu_barrier(); @@ -184,10 +191,17 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) * Ensure that queued callbacks are all executed. * If we detect that we are nested in a RCU read-side critical * section, we should simply fail, otherwise we would deadlock. + * In !PREEMPT configurations, there is no way to tell if we are + * in a RCU read-side critical section or not, so we never + * attempt any fixup and just print a warning. */ +#ifndef CONFIG_PREEMPT + WARN_ON_ONCE(1); + return 0; +#endif if (rcu_preempt_depth() != 0 || preempt_count() != 0 || irqs_disabled()) { - WARN_ON(1); + WARN_ON_ONCE(1); return 0; } rcu_barrier(); @@ -214,15 +228,17 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) * Ensure that queued callbacks are all executed. 
* If we detect that we are nested in a RCU read-side critical * section, we should simply fail, otherwise we would deadlock. - * Note that the machinery to reliably determine whether - * or not we are in an RCU read-side critical section - * exists only in the preemptible RCU implementations - * (TINY_PREEMPT_RCU and TREE_PREEMPT_RCU), which is why - * DEBUG_OBJECTS_RCU_HEAD is disallowed if !PREEMPT. + * In !PREEMPT configurations, there is no way to tell if we are + * in a RCU read-side critical section or not, so we never + * attempt any fixup and just print a warning. */ +#ifndef CONFIG_PREEMPT + WARN_ON_ONCE(1); + return 0; +#endif if (rcu_preempt_depth() != 0 || preempt_count() != 0 || irqs_disabled()) { - WARN_ON(1); + WARN_ON_ONCE(1); return 0; } rcu_barrier(); diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 0c343b9a46d5..421abfd3641d 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -40,10 +40,10 @@ static struct task_struct *rcu_kthread_task; static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); static unsigned long have_rcu_kthread_work; -static void invoke_rcu_kthread(void); /* Forward declarations for rcutiny_plugin.h. */ struct rcu_ctrlblk; +static void invoke_rcu_kthread(void); static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); static int rcu_kthread(void *arg); static void __call_rcu(struct rcu_head *head, @@ -79,36 +79,45 @@ void rcu_exit_nohz(void) #endif /* #ifdef CONFIG_NO_HZ */ /* - * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc(). - * Also disable irqs to avoid confusion due to interrupt handlers + * Helper function for rcu_sched_qs() and rcu_bh_qs(). + * Also irqs are disabled to avoid confusion due to interrupt handlers * invoking call_rcu(). 
*/ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) { - unsigned long flags; - - local_irq_save(flags); if (rcp->rcucblist != NULL && rcp->donetail != rcp->curtail) { rcp->donetail = rcp->curtail; - local_irq_restore(flags); return 1; } - local_irq_restore(flags); return 0; } /* + * Wake up rcu_kthread() to process callbacks now eligible for invocation + * or to boost readers. + */ +static void invoke_rcu_kthread(void) +{ + have_rcu_kthread_work = 1; + wake_up(&rcu_kthread_wq); +} + +/* * Record an rcu quiescent state. And an rcu_bh quiescent state while we * are at it, given that any rcu quiescent state is also an rcu_bh * quiescent state. Use "+" instead of "||" to defeat short circuiting. */ void rcu_sched_qs(int cpu) { + unsigned long flags; + + local_irq_save(flags); if (rcu_qsctr_help(&rcu_sched_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk)) invoke_rcu_kthread(); + local_irq_restore(flags); } /* @@ -116,8 +125,12 @@ void rcu_sched_qs(int cpu) */ void rcu_bh_qs(int cpu) { + unsigned long flags; + + local_irq_save(flags); if (rcu_qsctr_help(&rcu_bh_ctrlblk)) invoke_rcu_kthread(); + local_irq_restore(flags); } /* @@ -167,7 +180,7 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) prefetch(next); debug_rcu_head_unqueue(list); local_bh_disable(); - list->func(list); + __rcu_reclaim(list); local_bh_enable(); list = next; RCU_TRACE(cb_count++); @@ -208,20 +221,6 @@ static int rcu_kthread(void *arg) } /* - * Wake up rcu_kthread() to process callbacks now eligible for invocation - * or to boost readers. - */ -static void invoke_rcu_kthread(void) -{ - unsigned long flags; - - local_irq_save(flags); - have_rcu_kthread_work = 1; - wake_up(&rcu_kthread_wq); - local_irq_restore(flags); -} - -/* * Wait for a grace period to elapse. But it is illegal to invoke * synchronize_sched() from within an RCU read-side critical section. 
* Therefore, any legal call to synchronize_sched() is a quiescent diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 3cb8e362e883..f259c676195f 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -100,23 +100,28 @@ struct rcu_preempt_ctrlblk { u8 completed; /* Last grace period completed. */ /* If all three are equal, RCU is idle. */ #ifdef CONFIG_RCU_BOOST - s8 boosted_this_gp; /* Has boosting already happened? */ unsigned long boost_time; /* When to start boosting (jiffies) */ #endif /* #ifdef CONFIG_RCU_BOOST */ #ifdef CONFIG_RCU_TRACE unsigned long n_grace_periods; #ifdef CONFIG_RCU_BOOST unsigned long n_tasks_boosted; + /* Total number of tasks boosted. */ unsigned long n_exp_boosts; + /* Number of tasks boosted for expedited GP. */ unsigned long n_normal_boosts; - unsigned long n_normal_balk_blkd_tasks; - unsigned long n_normal_balk_gp_tasks; - unsigned long n_normal_balk_boost_tasks; - unsigned long n_normal_balk_boosted; - unsigned long n_normal_balk_notyet; - unsigned long n_normal_balk_nos; - unsigned long n_exp_balk_blkd_tasks; - unsigned long n_exp_balk_nos; + /* Number of tasks boosted for normal GP. */ + unsigned long n_balk_blkd_tasks; + /* Refused to boost: no blocked tasks. */ + unsigned long n_balk_exp_gp_tasks; + /* Refused to boost: nothing blocking GP. */ + unsigned long n_balk_boost_tasks; + /* Refused to boost: already boosting. */ + unsigned long n_balk_notyet; + /* Refused to boost: not yet time. */ + unsigned long n_balk_nos; + /* Refused to boost: not sure why, though. */ + /* This can happen due to race conditions. 
*/ #endif /* #ifdef CONFIG_RCU_BOOST */ #endif /* #ifdef CONFIG_RCU_TRACE */ }; @@ -201,7 +206,6 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t) #ifdef CONFIG_RCU_BOOST static void rcu_initiate_boost_trace(void); -static void rcu_initiate_exp_boost_trace(void); #endif /* #ifdef CONFIG_RCU_BOOST */ /* @@ -219,41 +223,21 @@ static void show_tiny_preempt_stats(struct seq_file *m) "N."[!rcu_preempt_ctrlblk.gp_tasks], "E."[!rcu_preempt_ctrlblk.exp_tasks]); #ifdef CONFIG_RCU_BOOST - seq_printf(m, " ttb=%c btg=", - "B."[!rcu_preempt_ctrlblk.boost_tasks]); - switch (rcu_preempt_ctrlblk.boosted_this_gp) { - case -1: - seq_puts(m, "exp"); - break; - case 0: - seq_puts(m, "no"); - break; - case 1: - seq_puts(m, "begun"); - break; - case 2: - seq_puts(m, "done"); - break; - default: - seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp); - } - seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n", + seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n", + " ", + "B."[!rcu_preempt_ctrlblk.boost_tasks], rcu_preempt_ctrlblk.n_tasks_boosted, rcu_preempt_ctrlblk.n_exp_boosts, rcu_preempt_ctrlblk.n_normal_boosts, (int)(jiffies & 0xffff), (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); - seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n", - "normal balk", - rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks, - rcu_preempt_ctrlblk.n_normal_balk_gp_tasks, - rcu_preempt_ctrlblk.n_normal_balk_boost_tasks, - rcu_preempt_ctrlblk.n_normal_balk_boosted, - rcu_preempt_ctrlblk.n_normal_balk_notyet, - rcu_preempt_ctrlblk.n_normal_balk_nos); - seq_printf(m, " exp balk: bt=%lu nos=%lu\n", - rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks, - rcu_preempt_ctrlblk.n_exp_balk_nos); + seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n", + " balk", + rcu_preempt_ctrlblk.n_balk_blkd_tasks, + rcu_preempt_ctrlblk.n_balk_exp_gp_tasks, + rcu_preempt_ctrlblk.n_balk_boost_tasks, + rcu_preempt_ctrlblk.n_balk_notyet, + rcu_preempt_ctrlblk.n_balk_nos); #endif 
/* #ifdef CONFIG_RCU_BOOST */ } @@ -271,25 +255,59 @@ static int rcu_boost(void) { unsigned long flags; struct rt_mutex mtx; - struct list_head *np; struct task_struct *t; + struct list_head *tb; - if (rcu_preempt_ctrlblk.boost_tasks == NULL) + if (rcu_preempt_ctrlblk.boost_tasks == NULL && + rcu_preempt_ctrlblk.exp_tasks == NULL) return 0; /* Nothing to boost. */ + raw_local_irq_save(flags); - rcu_preempt_ctrlblk.boosted_this_gp++; - t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct, - rcu_node_entry); - np = rcu_next_node_entry(t); + + /* + * Recheck with irqs disabled: all tasks in need of boosting + * might exit their RCU read-side critical sections on their own + * if we are preempted just before disabling irqs. + */ + if (rcu_preempt_ctrlblk.boost_tasks == NULL && + rcu_preempt_ctrlblk.exp_tasks == NULL) { + raw_local_irq_restore(flags); + return 0; + } + + /* + * Preferentially boost tasks blocking expedited grace periods. + * This cannot starve the normal grace periods because a second + * expedited grace period must boost all blocked tasks, including + * those blocking the pre-existing normal grace period. + */ + if (rcu_preempt_ctrlblk.exp_tasks != NULL) { + tb = rcu_preempt_ctrlblk.exp_tasks; + RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++); + } else { + tb = rcu_preempt_ctrlblk.boost_tasks; + RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++); + } + RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); + + /* + * We boost task t by manufacturing an rt_mutex that appears to + * be held by task t. We leave a pointer to that rt_mutex where + * task t can find it, and task t will release the mutex when it + * exits its outermost RCU read-side critical section. Then + * simply acquiring this artificial rt_mutex will boost task + * t's priority. (Thanks to tglx for suggesting this approach!) 
+ */ + t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&mtx, t); t->rcu_boost_mutex = &mtx; t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; raw_local_irq_restore(flags); rt_mutex_lock(&mtx); - RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); - rcu_preempt_ctrlblk.boosted_this_gp++; - rt_mutex_unlock(&mtx); - return rcu_preempt_ctrlblk.boost_tasks != NULL; + rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ + + return rcu_preempt_ctrlblk.boost_tasks != NULL || + rcu_preempt_ctrlblk.exp_tasks != NULL; } /* @@ -304,42 +322,25 @@ static int rcu_boost(void) */ static int rcu_initiate_boost(void) { - if (!rcu_preempt_blocked_readers_cgp()) { - RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++); + if (!rcu_preempt_blocked_readers_cgp() && + rcu_preempt_ctrlblk.exp_tasks == NULL) { + RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++); return 0; } - if (rcu_preempt_ctrlblk.gp_tasks != NULL && - rcu_preempt_ctrlblk.boost_tasks == NULL && - rcu_preempt_ctrlblk.boosted_this_gp == 0 && - ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) { - rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; + if (rcu_preempt_ctrlblk.exp_tasks != NULL || + (rcu_preempt_ctrlblk.gp_tasks != NULL && + rcu_preempt_ctrlblk.boost_tasks == NULL && + ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) { + if (rcu_preempt_ctrlblk.exp_tasks == NULL) + rcu_preempt_ctrlblk.boost_tasks = + rcu_preempt_ctrlblk.gp_tasks; invoke_rcu_kthread(); - RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++); } else RCU_TRACE(rcu_initiate_boost_trace()); return 1; } -/* - * Initiate boosting for an expedited grace period. 
- */ -static void rcu_initiate_expedited_boost(void) -{ - unsigned long flags; - - raw_local_irq_save(flags); - if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) { - rcu_preempt_ctrlblk.boost_tasks = - rcu_preempt_ctrlblk.blkd_tasks.next; - rcu_preempt_ctrlblk.boosted_this_gp = -1; - invoke_rcu_kthread(); - RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++); - } else - RCU_TRACE(rcu_initiate_exp_boost_trace()); - raw_local_irq_restore(flags); -} - -#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000); +#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) /* * Do priority-boost accounting for the start of a new grace period. @@ -347,8 +348,6 @@ static void rcu_initiate_expedited_boost(void) static void rcu_preempt_boost_start_gp(void) { rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; - if (rcu_preempt_ctrlblk.boosted_this_gp > 0) - rcu_preempt_ctrlblk.boosted_this_gp = 0; } #else /* #ifdef CONFIG_RCU_BOOST */ @@ -372,13 +371,6 @@ static int rcu_initiate_boost(void) } /* - * If there is no RCU priority boosting, we don't initiate expedited boosting. - */ -static void rcu_initiate_expedited_boost(void) -{ -} - -/* * If there is no RCU priority boosting, nothing to do at grace-period start. */ static void rcu_preempt_boost_start_gp(void) @@ -418,7 +410,7 @@ static void rcu_preempt_cpu_qs(void) if (!rcu_preempt_gp_in_progress()) return; /* - * Check up on boosting. If there are no readers blocking the + * Check up on boosting. If there are readers blocking the * current grace period, leave. 
*/ if (rcu_initiate_boost()) @@ -578,7 +570,7 @@ static void rcu_read_unlock_special(struct task_struct *t) empty = !rcu_preempt_blocked_readers_cgp(); empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; np = rcu_next_node_entry(t); - list_del(&t->rcu_node_entry); + list_del_init(&t->rcu_node_entry); if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) rcu_preempt_ctrlblk.gp_tasks = np; if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) @@ -587,7 +579,6 @@ static void rcu_read_unlock_special(struct task_struct *t) if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) rcu_preempt_ctrlblk.boost_tasks = np; #endif /* #ifdef CONFIG_RCU_BOOST */ - INIT_LIST_HEAD(&t->rcu_node_entry); /* * If this was the last task on the current list, and if @@ -812,13 +803,16 @@ void synchronize_rcu_expedited(void) rpcp->exp_tasks = rpcp->blkd_tasks.next; if (rpcp->exp_tasks == &rpcp->blkd_tasks) rpcp->exp_tasks = NULL; - local_irq_restore(flags); /* Wait for tail of ->blkd_tasks list to drain. */ - if (rcu_preempted_readers_exp()) - rcu_initiate_expedited_boost(); + if (!rcu_preempted_readers_exp()) + local_irq_restore(flags); + else { + rcu_initiate_boost(); + local_irq_restore(flags); wait_event(sync_rcu_preempt_exp_wq, !rcu_preempted_readers_exp()); + } /* Clean up and exit. */ barrier(); /* ensure expedited GP seen before counter increment. 
*/ @@ -931,24 +925,17 @@ void __init rcu_scheduler_starting(void) static void rcu_initiate_boost_trace(void) { - if (rcu_preempt_ctrlblk.gp_tasks == NULL) - rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++; + if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) + rcu_preempt_ctrlblk.n_balk_blkd_tasks++; + else if (rcu_preempt_ctrlblk.gp_tasks == NULL && + rcu_preempt_ctrlblk.exp_tasks == NULL) + rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++; else if (rcu_preempt_ctrlblk.boost_tasks != NULL) - rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++; - else if (rcu_preempt_ctrlblk.boosted_this_gp != 0) - rcu_preempt_ctrlblk.n_normal_balk_boosted++; + rcu_preempt_ctrlblk.n_balk_boost_tasks++; else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) - rcu_preempt_ctrlblk.n_normal_balk_notyet++; - else - rcu_preempt_ctrlblk.n_normal_balk_nos++; -} - -static void rcu_initiate_exp_boost_trace(void) -{ - if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) - rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++; + rcu_preempt_ctrlblk.n_balk_notyet++; else - rcu_preempt_ctrlblk.n_exp_balk_nos++; + rcu_preempt_ctrlblk.n_balk_nos++; } #endif /* #ifdef CONFIG_RCU_BOOST */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index c224da41890c..2e138db03382 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -131,7 +131,7 @@ struct rcu_torture { static LIST_HEAD(rcu_torture_freelist); static struct rcu_torture __rcu *rcu_torture_current; -static long rcu_torture_current_version; +static unsigned long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = @@ -146,8 +146,6 @@ static atomic_t n_rcu_torture_mberror; static atomic_t n_rcu_torture_error; static long n_rcu_torture_boost_ktrerror; static long n_rcu_torture_boost_rterror; -static long n_rcu_torture_boost_allocerror; -static long n_rcu_torture_boost_afferror; static long 
n_rcu_torture_boost_failure; static long n_rcu_torture_boosts; static long n_rcu_torture_timers; @@ -163,11 +161,11 @@ static int stutter_pause_test; #endif int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; -#ifdef CONFIG_RCU_BOOST +#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) #define rcu_can_boost() 1 -#else /* #ifdef CONFIG_RCU_BOOST */ +#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ #define rcu_can_boost() 0 -#endif /* #else #ifdef CONFIG_RCU_BOOST */ +#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ static unsigned long boost_starttime; /* jiffies of next boost test start. */ DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ @@ -751,6 +749,7 @@ static int rcu_torture_boost(void *arg) n_rcu_torture_boost_rterror++; } + init_rcu_head_on_stack(&rbi.rcu); /* Each pass through the following loop does one boost-test cycle. */ do { /* Wait for the next test interval. */ @@ -810,6 +809,7 @@ checkwait: rcu_stutter_wait("rcu_torture_boost"); /* Clean up and exit. 
*/ VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); + destroy_rcu_head_on_stack(&rbi.rcu); rcutorture_shutdown_absorb("rcu_torture_boost"); while (!kthread_should_stop() || rbi.inflight) schedule_timeout_uninterruptible(1); @@ -886,7 +886,7 @@ rcu_torture_writer(void *arg) old_rp->rtort_pipe_count++; cur_ops->deferred_free(old_rp); } - rcu_torture_current_version++; + rcutorture_record_progress(++rcu_torture_current_version); oldbatch = cur_ops->completed(); rcu_stutter_wait("rcu_torture_writer"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); @@ -1066,8 +1066,8 @@ rcu_torture_printk(char *page) } cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); cnt += sprintf(&page[cnt], - "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " - "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld " + "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " + "rtmbe: %d rtbke: %ld rtbre: %ld " "rtbf: %ld rtb: %ld nt: %ld", rcu_torture_current, rcu_torture_current_version, @@ -1078,16 +1078,12 @@ rcu_torture_printk(char *page) atomic_read(&n_rcu_torture_mberror), n_rcu_torture_boost_ktrerror, n_rcu_torture_boost_rterror, - n_rcu_torture_boost_allocerror, - n_rcu_torture_boost_afferror, n_rcu_torture_boost_failure, n_rcu_torture_boosts, n_rcu_torture_timers); if (atomic_read(&n_rcu_torture_mberror) != 0 || n_rcu_torture_boost_ktrerror != 0 || n_rcu_torture_boost_rterror != 0 || - n_rcu_torture_boost_allocerror != 0 || - n_rcu_torture_boost_afferror != 0 || n_rcu_torture_boost_failure != 0) cnt += sprintf(&page[cnt], " !!!"); cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); @@ -1331,6 +1327,7 @@ rcu_torture_cleanup(void) int i; mutex_lock(&fullstop_mutex); + rcutorture_record_test_transition(); if (fullstop == FULLSTOP_SHUTDOWN) { printk(KERN_WARNING /* but going down anyway, so... 
*/ "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); @@ -1486,8 +1483,6 @@ rcu_torture_init(void) atomic_set(&n_rcu_torture_error, 0); n_rcu_torture_boost_ktrerror = 0; n_rcu_torture_boost_rterror = 0; - n_rcu_torture_boost_allocerror = 0; - n_rcu_torture_boost_afferror = 0; n_rcu_torture_boost_failure = 0; n_rcu_torture_boosts = 0; for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) @@ -1624,6 +1619,7 @@ rcu_torture_init(void) } } register_reboot_notifier(&rcutorture_shutdown_nb); + rcutorture_record_test_transition(); mutex_unlock(&fullstop_mutex); return 0; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index dd4aea806f8e..e486f7c3ffb8 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -47,6 +47,8 @@ #include <linux/mutex.h> #include <linux/time.h> #include <linux/kernel_stat.h> +#include <linux/wait.h> +#include <linux/kthread.h> #include "rcutree.h" @@ -79,10 +81,41 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); +static struct rcu_state *rcu_state; + int rcu_scheduler_active __read_mostly; EXPORT_SYMBOL_GPL(rcu_scheduler_active); /* + * Control variables for per-CPU and per-rcu_node kthreads. These + * handle all flavors of RCU. + */ +static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); +DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu); +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); +static DEFINE_PER_CPU(wait_queue_head_t, rcu_cpu_wq); +DEFINE_PER_CPU(char, rcu_cpu_has_work); +static char rcu_kthreads_spawnable; + +static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); +static void invoke_rcu_cpu_kthread(void); + +#define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */ + +/* + * Track the rcutorture test sequence number and the update version + * number within a given test. 
The rcutorture_testseq is incremented + * on every rcutorture module load and unload, so has an odd value + * when a test is running. The rcutorture_vernum is set to zero + * when rcutorture starts and is incremented on each rcutorture update. + * These variables enable correlating rcutorture output with the + * RCU tracing information. + */ +unsigned long rcutorture_testseq; +unsigned long rcutorture_vernum; + +/* * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s * permit this function to be invoked without holding the root rcu_node * structure's ->lock, but of course results can be subject to change. @@ -124,6 +157,7 @@ void rcu_note_context_switch(int cpu) rcu_sched_qs(cpu); rcu_preempt_note_context_switch(cpu); } +EXPORT_SYMBOL_GPL(rcu_note_context_switch); #ifdef CONFIG_NO_HZ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { @@ -140,10 +174,8 @@ module_param(blimit, int, 0); module_param(qhimark, int, 0); module_param(qlowmark, int, 0); -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR -int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT; +int rcu_cpu_stall_suppress __read_mostly; module_param(rcu_cpu_stall_suppress, int, 0644); -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ static void force_quiescent_state(struct rcu_state *rsp, int relaxed); static int rcu_pending(int cpu); @@ -176,6 +208,31 @@ void rcu_bh_force_quiescent_state(void) EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); /* + * Record the number of times rcutorture tests have been initiated and + * terminated. This information allows the debugfs tracing stats to be + * correlated to the rcutorture messages, even when the rcutorture module + * is being repeatedly loaded and unloaded. In other words, we cannot + * store this state in rcutorture itself. 
+ */ +void rcutorture_record_test_transition(void) +{ + rcutorture_testseq++; + rcutorture_vernum = 0; +} +EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); + +/* + * Record the number of writer passes through the current rcutorture test. + * This is also used to correlate debugfs tracing stats with the rcutorture + * messages. + */ +void rcutorture_record_progress(unsigned long vernum) +{ + rcutorture_vernum++; +} +EXPORT_SYMBOL_GPL(rcutorture_record_progress); + +/* * Force a quiescent state for RCU-sched. */ void rcu_sched_force_quiescent_state(void) @@ -234,8 +291,8 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) return 1; } - /* If preemptable RCU, no point in sending reschedule IPI. */ - if (rdp->preemptable) + /* If preemptible RCU, no point in sending reschedule IPI. */ + if (rdp->preemptible) return 0; /* The CPU is online, so send it a reschedule IPI. */ @@ -450,8 +507,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) #endif /* #else #ifdef CONFIG_NO_HZ */ -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR - int rcu_cpu_stall_suppress __read_mostly; static void record_gp_stall_check_time(struct rcu_state *rsp) @@ -537,21 +592,24 @@ static void print_cpu_stall(struct rcu_state *rsp) static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) { - long delta; + unsigned long j; + unsigned long js; struct rcu_node *rnp; if (rcu_cpu_stall_suppress) return; - delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); + j = ACCESS_ONCE(jiffies); + js = ACCESS_ONCE(rsp->jiffies_stall); rnp = rdp->mynode; - if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) { + if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(rsp); - } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) { + } else if (rcu_gp_in_progress(rsp) && + ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { - /* They had two time units to dump stack, so complain. 
*/ + /* They had a few time units to dump stack, so complain. */ print_other_cpu_stall(rsp); } } @@ -587,26 +645,6 @@ static void __init check_cpu_stall_init(void) atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); } -#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - -static void record_gp_stall_check_time(struct rcu_state *rsp) -{ -} - -static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) -{ -} - -void rcu_cpu_stall_reset(void) -{ -} - -static void __init check_cpu_stall_init(void) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - /* * Update CPU-local rcu_data state to record the newly noticed grace period. * This is used both when we started the grace period and when we notice @@ -809,6 +847,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) rnp->completed = rsp->completed; rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ rcu_start_gp_per_cpu(rsp, rnp, rdp); + rcu_preempt_boost_start_gp(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } @@ -844,6 +883,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) rnp->completed = rsp->completed; if (rnp == rdp->mynode) rcu_start_gp_per_cpu(rsp, rnp, rdp); + rcu_preempt_boost_start_gp(rnp); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } @@ -864,7 +904,12 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { + unsigned long gp_duration; + WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); + gp_duration = jiffies - rsp->gp_start; + if (gp_duration > rsp->gp_max) + rsp->gp_max = gp_duration; rsp->completed = rsp->gpnum; rsp->signaled = RCU_GP_IDLE; rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. 
*/ @@ -894,7 +939,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, return; } rnp->qsmask &= ~mask; - if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { + if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { /* Other bits still set at this level, so done. */ raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -1037,6 +1082,8 @@ static void rcu_send_cbs_to_online(struct rcu_state *rsp) /* * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy * and move all callbacks from the outgoing CPU to the current one. + * There can only be one CPU hotplug operation at a time, so no other + * CPU can be attempting to update rcu_cpu_kthread_task. */ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) { @@ -1045,6 +1092,14 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) int need_report = 0; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp; + struct task_struct *t; + + /* Stop the CPU's kthread. */ + t = per_cpu(rcu_cpu_kthread_task, cpu); + if (t != NULL) { + per_cpu(rcu_cpu_kthread_task, cpu) = NULL; + kthread_stop(t); + } /* Exclude any attempts to start a new grace period. */ raw_spin_lock_irqsave(&rsp->onofflock, flags); @@ -1082,6 +1137,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); if (need_report & RCU_OFL_TASKS_EXP_GP) rcu_report_exp_rnp(rsp, rnp); + rcu_node_kthread_setaffinity(rnp, -1); } /* @@ -1143,7 +1199,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) next = list->next; prefetch(next); debug_rcu_head_unqueue(list); - list->func(list); + __rcu_reclaim(list); list = next; if (++count >= rdp->blimit) break; @@ -1179,7 +1235,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* Re-raise the RCU softirq if there are callbacks remaining. 
*/ if (cpu_has_callbacks_ready_to_invoke(rdp)) - raise_softirq(RCU_SOFTIRQ); + invoke_rcu_cpu_kthread(); } /* @@ -1225,7 +1281,7 @@ void rcu_check_callbacks(int cpu, int user) } rcu_preempt_check_callbacks(cpu); if (rcu_pending(cpu)) - raise_softirq(RCU_SOFTIRQ); + invoke_rcu_cpu_kthread(); } #ifdef CONFIG_SMP @@ -1233,6 +1289,8 @@ void rcu_check_callbacks(int cpu, int user) /* * Scan the leaf rcu_node structures, processing dyntick state for any that * have not yet encountered a quiescent state, using the function specified. + * Also initiate boosting for any threads blocked on the root rcu_node. + * * The caller must have suppressed start of new grace periods. */ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) @@ -1251,7 +1309,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) return; } if (rnp->qsmask == 0) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ continue; } cpu = rnp->grplo; @@ -1269,6 +1327,11 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) } raw_spin_unlock_irqrestore(&rnp->lock, flags); } + rnp = rcu_get_root(rsp); + if (rnp->qsmask == 0) { + raw_spin_lock_irqsave(&rnp->lock, flags); + rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ + } } /* @@ -1389,7 +1452,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) /* * Do softirq processing for the current CPU. */ -static void rcu_process_callbacks(struct softirq_action *unused) +static void rcu_process_callbacks(void) { /* * Memory references from any prior RCU read-side critical sections @@ -1414,6 +1477,347 @@ static void rcu_process_callbacks(struct softirq_action *unused) rcu_needs_cpu_flush(); } +/* + * Wake up the current CPU's kthread. This replaces raise_softirq() + * in earlier versions of RCU. 
Note that because we are running on + * the current CPU with interrupts disabled, the rcu_cpu_kthread_task + * cannot disappear out from under us. + */ +static void invoke_rcu_cpu_kthread(void) +{ + unsigned long flags; + + local_irq_save(flags); + __this_cpu_write(rcu_cpu_has_work, 1); + if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) { + local_irq_restore(flags); + return; + } + wake_up(&__get_cpu_var(rcu_cpu_wq)); + local_irq_restore(flags); +} + +/* + * Wake up the specified per-rcu_node-structure kthread. + * Because the per-rcu_node kthreads are immortal, we don't need + * to do anything to keep them alive. + */ +static void invoke_rcu_node_kthread(struct rcu_node *rnp) +{ + struct task_struct *t; + + t = rnp->node_kthread_task; + if (t != NULL) + wake_up_process(t); +} + +/* + * Set the specified CPU's kthread to run RT or not, as specified by + * the to_rt argument. The CPU-hotplug locks are held, so the task + * is not going away. + */ +static void rcu_cpu_kthread_setrt(int cpu, int to_rt) +{ + int policy; + struct sched_param sp; + struct task_struct *t; + + t = per_cpu(rcu_cpu_kthread_task, cpu); + if (t == NULL) + return; + if (to_rt) { + policy = SCHED_FIFO; + sp.sched_priority = RCU_KTHREAD_PRIO; + } else { + policy = SCHED_NORMAL; + sp.sched_priority = 0; + } + sched_setscheduler_nocheck(t, policy, &sp); +} + +/* + * Timer handler to initiate the waking up of per-CPU kthreads that + * have yielded the CPU due to excess numbers of RCU callbacks. + * We wake up the per-rcu_node kthread, which in turn will wake up + * the booster kthread. 
+ */ +static void rcu_cpu_kthread_timer(unsigned long arg) +{ + unsigned long flags; + struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg); + struct rcu_node *rnp = rdp->mynode; + + raw_spin_lock_irqsave(&rnp->lock, flags); + rnp->wakemask |= rdp->grpmask; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + invoke_rcu_node_kthread(rnp); +} + +/* + * Drop to non-real-time priority and yield, but only after posting a + * timer that will cause us to regain our real-time priority if we + * remain preempted. Either way, we restore our real-time priority + * before returning. + */ +static void rcu_yield(void (*f)(unsigned long), unsigned long arg) +{ + struct sched_param sp; + struct timer_list yield_timer; + + setup_timer_on_stack(&yield_timer, f, arg); + mod_timer(&yield_timer, jiffies + 2); + sp.sched_priority = 0; + sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); + set_user_nice(current, 19); + schedule(); + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + del_timer(&yield_timer); +} + +/* + * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU. + * This can happen while the corresponding CPU is either coming online + * or going offline. We cannot wait until the CPU is fully online + * before starting the kthread, because the various notifier functions + * can wait for RCU grace periods. So we park rcu_cpu_kthread() until + * the corresponding CPU is online. + * + * Return 1 if the kthread needs to stop, 0 otherwise. + * + * Caller must disable bh. This function can momentarily enable it. 
+ */ +static int rcu_cpu_kthread_should_stop(int cpu) +{ + while (cpu_is_offline(cpu) || + !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) || + smp_processor_id() != cpu) { + if (kthread_should_stop()) + return 1; + per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; + per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id(); + local_bh_enable(); + schedule_timeout_uninterruptible(1); + if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu))) + set_cpus_allowed_ptr(current, cpumask_of(cpu)); + local_bh_disable(); + } + per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; + return 0; +} + +/* + * Per-CPU kernel thread that invokes RCU callbacks. This replaces the + * earlier RCU softirq. + */ +static int rcu_cpu_kthread(void *arg) +{ + int cpu = (int)(long)arg; + unsigned long flags; + int spincnt = 0; + unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu); + wait_queue_head_t *wqp = &per_cpu(rcu_cpu_wq, cpu); + char work; + char *workp = &per_cpu(rcu_cpu_has_work, cpu); + + for (;;) { + *statusp = RCU_KTHREAD_WAITING; + wait_event_interruptible(*wqp, + *workp != 0 || kthread_should_stop()); + local_bh_disable(); + if (rcu_cpu_kthread_should_stop(cpu)) { + local_bh_enable(); + break; + } + *statusp = RCU_KTHREAD_RUNNING; + per_cpu(rcu_cpu_kthread_loops, cpu)++; + local_irq_save(flags); + work = *workp; + *workp = 0; + local_irq_restore(flags); + if (work) + rcu_process_callbacks(); + local_bh_enable(); + if (*workp != 0) + spincnt++; + else + spincnt = 0; + if (spincnt > 10) { + *statusp = RCU_KTHREAD_YIELDING; + rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); + spincnt = 0; + } + } + *statusp = RCU_KTHREAD_STOPPED; + return 0; +} + +/* + * Spawn a per-CPU kthread, setting up affinity and priority. + * Because the CPU hotplug lock is held, no other CPU will be attempting + * to manipulate rcu_cpu_kthread_task. There might be another CPU + * attempting to access it during boot, but the locking in kthread_bind() + * will enforce sufficient ordering. 
+ */ +static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) +{ + struct sched_param sp; + struct task_struct *t; + + if (!rcu_kthreads_spawnable || + per_cpu(rcu_cpu_kthread_task, cpu) != NULL) + return 0; + t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu); + if (IS_ERR(t)) + return PTR_ERR(t); + kthread_bind(t, cpu); + per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; + WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL); + per_cpu(rcu_cpu_kthread_task, cpu) = t; + wake_up_process(t); + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + return 0; +} + +/* + * Per-rcu_node kthread, which is in charge of waking up the per-CPU + * kthreads when needed. We ignore requests to wake up kthreads + * for offline CPUs, which is OK because force_quiescent_state() + * takes care of this case. + */ +static int rcu_node_kthread(void *arg) +{ + int cpu; + unsigned long flags; + unsigned long mask; + struct rcu_node *rnp = (struct rcu_node *)arg; + struct sched_param sp; + struct task_struct *t; + + for (;;) { + rnp->node_kthread_status = RCU_KTHREAD_WAITING; + wait_event_interruptible(rnp->node_wq, rnp->wakemask != 0); + rnp->node_kthread_status = RCU_KTHREAD_RUNNING; + raw_spin_lock_irqsave(&rnp->lock, flags); + mask = rnp->wakemask; + rnp->wakemask = 0; + rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) { + if ((mask & 0x1) == 0) + continue; + preempt_disable(); + t = per_cpu(rcu_cpu_kthread_task, cpu); + if (!cpu_online(cpu) || t == NULL) { + preempt_enable(); + continue; + } + per_cpu(rcu_cpu_has_work, cpu) = 1; + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + preempt_enable(); + } + } + /* NOTREACHED */ + rnp->node_kthread_status = RCU_KTHREAD_STOPPED; + return 0; +} + +/* + * Set the per-rcu_node kthread's affinity to cover all CPUs that are + * served by the rcu_node in question. 
The CPU hotplug lock is still + * held, so the value of rnp->qsmaskinit will be stable. + * + * We don't include outgoingcpu in the affinity set, use -1 if there is + * no outgoing CPU. If there are no CPUs left in the affinity set, + * this function allows the kthread to execute on any CPU. + */ +static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) +{ + cpumask_var_t cm; + int cpu; + unsigned long mask = rnp->qsmaskinit; + + if (rnp->node_kthread_task == NULL) + return; + if (!alloc_cpumask_var(&cm, GFP_KERNEL)) + return; + cpumask_clear(cm); + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) + if ((mask & 0x1) && cpu != outgoingcpu) + cpumask_set_cpu(cpu, cm); + if (cpumask_weight(cm) == 0) { + cpumask_setall(cm); + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) + cpumask_clear_cpu(cpu, cm); + WARN_ON_ONCE(cpumask_weight(cm) == 0); + } + set_cpus_allowed_ptr(rnp->node_kthread_task, cm); + rcu_boost_kthread_setaffinity(rnp, cm); + free_cpumask_var(cm); +} + +/* + * Spawn a per-rcu_node kthread, setting priority and affinity. + * Called during boot before online/offline can happen, or, if + * during runtime, with the main CPU-hotplug locks held. So only + * one of these can be executing at a time. 
+ */ +static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, + struct rcu_node *rnp) +{ + unsigned long flags; + int rnp_index = rnp - &rsp->node[0]; + struct sched_param sp; + struct task_struct *t; + + if (!rcu_kthreads_spawnable || + rnp->qsmaskinit == 0) + return 0; + if (rnp->node_kthread_task == NULL) { + t = kthread_create(rcu_node_kthread, (void *)rnp, + "rcun%d", rnp_index); + if (IS_ERR(t)) + return PTR_ERR(t); + raw_spin_lock_irqsave(&rnp->lock, flags); + rnp->node_kthread_task = t; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + wake_up_process(t); + sp.sched_priority = 99; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + } + return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index); +} + +/* + * Spawn all kthreads -- called as soon as the scheduler is running. + */ +static int __init rcu_spawn_kthreads(void) +{ + int cpu; + struct rcu_node *rnp; + + rcu_kthreads_spawnable = 1; + for_each_possible_cpu(cpu) { + init_waitqueue_head(&per_cpu(rcu_cpu_wq, cpu)); + per_cpu(rcu_cpu_has_work, cpu) = 0; + if (cpu_online(cpu)) + (void)rcu_spawn_one_cpu_kthread(cpu); + } + rnp = rcu_get_root(rcu_state); + init_waitqueue_head(&rnp->node_wq); + rcu_init_boost_waitqueue(rnp); + (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + if (NUM_RCU_NODES > 1) + rcu_for_each_leaf_node(rcu_state, rnp) { + init_waitqueue_head(&rnp->node_wq); + rcu_init_boost_waitqueue(rnp); + (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + } + return 0; +} +early_initcall(rcu_spawn_kthreads); + static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), struct rcu_state *rsp) @@ -1439,6 +1843,13 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), /* Add the callback to our list. */ *rdp->nxttail[RCU_NEXT_TAIL] = head; rdp->nxttail[RCU_NEXT_TAIL] = &head->next; + rdp->qlen++; + + /* If interrupts were disabled, don't dive into RCU core. 
*/ + if (irqs_disabled_flags(flags)) { + local_irq_restore(flags); + return; + } /* * Force the grace period if too many callbacks or too long waiting. @@ -1447,7 +1858,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), * invoking force_quiescent_state() if the newly enqueued callback * is the only one waiting for a grace period to complete. */ - if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { + if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { /* Are we ignoring a completed grace period? */ rcu_process_gp_end(rsp, rdp); @@ -1583,7 +1994,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) * or RCU-bh, force a local reschedule. */ rdp->n_rp_qs_pending++; - if (!rdp->preemptable && + if (!rdp->preemptible && ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, jiffies)) set_need_resched(); @@ -1760,7 +2171,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) * that this CPU cannot possibly have any RCU callbacks in flight yet. */ static void __cpuinit -rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) +rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) { unsigned long flags; unsigned long mask; @@ -1772,7 +2183,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) rdp->passed_quiesc = 0; /* We could be racing with new GP, */ rdp->qs_pending = 1; /* so set up to respond to current GP. */ rdp->beenonline = 1; /* We have now been online. */ - rdp->preemptable = preemptable; + rdp->preemptible = preemptible; rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; @@ -1813,6 +2224,19 @@ static void __cpuinit rcu_online_cpu(int cpu) rcu_preempt_init_percpu_data(cpu); } +static void __cpuinit rcu_online_kthreads(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); + struct rcu_node *rnp = rdp->mynode; + + /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. 
*/ + if (rcu_kthreads_spawnable) { + (void)rcu_spawn_one_cpu_kthread(cpu); + if (rnp->node_kthread_task == NULL) + (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + } +} + /* * Handle CPU online/offline notification events. */ @@ -1820,11 +2244,23 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; + struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); + struct rcu_node *rnp = rdp->mynode; switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: rcu_online_cpu(cpu); + rcu_online_kthreads(cpu); + break; + case CPU_ONLINE: + case CPU_DOWN_FAILED: + rcu_node_kthread_setaffinity(rnp, -1); + rcu_cpu_kthread_setrt(cpu, 1); + break; + case CPU_DOWN_PREPARE: + rcu_node_kthread_setaffinity(rnp, cpu); + rcu_cpu_kthread_setrt(cpu, 0); break; case CPU_DYING: case CPU_DYING_FROZEN: @@ -1943,10 +2379,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, j / rsp->levelspread[i - 1]; } rnp->level = i; - INIT_LIST_HEAD(&rnp->blocked_tasks[0]); - INIT_LIST_HEAD(&rnp->blocked_tasks[1]); - INIT_LIST_HEAD(&rnp->blocked_tasks[2]); - INIT_LIST_HEAD(&rnp->blocked_tasks[3]); + INIT_LIST_HEAD(&rnp->blkd_tasks); } } @@ -1968,7 +2401,6 @@ void __init rcu_init(void) rcu_init_one(&rcu_sched_state, &rcu_sched_data); rcu_init_one(&rcu_bh_state, &rcu_bh_data); __rcu_init_preempt(); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); /* * We don't need protection against CPU-hotplug here because diff --git a/kernel/rcutree.h b/kernel/rcutree.h index e8f057e44e3e..257664815d5d 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -91,6 +91,14 @@ struct rcu_dynticks { /* remains even for nmi from irq handler. */ }; +/* RCU's kthread states for tracing. 
*/ +#define RCU_KTHREAD_STOPPED 0 +#define RCU_KTHREAD_RUNNING 1 +#define RCU_KTHREAD_WAITING 2 +#define RCU_KTHREAD_OFFCPU 3 +#define RCU_KTHREAD_YIELDING 4 +#define RCU_KTHREAD_MAX 4 + /* * Definition for node within the RCU grace-period-detection hierarchy. */ @@ -109,10 +117,11 @@ struct rcu_node { /* an rcu_data structure, otherwise, each */ /* bit corresponds to a child rcu_node */ /* structure. */ - unsigned long expmask; /* Groups that have ->blocked_tasks[] */ + unsigned long expmask; /* Groups that have ->blkd_tasks */ /* elements that need to drain to allow the */ /* current expedited grace period to */ /* complete (only for TREE_PREEMPT_RCU). */ + unsigned long wakemask; /* CPUs whose kthread needs to be awakened. */ unsigned long qsmaskinit; /* Per-GP initial value for qsmask & expmask. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ @@ -122,11 +131,68 @@ struct rcu_node { u8 grpnum; /* CPU/group number for next level up. */ u8 level; /* root is at level 0. */ struct rcu_node *parent; - struct list_head blocked_tasks[4]; - /* Tasks blocked in RCU read-side critsect. */ - /* Grace period number (->gpnum) x blocked */ - /* by tasks on the (x & 0x1) element of the */ - /* blocked_tasks[] array. */ + struct list_head blkd_tasks; + /* Tasks blocked in RCU read-side critical */ + /* section. Tasks are placed at the head */ + /* of this list and age towards the tail. */ + struct list_head *gp_tasks; + /* Pointer to the first task blocking the */ + /* current grace period, or NULL if there */ + /* is no such task. */ + struct list_head *exp_tasks; + /* Pointer to the first task blocking the */ + /* current expedited grace period, or NULL */ + /* if there is no such task. If there */ + /* is no current expedited grace period, */ + /* then there can cannot be any such task. 
*/ +#ifdef CONFIG_RCU_BOOST + struct list_head *boost_tasks; + /* Pointer to first task that needs to be */ + /* priority boosted, or NULL if no priority */ + /* boosting is needed for this rcu_node */ + /* structure. If there are no tasks */ + /* queued on this rcu_node structure that */ + /* are blocking the current grace period, */ + /* there can be no such task. */ + unsigned long boost_time; + /* When to start boosting (jiffies). */ + struct task_struct *boost_kthread_task; + /* kthread that takes care of priority */ + /* boosting for this rcu_node structure. */ + wait_queue_head_t boost_wq; + /* Wait queue on which to park the boost */ + /* kthread. */ + unsigned int boost_kthread_status; + /* State of boost_kthread_task for tracing. */ + unsigned long n_tasks_boosted; + /* Total number of tasks boosted. */ + unsigned long n_exp_boosts; + /* Number of tasks boosted for expedited GP. */ + unsigned long n_normal_boosts; + /* Number of tasks boosted for normal GP. */ + unsigned long n_balk_blkd_tasks; + /* Refused to boost: no blocked tasks. */ + unsigned long n_balk_exp_gp_tasks; + /* Refused to boost: nothing blocking GP. */ + unsigned long n_balk_boost_tasks; + /* Refused to boost: already boosting. */ + unsigned long n_balk_notblocked; + /* Refused to boost: RCU RS CS still running. */ + unsigned long n_balk_notyet; + /* Refused to boost: not yet time. */ + unsigned long n_balk_nos; + /* Refused to boost: not sure why, though. */ + /* This can happen due to race conditions. */ +#endif /* #ifdef CONFIG_RCU_BOOST */ + struct task_struct *node_kthread_task; + /* kthread that takes care of this rcu_node */ + /* structure, for example, awakening the */ + /* per-CPU kthreads as needed. */ + wait_queue_head_t node_wq; + /* Wait queue on which to park the per-node */ + /* kthread. */ + unsigned int node_kthread_status; + /* State of node_kthread_task for tracing. 
*/ } ____cacheline_internodealigned_in_smp; /* @@ -175,7 +241,7 @@ struct rcu_data { bool passed_quiesc; /* User-mode/idle loop etc. */ bool qs_pending; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ - bool preemptable; /* Preemptable RCU? */ + bool preemptible; /* Preemptible RCU? */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ @@ -254,7 +320,6 @@ struct rcu_data { #endif /* #else #ifdef CONFIG_NO_HZ */ #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR #ifdef CONFIG_PROVE_RCU #define RCU_STALL_DELAY_DELTA (5 * HZ) @@ -272,13 +337,6 @@ struct rcu_data { /* scheduling clock irq */ /* before ratting on them. */ -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE -#define RCU_CPU_STALL_SUPPRESS_INIT 0 -#else -#define RCU_CPU_STALL_SUPPRESS_INIT 1 -#endif - -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* * RCU global state, including node hierarchy. This hierarchy is @@ -325,12 +383,12 @@ struct rcu_state { /* due to lock unavailable. */ unsigned long n_force_qs_ngp; /* Number of calls leaving */ /* due to no GP active. */ -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR unsigned long gp_start; /* Time at which GP started, */ /* but in jiffies. */ unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + unsigned long gp_max; /* Maximum GP duration in */ + /* jiffies. */ char *name; /* Name of structure. 
*/ }; @@ -361,16 +419,14 @@ DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); static void rcu_bootup_announce(void); long rcu_batches_completed(void); static void rcu_preempt_note_context_switch(int cpu); -static int rcu_preempted_readers(struct rcu_node *rnp); +static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR static void rcu_print_detail_task_stall(struct rcu_state *rsp); static void rcu_print_task_stall(struct rcu_node *rnp); static void rcu_preempt_stall_reset(void); -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static int rcu_preempt_offline_tasks(struct rcu_state *rsp, @@ -390,5 +446,13 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu); static void rcu_preempt_send_cbs_to_online(void); static void __init __rcu_init_preempt(void); static void rcu_needs_cpu_flush(void); +static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp); +static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, + cpumask_var_t cm); +static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); +static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, + struct rcu_node *rnp, + int rnp_index); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index a3638710dc67..3f6559a5f5cd 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1,7 +1,7 @@ /* * Read-Copy Update mechanism for mutual exclusion (tree-based version) * Internal non-public definitions that provide either classic - * or preemptable semantics. + * or preemptible semantics. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -54,10 +54,6 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); #endif -#ifndef CONFIG_RCU_CPU_STALL_DETECTOR - printk(KERN_INFO - "\tRCU-based detection of stalled CPUs is disabled.\n"); -#endif #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); #endif @@ -70,6 +66,7 @@ static void __init rcu_bootup_announce_oddness(void) struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); +static struct rcu_state *rcu_state = &rcu_preempt_state; static int rcu_preempted_readers_exp(struct rcu_node *rnp); @@ -78,7 +75,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp); */ static void __init rcu_bootup_announce(void) { - printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n"); + printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n"); rcu_bootup_announce_oddness(); } @@ -111,7 +108,7 @@ void rcu_force_quiescent_state(void) EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); /* - * Record a preemptable-RCU quiescent state for the specified CPU. Note + * Record a preemptible-RCU quiescent state for the specified CPU. Note * that this just means that the task currently running on the CPU is * not in a quiescent state. There might be any number of tasks blocked * while in an RCU read-side critical section. @@ -134,12 +131,12 @@ static void rcu_preempt_qs(int cpu) * We have entered the scheduler, and the current task might soon be * context-switched away from. 
If this task is in an RCU read-side * critical section, we will no longer be able to rely on the CPU to - * record that fact, so we enqueue the task on the appropriate entry - * of the blocked_tasks[] array. The task will dequeue itself when - * it exits the outermost enclosing RCU read-side critical section. - * Therefore, the current grace period cannot be permitted to complete - * until the blocked_tasks[] entry indexed by the low-order bit of - * rnp->gpnum empties. + * record that fact, so we enqueue the task on the blkd_tasks list. + * The task will dequeue itself when it exits the outermost enclosing + * RCU read-side critical section. Therefore, the current grace period + * cannot be permitted to complete until the blkd_tasks list entries + * predating the current grace period drain, in other words, until + * rnp->gp_tasks becomes NULL. * * Caller must disable preemption. */ @@ -147,7 +144,6 @@ static void rcu_preempt_note_context_switch(int cpu) { struct task_struct *t = current; unsigned long flags; - int phase; struct rcu_data *rdp; struct rcu_node *rnp; @@ -169,15 +165,30 @@ static void rcu_preempt_note_context_switch(int cpu) * (i.e., this CPU has not yet passed through a quiescent * state for the current grace period), then as long * as that task remains queued, the current grace period - * cannot end. + * cannot end. Note that there is some uncertainty as + * to exactly when the current grace period started. + * We take a conservative approach, which can result + * in unnecessarily waiting on tasks that started very + * slightly after the current grace period began. C'est + * la vie!!! * * But first, note that the current CPU must still be * on line! 
*/ WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); - phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; - list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); + if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { + list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); + rnp->gp_tasks = &t->rcu_node_entry; +#ifdef CONFIG_RCU_BOOST + if (rnp->boost_tasks != NULL) + rnp->boost_tasks = rnp->gp_tasks; +#endif /* #ifdef CONFIG_RCU_BOOST */ + } else { + list_add(&t->rcu_node_entry, &rnp->blkd_tasks); + if (rnp->qsmask & rdp->grpmask) + rnp->gp_tasks = &t->rcu_node_entry; + } raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -196,7 +207,7 @@ static void rcu_preempt_note_context_switch(int cpu) } /* - * Tree-preemptable RCU implementation for rcu_read_lock(). + * Tree-preemptible RCU implementation for rcu_read_lock(). * Just increment ->rcu_read_lock_nesting, shared state will be updated * if we block. */ @@ -212,12 +223,9 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock); * for the specified rcu_node structure. If the caller needs a reliable * answer, it must hold the rcu_node's ->lock. */ -static int rcu_preempted_readers(struct rcu_node *rnp) +static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) { - int phase = rnp->gpnum & 0x1; - - return !list_empty(&rnp->blocked_tasks[phase]) || - !list_empty(&rnp->blocked_tasks[phase + 2]); + return rnp->gp_tasks != NULL; } /* @@ -233,7 +241,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) unsigned long mask; struct rcu_node *rnp_p; - if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { + if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; /* Still need more quiescent states! 
*/ } @@ -257,6 +265,21 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) } /* + * Advance a ->blkd_tasks-list pointer to the next entry, instead + * returning NULL if at the end of the list. + */ +static struct list_head *rcu_next_node_entry(struct task_struct *t, + struct rcu_node *rnp) +{ + struct list_head *np; + + np = t->rcu_node_entry.next; + if (np == &rnp->blkd_tasks) + np = NULL; + return np; +} + +/* * Handle special cases during rcu_read_unlock(), such as needing to * notify RCU core processing or task having blocked during the RCU * read-side critical section. @@ -266,6 +289,7 @@ static void rcu_read_unlock_special(struct task_struct *t) int empty; int empty_exp; unsigned long flags; + struct list_head *np; struct rcu_node *rnp; int special; @@ -306,10 +330,19 @@ static void rcu_read_unlock_special(struct task_struct *t) break; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } - empty = !rcu_preempted_readers(rnp); + empty = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = !rcu_preempted_readers_exp(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ + np = rcu_next_node_entry(t, rnp); list_del_init(&t->rcu_node_entry); + if (&t->rcu_node_entry == rnp->gp_tasks) + rnp->gp_tasks = np; + if (&t->rcu_node_entry == rnp->exp_tasks) + rnp->exp_tasks = np; +#ifdef CONFIG_RCU_BOOST + if (&t->rcu_node_entry == rnp->boost_tasks) + rnp->boost_tasks = np; +#endif /* #ifdef CONFIG_RCU_BOOST */ t->rcu_blocked_node = NULL; /* @@ -322,6 +355,15 @@ static void rcu_read_unlock_special(struct task_struct *t) else rcu_report_unblock_qs_rnp(rnp, flags); +#ifdef CONFIG_RCU_BOOST + /* Unboost if we were boosted. 
*/ + if (special & RCU_READ_UNLOCK_BOOSTED) { + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED; + rt_mutex_unlock(t->rcu_boost_mutex); + t->rcu_boost_mutex = NULL; + } +#endif /* #ifdef CONFIG_RCU_BOOST */ + /* * If this was the last task on the expedited lists, * then we need to report up the rcu_node hierarchy. @@ -334,7 +376,7 @@ static void rcu_read_unlock_special(struct task_struct *t) } /* - * Tree-preemptable RCU implementation for rcu_read_unlock(). + * Tree-preemptible RCU implementation for rcu_read_unlock(). * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then * invoke rcu_read_unlock_special() to clean up after a context switch @@ -356,8 +398,6 @@ void __rcu_read_unlock(void) } EXPORT_SYMBOL_GPL(__rcu_read_unlock); -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR - #ifdef CONFIG_RCU_CPU_STALL_VERBOSE /* @@ -367,18 +407,16 @@ EXPORT_SYMBOL_GPL(__rcu_read_unlock); static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) { unsigned long flags; - struct list_head *lp; - int phase; struct task_struct *t; - if (rcu_preempted_readers(rnp)) { - raw_spin_lock_irqsave(&rnp->lock, flags); - phase = rnp->gpnum & 0x1; - lp = &rnp->blocked_tasks[phase]; - list_for_each_entry(t, lp, rcu_node_entry) - sched_show_task(t); - raw_spin_unlock_irqrestore(&rnp->lock, flags); - } + if (!rcu_preempt_blocked_readers_cgp(rnp)) + return; + raw_spin_lock_irqsave(&rnp->lock, flags); + t = list_entry(rnp->gp_tasks, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) + sched_show_task(t); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -408,16 +446,14 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) */ static void rcu_print_task_stall(struct rcu_node *rnp) { - struct list_head *lp; - int phase; struct task_struct *t; - if (rcu_preempted_readers(rnp)) { - phase = rnp->gpnum & 0x1; - lp = &rnp->blocked_tasks[phase]; 
- list_for_each_entry(t, lp, rcu_node_entry) - printk(" P%d", t->pid); - } + if (!rcu_preempt_blocked_readers_cgp(rnp)) + return; + t = list_entry(rnp->gp_tasks, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) + printk(" P%d", t->pid); } /* @@ -430,18 +466,21 @@ static void rcu_preempt_stall_reset(void) rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; } -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - /* * Check that the list of blocked tasks for the newly completed grace * period is in fact empty. It is a serious bug to complete a grace * period that still has RCU readers blocked! This function must be * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock * must be held by the caller. + * + * Also, if there are blocked tasks on the list, they automatically + * block the newly created grace period, so set up ->gp_tasks accordingly. */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { - WARN_ON_ONCE(rcu_preempted_readers(rnp)); + WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); + if (!list_empty(&rnp->blkd_tasks)) + rnp->gp_tasks = rnp->blkd_tasks.next; WARN_ON_ONCE(rnp->qsmask); } @@ -465,50 +504,68 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - int i; struct list_head *lp; struct list_head *lp_root; int retval = 0; struct rcu_node *rnp_root = rcu_get_root(rsp); - struct task_struct *tp; + struct task_struct *t; if (rnp == rnp_root) { WARN_ONCE(1, "Last CPU thought to be offlined?"); return 0; /* Shouldn't happen: at least one CPU online. */ } - WARN_ON_ONCE(rnp != rdp->mynode && - (!list_empty(&rnp->blocked_tasks[0]) || - !list_empty(&rnp->blocked_tasks[1]) || - !list_empty(&rnp->blocked_tasks[2]) || - !list_empty(&rnp->blocked_tasks[3]))); + + /* If we are on an internal node, complain bitterly. */ + WARN_ON_ONCE(rnp != rdp->mynode); /* - * Move tasks up to root rcu_node. 
Rely on the fact that the - * root rcu_node can be at most one ahead of the rest of the - * rcu_nodes in terms of gp_num value. This fact allows us to - * move the blocked_tasks[] array directly, element by element. + * Move tasks up to root rcu_node. Don't try to get fancy for + * this corner-case operation -- just put this node's tasks + * at the head of the root node's list, and update the root node's + * ->gp_tasks and ->exp_tasks pointers to those of this node's, + * if non-NULL. This might result in waiting for more tasks than + * absolutely necessary, but this is a good performance/complexity + * tradeoff. */ - if (rcu_preempted_readers(rnp)) + if (rcu_preempt_blocked_readers_cgp(rnp)) retval |= RCU_OFL_TASKS_NORM_GP; if (rcu_preempted_readers_exp(rnp)) retval |= RCU_OFL_TASKS_EXP_GP; - for (i = 0; i < 4; i++) { - lp = &rnp->blocked_tasks[i]; - lp_root = &rnp_root->blocked_tasks[i]; - while (!list_empty(lp)) { - tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); - raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ - list_del(&tp->rcu_node_entry); - tp->rcu_blocked_node = rnp_root; - list_add(&tp->rcu_node_entry, lp_root); - raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */ - } + lp = &rnp->blkd_tasks; + lp_root = &rnp_root->blkd_tasks; + while (!list_empty(lp)) { + t = list_entry(lp->next, typeof(*t), rcu_node_entry); + raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ + list_del(&t->rcu_node_entry); + t->rcu_blocked_node = rnp_root; + list_add(&t->rcu_node_entry, lp_root); + if (&t->rcu_node_entry == rnp->gp_tasks) + rnp_root->gp_tasks = rnp->gp_tasks; + if (&t->rcu_node_entry == rnp->exp_tasks) + rnp_root->exp_tasks = rnp->exp_tasks; +#ifdef CONFIG_RCU_BOOST + if (&t->rcu_node_entry == rnp->boost_tasks) + rnp_root->boost_tasks = rnp->boost_tasks; +#endif /* #ifdef CONFIG_RCU_BOOST */ + raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ } + +#ifdef CONFIG_RCU_BOOST + /* In case root is being boosted and leaf is 
not. */ + raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ + if (rnp_root->boost_tasks != NULL && + rnp_root->boost_tasks != rnp_root->gp_tasks) + rnp_root->boost_tasks = rnp_root->gp_tasks; + raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ +#endif /* #ifdef CONFIG_RCU_BOOST */ + + rnp->gp_tasks = NULL; + rnp->exp_tasks = NULL; return retval; } /* - * Do CPU-offline processing for preemptable RCU. + * Do CPU-offline processing for preemptible RCU. */ static void rcu_preempt_offline_cpu(int cpu) { @@ -537,7 +594,7 @@ static void rcu_preempt_check_callbacks(int cpu) } /* - * Process callbacks for preemptable RCU. + * Process callbacks for preemptible RCU. */ static void rcu_preempt_process_callbacks(void) { @@ -546,7 +603,7 @@ static void rcu_preempt_process_callbacks(void) } /* - * Queue a preemptable-RCU callback for invocation after a grace period. + * Queue a preemptible-RCU callback for invocation after a grace period. */ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { @@ -594,8 +651,7 @@ static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); */ static int rcu_preempted_readers_exp(struct rcu_node *rnp) { - return !list_empty(&rnp->blocked_tasks[2]) || - !list_empty(&rnp->blocked_tasks[3]); + return rnp->exp_tasks != NULL; } /* @@ -655,13 +711,17 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) static void sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) { - int must_wait; + unsigned long flags; + int must_wait = 0; - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); - list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); - must_wait = rcu_preempted_readers_exp(rnp); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + raw_spin_lock_irqsave(&rnp->lock, flags); + if (list_empty(&rnp->blkd_tasks)) + raw_spin_unlock_irqrestore(&rnp->lock, flags); + else { + rnp->exp_tasks = 
rnp->blkd_tasks.next; + rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ + must_wait = 1; + } if (!must_wait) rcu_report_exp_rnp(rsp, rnp); } @@ -669,9 +729,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) /* * Wait for an rcu-preempt grace period, but expedite it. The basic idea * is to invoke synchronize_sched_expedited() to push all the tasks to - * the ->blocked_tasks[] lists, move all entries from the first set of - * ->blocked_tasks[] lists to the second set, and finally wait for this - * second set to drain. + * the ->blkd_tasks lists and wait for this list to drain. */ void synchronize_rcu_expedited(void) { @@ -703,7 +761,7 @@ void synchronize_rcu_expedited(void) if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) goto unlock_mb_ret; /* Others did our work for us. */ - /* force all RCU readers onto blocked_tasks[]. */ + /* force all RCU readers onto ->blkd_tasks lists. */ synchronize_sched_expedited(); raw_spin_lock_irqsave(&rsp->onofflock, flags); @@ -715,7 +773,7 @@ void synchronize_rcu_expedited(void) raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } - /* Snapshot current state of ->blocked_tasks[] lists. */ + /* Snapshot current state of ->blkd_tasks lists. */ rcu_for_each_leaf_node(rsp, rnp) sync_rcu_preempt_exp_init(rsp, rnp); if (NUM_RCU_NODES > 1) @@ -723,7 +781,7 @@ void synchronize_rcu_expedited(void) raw_spin_unlock_irqrestore(&rsp->onofflock, flags); - /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ + /* Wait for snapshotted ->blkd_tasks lists to drain. */ rnp = rcu_get_root(rsp); wait_event(sync_rcu_preempt_exp_wq, sync_rcu_preempt_exp_done(rnp)); @@ -739,7 +797,7 @@ mb_ret: EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); /* - * Check to see if there is any immediate preemptable-RCU-related work + * Check to see if there is any immediate preemptible-RCU-related work * to be done. 
*/ static int rcu_preempt_pending(int cpu) @@ -749,7 +807,7 @@ static int rcu_preempt_pending(int cpu) } /* - * Does preemptable RCU need the CPU to stay out of dynticks mode? + * Does preemptible RCU need the CPU to stay out of dynticks mode? */ static int rcu_preempt_needs_cpu(int cpu) { @@ -766,7 +824,7 @@ void rcu_barrier(void) EXPORT_SYMBOL_GPL(rcu_barrier); /* - * Initialize preemptable RCU's per-CPU data. + * Initialize preemptible RCU's per-CPU data. */ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) { @@ -774,7 +832,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) } /* - * Move preemptable RCU's callbacks from dying CPU to other online CPU. + * Move preemptible RCU's callbacks from dying CPU to other online CPU. */ static void rcu_preempt_send_cbs_to_online(void) { @@ -782,7 +840,7 @@ static void rcu_preempt_send_cbs_to_online(void) } /* - * Initialize preemptable RCU's state structures. + * Initialize preemptible RCU's state structures. */ static void __init __rcu_init_preempt(void) { @@ -790,7 +848,7 @@ static void __init __rcu_init_preempt(void) } /* - * Check for a task exiting while in a preemptable-RCU read-side + * Check for a task exiting while in a preemptible-RCU read-side * critical section, clean up if so. No need to issue warnings, * as debug_check_no_locks_held() already does this if lockdep * is enabled. @@ -802,11 +860,13 @@ void exit_rcu(void) if (t->rcu_read_lock_nesting == 0) return; t->rcu_read_lock_nesting = 1; - rcu_read_unlock(); + __rcu_read_unlock(); } #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ +static struct rcu_state *rcu_state = &rcu_sched_state; + /* * Tell them what RCU they are running. */ @@ -836,7 +896,7 @@ void rcu_force_quiescent_state(void) EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); /* - * Because preemptable RCU does not exist, we never have to check for + * Because preemptible RCU does not exist, we never have to check for * CPUs being in quiescent states. 
*/ static void rcu_preempt_note_context_switch(int cpu) @@ -844,10 +904,10 @@ static void rcu_preempt_note_context_switch(int cpu) } /* - * Because preemptable RCU does not exist, there are never any preempted + * Because preemptible RCU does not exist, there are never any preempted * RCU readers. */ -static int rcu_preempted_readers(struct rcu_node *rnp) +static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) { return 0; } @@ -862,10 +922,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR - /* - * Because preemptable RCU does not exist, we never have to check for + * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections. */ static void rcu_print_detail_task_stall(struct rcu_state *rsp) @@ -873,7 +931,7 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) } /* - * Because preemptable RCU does not exist, we never have to check for + * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections. */ static void rcu_print_task_stall(struct rcu_node *rnp) @@ -888,10 +946,8 @@ static void rcu_preempt_stall_reset(void) { } -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - /* - * Because there is no preemptable RCU, there can be no readers blocked, + * Because there is no preemptible RCU, there can be no readers blocked, * so there is no need to check for blocked tasks. So check only for * bogus qsmask values. 
*/ @@ -903,7 +959,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) #ifdef CONFIG_HOTPLUG_CPU /* - * Because preemptable RCU does not exist, it never needs to migrate + * Because preemptible RCU does not exist, it never needs to migrate * tasks that were blocked within RCU read-side critical sections, and * such non-existent tasks cannot possibly have been blocking the current * grace period. @@ -916,7 +972,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, } /* - * Because preemptable RCU does not exist, it never needs CPU-offline + * Because preemptible RCU does not exist, it never needs CPU-offline * processing. */ static void rcu_preempt_offline_cpu(int cpu) @@ -926,7 +982,7 @@ static void rcu_preempt_offline_cpu(int cpu) #endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* - * Because preemptable RCU does not exist, it never has any callbacks + * Because preemptible RCU does not exist, it never has any callbacks * to check. */ static void rcu_preempt_check_callbacks(int cpu) @@ -934,7 +990,7 @@ static void rcu_preempt_check_callbacks(int cpu) } /* - * Because preemptable RCU does not exist, it never has any callbacks + * Because preemptible RCU does not exist, it never has any callbacks * to process. */ static void rcu_preempt_process_callbacks(void) @@ -943,7 +999,7 @@ static void rcu_preempt_process_callbacks(void) /* * Wait for an rcu-preempt grace period, but make it happen quickly. - * But because preemptable RCU does not exist, map to rcu-sched. + * But because preemptible RCU does not exist, map to rcu-sched. */ void synchronize_rcu_expedited(void) { @@ -954,7 +1010,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); #ifdef CONFIG_HOTPLUG_CPU /* - * Because preemptable RCU does not exist, there is never any need to + * Because preemptible RCU does not exist, there is never any need to * report on tasks preempted in RCU read-side critical sections during * expedited RCU grace periods. 
*/ @@ -966,7 +1022,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) #endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* - * Because preemptable RCU does not exist, it never has any work to do. + * Because preemptible RCU does not exist, it never has any work to do. */ static int rcu_preempt_pending(int cpu) { @@ -974,7 +1030,7 @@ static int rcu_preempt_pending(int cpu) } /* - * Because preemptable RCU does not exist, it never needs any CPU. + * Because preemptible RCU does not exist, it never needs any CPU. */ static int rcu_preempt_needs_cpu(int cpu) { @@ -982,7 +1038,7 @@ static int rcu_preempt_needs_cpu(int cpu) } /* - * Because preemptable RCU does not exist, rcu_barrier() is just + * Because preemptible RCU does not exist, rcu_barrier() is just * another name for rcu_barrier_sched(). */ void rcu_barrier(void) @@ -992,7 +1048,7 @@ void rcu_barrier(void) EXPORT_SYMBOL_GPL(rcu_barrier); /* - * Because preemptable RCU does not exist, there is no per-CPU + * Because preemptible RCU does not exist, there is no per-CPU * data to initialize. */ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) @@ -1000,14 +1056,14 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) } /* - * Because there is no preemptable RCU, there are no callbacks to move. + * Because there is no preemptible RCU, there are no callbacks to move. */ static void rcu_preempt_send_cbs_to_online(void) { } /* - * Because preemptable RCU does not exist, it need not be initialized. + * Because preemptible RCU does not exist, it need not be initialized. 
*/ static void __init __rcu_init_preempt(void) { @@ -1015,6 +1071,276 @@ static void __init __rcu_init_preempt(void) #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ +#ifdef CONFIG_RCU_BOOST + +#include "rtmutex_common.h" + +#ifdef CONFIG_RCU_TRACE + +static void rcu_initiate_boost_trace(struct rcu_node *rnp) +{ + if (list_empty(&rnp->blkd_tasks)) + rnp->n_balk_blkd_tasks++; + else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) + rnp->n_balk_exp_gp_tasks++; + else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL) + rnp->n_balk_boost_tasks++; + else if (rnp->gp_tasks != NULL && rnp->qsmask != 0) + rnp->n_balk_notblocked++; + else if (rnp->gp_tasks != NULL && + ULONG_CMP_LT(jiffies, rnp->boost_time)) + rnp->n_balk_notyet++; + else + rnp->n_balk_nos++; +} + +#else /* #ifdef CONFIG_RCU_TRACE */ + +static void rcu_initiate_boost_trace(struct rcu_node *rnp) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_TRACE */ + +/* + * Carry out RCU priority boosting on the task indicated by ->exp_tasks + * or ->boost_tasks, advancing the pointer to the next task in the + * ->blkd_tasks list. + * + * Note that irqs must be enabled: boosting the task can block. + * Returns 1 if there are more tasks needing to be boosted. + */ +static int rcu_boost(struct rcu_node *rnp) +{ + unsigned long flags; + struct rt_mutex mtx; + struct task_struct *t; + struct list_head *tb; + + if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) + return 0; /* Nothing left to boost. */ + + raw_spin_lock_irqsave(&rnp->lock, flags); + + /* + * Recheck under the lock: all tasks in need of boosting + * might exit their RCU read-side critical sections on their own. + */ + if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return 0; + } + + /* + * Preferentially boost tasks blocking expedited grace periods. 
+ * This cannot starve the normal grace periods because a second + * expedited grace period must boost all blocked tasks, including + * those blocking the pre-existing normal grace period. + */ + if (rnp->exp_tasks != NULL) { + tb = rnp->exp_tasks; + rnp->n_exp_boosts++; + } else { + tb = rnp->boost_tasks; + rnp->n_normal_boosts++; + } + rnp->n_tasks_boosted++; + + /* + * We boost task t by manufacturing an rt_mutex that appears to + * be held by task t. We leave a pointer to that rt_mutex where + * task t can find it, and task t will release the mutex when it + * exits its outermost RCU read-side critical section. Then + * simply acquiring this artificial rt_mutex will boost task + * t's priority. (Thanks to tglx for suggesting this approach!) + * + * Note that task t must acquire rnp->lock to remove itself from + * the ->blkd_tasks list, which it will do from exit() if from + * nowhere else. We therefore are guaranteed that task t will + * stay around at least until we drop rnp->lock. Note that + * rnp->lock also resolves races between our priority boosting + * and task t's exiting its outermost RCU read-side critical + * section. + */ + t = container_of(tb, struct task_struct, rcu_node_entry); + rt_mutex_init_proxy_locked(&mtx, t); + t->rcu_boost_mutex = &mtx; + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ + rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ + + return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL; +} + +/* + * Timer handler to initiate waking up of boost kthreads that + * have yielded the CPU due to excessive numbers of tasks to + * boost. We wake up the per-rcu_node kthread, which in turn + * will wake up the booster kthread. + */ +static void rcu_boost_kthread_timer(unsigned long arg) +{ + invoke_rcu_node_kthread((struct rcu_node *)arg); +} + +/* + * Priority-boosting kthread. 
One per leaf rcu_node and one for the + * root rcu_node. + */ +static int rcu_boost_kthread(void *arg) +{ + struct rcu_node *rnp = (struct rcu_node *)arg; + int spincnt = 0; + int more2boost; + + for (;;) { + rnp->boost_kthread_status = RCU_KTHREAD_WAITING; + wait_event_interruptible(rnp->boost_wq, rnp->boost_tasks || + rnp->exp_tasks); + rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; + more2boost = rcu_boost(rnp); + if (more2boost) + spincnt++; + else + spincnt = 0; + if (spincnt > 10) { + rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); + spincnt = 0; + } + } + /* NOTREACHED */ + return 0; +} + +/* + * Check to see if it is time to start boosting RCU readers that are + * blocking the current grace period, and, if so, tell the per-rcu_node + * kthread to start boosting them. If there is an expedited grace + * period in progress, it is always time to boost. + * + * The caller must hold rnp->lock, which this function releases, + * but irqs remain disabled. The ->boost_kthread_task is immortal, + * so we don't need to worry about it going away. + */ +static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) +{ + struct task_struct *t; + + if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { + rnp->n_balk_exp_gp_tasks++; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + if (rnp->exp_tasks != NULL || + (rnp->gp_tasks != NULL && + rnp->boost_tasks == NULL && + rnp->qsmask == 0 && + ULONG_CMP_GE(jiffies, rnp->boost_time))) { + if (rnp->exp_tasks == NULL) + rnp->boost_tasks = rnp->gp_tasks; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + t = rnp->boost_kthread_task; + if (t != NULL) + wake_up_process(t); + } else { + rcu_initiate_boost_trace(rnp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } +} + +/* + * Set the affinity of the boost kthread. The CPU-hotplug locks are + * held, so no one should be messing with the existence of the boost + * kthread. 
+ */ +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, + cpumask_var_t cm) +{ + struct task_struct *t; + + t = rnp->boost_kthread_task; + if (t != NULL) + set_cpus_allowed_ptr(rnp->boost_kthread_task, cm); +} + +#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) + +/* + * Do priority-boost accounting for the start of a new grace period. + */ +static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) +{ + rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; +} + +/* + * Initialize the RCU-boost waitqueue. + */ +static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp) +{ + init_waitqueue_head(&rnp->boost_wq); +} + +/* + * Create an RCU-boost kthread for the specified node if one does not + * already exist. We only create this kthread for preemptible RCU. + * Returns zero if all is well, a negated errno otherwise. + */ +static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, + struct rcu_node *rnp, + int rnp_index) +{ + unsigned long flags; + struct sched_param sp; + struct task_struct *t; + + if (&rcu_preempt_state != rsp) + return 0; + if (rnp->boost_kthread_task != NULL) + return 0; + t = kthread_create(rcu_boost_kthread, (void *)rnp, + "rcub%d", rnp_index); + if (IS_ERR(t)) + return PTR_ERR(t); + raw_spin_lock_irqsave(&rnp->lock, flags); + rnp->boost_kthread_task = t; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + wake_up_process(t); + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + return 0; +} + +#else /* #ifdef CONFIG_RCU_BOOST */ + +static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) +{ + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, + cpumask_var_t cm) +{ +} + +static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) +{ +} + +static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp) +{ +} + +static int __cpuinit 
rcu_spawn_one_boost_kthread(struct rcu_state *rsp, + struct rcu_node *rnp, + int rnp_index) +{ + return 0; +} + +#endif /* #else #ifdef CONFIG_RCU_BOOST */ + #ifndef CONFIG_SMP void synchronize_sched_expedited(void) @@ -1187,8 +1513,8 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); * * Because it is not legal to invoke rcu_process_callbacks() with irqs * disabled, we do one pass of force_quiescent_state(), then do a - * raise_softirq() to cause rcu_process_callbacks() to be invoked later. - * The per-cpu rcu_dyntick_drain variable controls the sequencing. + * invoke_rcu_cpu_kthread() to cause rcu_process_callbacks() to be invoked + * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. */ int rcu_needs_cpu(int cpu) { @@ -1239,7 +1565,7 @@ int rcu_needs_cpu(int cpu) /* If RCU callbacks are still pending, RCU still needs this CPU. */ if (c) - raise_softirq(RCU_SOFTIRQ); + invoke_rcu_cpu_kthread(); return c; } diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index c8e97853b970..aa0fd72b4bc7 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -46,6 +46,18 @@ #define RCU_TREE_NONCORE #include "rcutree.h" +DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); +DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu); +DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); +DECLARE_PER_CPU(char, rcu_cpu_has_work); + +static char convert_kthread_status(unsigned int kthread_status) +{ + if (kthread_status > RCU_KTHREAD_MAX) + return '?'; + return "SRWOY"[kthread_status]; +} + static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) @@ -64,7 +76,21 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit); + seq_printf(m, " ql=%ld qs=%c%c%c%c kt=%d/%c/%d ktl=%x b=%ld", + rdp->qlen, + 
".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != + rdp->nxttail[RCU_NEXT_TAIL]], + ".R"[rdp->nxttail[RCU_WAIT_TAIL] != + rdp->nxttail[RCU_NEXT_READY_TAIL]], + ".W"[rdp->nxttail[RCU_DONE_TAIL] != + rdp->nxttail[RCU_WAIT_TAIL]], + ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]], + per_cpu(rcu_cpu_has_work, rdp->cpu), + convert_kthread_status(per_cpu(rcu_cpu_kthread_status, + rdp->cpu)), + per_cpu(rcu_cpu_kthread_cpu, rdp->cpu), + per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff, + rdp->blimit); seq_printf(m, " ci=%lu co=%lu ca=%lu\n", rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } @@ -121,7 +147,18 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit); + seq_printf(m, ",%ld,\"%c%c%c%c\",%d,\"%c\",%ld", rdp->qlen, + ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != + rdp->nxttail[RCU_NEXT_TAIL]], + ".R"[rdp->nxttail[RCU_WAIT_TAIL] != + rdp->nxttail[RCU_NEXT_READY_TAIL]], + ".W"[rdp->nxttail[RCU_DONE_TAIL] != + rdp->nxttail[RCU_WAIT_TAIL]], + ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]], + per_cpu(rcu_cpu_has_work, rdp->cpu), + convert_kthread_status(per_cpu(rcu_cpu_kthread_status, + rdp->cpu)), + rdp->blimit); seq_printf(m, ",%lu,%lu,%lu\n", rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } @@ -157,11 +194,76 @@ static const struct file_operations rcudata_csv_fops = { .release = single_release, }; +#ifdef CONFIG_RCU_BOOST + +static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) +{ + seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu " + "j=%04x bt=%04x\n", + rnp->grplo, rnp->grphi, + "T."[list_empty(&rnp->blkd_tasks)], + "N."[!rnp->gp_tasks], + "E."[!rnp->exp_tasks], + "B."[!rnp->boost_tasks], + convert_kthread_status(rnp->boost_kthread_status), + rnp->n_tasks_boosted, rnp->n_exp_boosts, + rnp->n_normal_boosts, + 
(int)(jiffies & 0xffff), + (int)(rnp->boost_time & 0xffff)); + seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", + " balk", + rnp->n_balk_blkd_tasks, + rnp->n_balk_exp_gp_tasks, + rnp->n_balk_boost_tasks, + rnp->n_balk_notblocked, + rnp->n_balk_notyet, + rnp->n_balk_nos); +} + +static int show_rcu_node_boost(struct seq_file *m, void *unused) +{ + struct rcu_node *rnp; + + rcu_for_each_leaf_node(&rcu_preempt_state, rnp) + print_one_rcu_node_boost(m, rnp); + return 0; +} + +static int rcu_node_boost_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcu_node_boost, NULL); +} + +static const struct file_operations rcu_node_boost_fops = { + .owner = THIS_MODULE, + .open = rcu_node_boost_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* + * Create the rcuboost debugfs entry. Standard error return. + */ +static int rcu_boost_trace_create_file(struct dentry *rcudir) +{ + return !debugfs_create_file("rcuboost", 0444, rcudir, NULL, + &rcu_node_boost_fops); +} + +#else /* #ifdef CONFIG_RCU_BOOST */ + +static int rcu_boost_trace_create_file(struct dentry *rcudir) +{ + return 0; /* There cannot be an error if we didn't create it! 
*/ +} + +#endif /* #else #ifdef CONFIG_RCU_BOOST */ + static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) { unsigned long gpnum; int level = 0; - int phase; struct rcu_node *rnp; gpnum = rsp->gpnum; @@ -178,13 +280,11 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) seq_puts(m, "\n"); level = rnp->level; } - phase = gpnum & 0x1; - seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d ", + seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ", rnp->qsmask, rnp->qsmaskinit, - "T."[list_empty(&rnp->blocked_tasks[phase])], - "E."[list_empty(&rnp->blocked_tasks[phase + 2])], - "T."[list_empty(&rnp->blocked_tasks[!phase])], - "E."[list_empty(&rnp->blocked_tasks[!phase + 2])], + ".G"[rnp->gp_tasks != NULL], + ".E"[rnp->exp_tasks != NULL], + ".T"[!list_empty(&rnp->blkd_tasks)], rnp->grplo, rnp->grphi, rnp->grpnum); } seq_puts(m, "\n"); @@ -216,16 +316,35 @@ static const struct file_operations rcuhier_fops = { .release = single_release, }; +static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) +{ + unsigned long flags; + unsigned long completed; + unsigned long gpnum; + unsigned long gpage; + unsigned long gpmax; + struct rcu_node *rnp = &rsp->node[0]; + + raw_spin_lock_irqsave(&rnp->lock, flags); + completed = rsp->completed; + gpnum = rsp->gpnum; + if (rsp->completed == rsp->gpnum) + gpage = 0; + else + gpage = jiffies - rsp->gp_start; + gpmax = rsp->gp_max; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n", + rsp->name, completed, gpnum, gpage, gpmax); +} + static int show_rcugp(struct seq_file *m, void *unused) { #ifdef CONFIG_TREE_PREEMPT_RCU - seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n", - rcu_preempt_state.completed, rcu_preempt_state.gpnum); + show_one_rcugp(m, &rcu_preempt_state); #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n", - rcu_sched_state.completed, rcu_sched_state.gpnum); - seq_printf(m, 
"rcu_bh: completed=%ld gpnum=%lu\n", - rcu_bh_state.completed, rcu_bh_state.gpnum); + show_one_rcugp(m, &rcu_sched_state); + show_one_rcugp(m, &rcu_bh_state); return 0; } @@ -298,6 +417,29 @@ static const struct file_operations rcu_pending_fops = { .release = single_release, }; +static int show_rcutorture(struct seq_file *m, void *unused) +{ + seq_printf(m, "rcutorture test sequence: %lu %s\n", + rcutorture_testseq >> 1, + (rcutorture_testseq & 0x1) ? "(test in progress)" : ""); + seq_printf(m, "rcutorture update version number: %lu\n", + rcutorture_vernum); + return 0; +} + +static int rcutorture_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcutorture, NULL); +} + +static const struct file_operations rcutorture_fops = { + .owner = THIS_MODULE, + .open = rcutorture_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static struct dentry *rcudir; static int __init rcutree_trace_init(void) @@ -318,6 +460,9 @@ static int __init rcutree_trace_init(void) if (!retval) goto free_out; + if (rcu_boost_trace_create_file(rcudir)) + goto free_out; + retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); if (!retval) goto free_out; @@ -331,6 +476,11 @@ static int __init rcutree_trace_init(void) NULL, &rcu_pending_fops); if (!retval) goto free_out; + + retval = debugfs_create_file("rcutorture", 0444, rcudir, + NULL, &rcutorture_fops); + if (!retval) + goto free_out; return 0; free_out: debugfs_remove_recursive(rcudir); diff --git a/kernel/softirq.c b/kernel/softirq.c index 174f976c2874..13960170cad4 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -58,7 +58,7 @@ DEFINE_PER_CPU(struct task_struct *, ksoftirqd); char *softirq_to_name[NR_SOFTIRQS] = { "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", - "TASKLET", "SCHED", "HRTIMER", "RCU" + "TASKLET", "SCHED", "HRTIMER" }; /* |