diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/audit_tree.c | 19 | ||||
-rw-r--r-- | kernel/fork.c | 4 | ||||
-rw-r--r-- | kernel/rcupdate.c | 4 | ||||
-rw-r--r-- | kernel/rcutiny.c | 33 | ||||
-rw-r--r-- | kernel/rcutiny_plugin.h | 10 | ||||
-rw-r--r-- | kernel/rcutorture.c | 159 | ||||
-rw-r--r-- | kernel/rcutree.c | 694 | ||||
-rw-r--r-- | kernel/rcutree.h | 31 | ||||
-rw-r--r-- | kernel/rcutree_plugin.h | 174 | ||||
-rw-r--r-- | kernel/rcutree_trace.c | 19 | ||||
-rw-r--r-- | kernel/sched/core.c | 76 | ||||
-rw-r--r-- | kernel/sched/fair.c | 11 | ||||
-rw-r--r-- | kernel/sched/rt.c | 13 | ||||
-rw-r--r-- | kernel/sched/sched.h | 8 | ||||
-rw-r--r-- | kernel/sched/stop_task.c | 22 | ||||
-rw-r--r-- | kernel/task_work.c | 1 | ||||
-rw-r--r-- | kernel/time/tick-sched.c | 3 | ||||
-rw-r--r-- | kernel/time/timekeeping.c | 39 | ||||
-rw-r--r-- | kernel/timer.c | 9 | ||||
-rw-r--r-- | kernel/trace/trace_syscalls.c | 4 |
20 files changed, 782 insertions, 551 deletions
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 3a5ca582ba1e..ed206fd88cca 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -250,7 +250,6 @@ static void untag_chunk(struct node *p) spin_unlock(&hash_lock); spin_unlock(&entry->lock); fsnotify_destroy_mark(entry); - fsnotify_put_mark(entry); goto out; } @@ -259,7 +258,7 @@ static void untag_chunk(struct node *p) fsnotify_duplicate_mark(&new->mark, entry); if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { - free_chunk(new); + fsnotify_put_mark(&new->mark); goto Fallback; } @@ -293,7 +292,7 @@ static void untag_chunk(struct node *p) spin_unlock(&hash_lock); spin_unlock(&entry->lock); fsnotify_destroy_mark(entry); - fsnotify_put_mark(entry); + fsnotify_put_mark(&new->mark); /* drop initial reference */ goto out; Fallback: @@ -322,7 +321,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) entry = &chunk->mark; if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) { - free_chunk(chunk); + fsnotify_put_mark(entry); return -ENOSPC; } @@ -347,6 +346,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) insert_hash(chunk); spin_unlock(&hash_lock); spin_unlock(&entry->lock); + fsnotify_put_mark(entry); /* drop initial reference */ return 0; } @@ -396,7 +396,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) fsnotify_duplicate_mark(chunk_entry, old_entry); if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { spin_unlock(&old_entry->lock); - free_chunk(chunk); + fsnotify_put_mark(chunk_entry); fsnotify_put_mark(old_entry); return -ENOSPC; } @@ -444,8 +444,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) spin_unlock(&chunk_entry->lock); spin_unlock(&old_entry->lock); fsnotify_destroy_mark(old_entry); + fsnotify_put_mark(chunk_entry); /* drop initial reference */ fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ - fsnotify_put_mark(old_entry); /* and kill it */ return 0; } @@ -916,7 +916,12 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); evict_chunk(chunk); - fsnotify_put_mark(entry); + + /* + * We are guaranteed to have at least one reference to the mark from + * either the inode or the caller of fsnotify_destroy_mark(). + */ + BUG_ON(atomic_read(&entry->refcnt) < 1); } static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, diff --git a/kernel/fork.c b/kernel/fork.c index 3bd2280d79f6..2c8857e12855 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -455,8 +455,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (retval) goto out; - if (file && uprobe_mmap(tmp)) - goto out; + if (file) + uprobe_mmap(tmp); } /* a new mm has just been created */ arch_dup_mmap(oldmm, mm); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 4e6a61b15e86..29ca1c6da594 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -45,6 +45,7 @@ #include <linux/mutex.h> #include <linux/export.h> #include <linux/hardirq.h> +#include <linux/delay.h> #define CREATE_TRACE_POINTS #include <trace/events/rcu.h> @@ -81,6 +82,9 @@ void __rcu_read_unlock(void) } else { barrier(); /* critical section before exit code. */ t->rcu_read_lock_nesting = INT_MIN; +#ifdef CONFIG_PROVE_RCU_DELAY + udelay(10); /* Make preemption more probable. */ +#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ barrier(); /* assign before ->rcu_read_unlock_special load */ if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) rcu_read_unlock_special(t); diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 547b1fe5b052..e4c6a598d6f7 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -56,25 +56,28 @@ static void __call_rcu(struct rcu_head *head, static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ -static void rcu_idle_enter_common(long long oldval) +static void rcu_idle_enter_common(long long newval) { - if (rcu_dynticks_nesting) { + if (newval) { RCU_TRACE(trace_rcu_dyntick("--=", - oldval, rcu_dynticks_nesting)); + rcu_dynticks_nesting, newval)); + rcu_dynticks_nesting = newval; return; } - RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting)); + RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting, newval)); if (!is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", - oldval, rcu_dynticks_nesting)); + rcu_dynticks_nesting, newval)); ftrace_dump(DUMP_ALL); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ } rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ + barrier(); + rcu_dynticks_nesting = newval; } /* @@ -84,17 +87,16 @@ static void rcu_idle_enter_common(long long oldval) void rcu_idle_enter(void) { unsigned long flags; - long long oldval; + long long newval; local_irq_save(flags); - oldval = rcu_dynticks_nesting; WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) - rcu_dynticks_nesting = 0; + newval = 0; else - rcu_dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; - rcu_idle_enter_common(oldval); + newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE; + rcu_idle_enter_common(newval); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -105,15 +107,15 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter); void rcu_irq_exit(void) { unsigned long flags; - long long oldval; + long long newval; local_irq_save(flags); - oldval = rcu_dynticks_nesting; - rcu_dynticks_nesting--; - WARN_ON_ONCE(rcu_dynticks_nesting < 0); - rcu_idle_enter_common(oldval); + newval = rcu_dynticks_nesting - 1; + WARN_ON_ONCE(newval < 0); + rcu_idle_enter_common(newval); local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(rcu_irq_exit); /* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */ static void rcu_idle_exit_common(long long oldval) @@ -171,6 +173,7 @@ void rcu_irq_enter(void) rcu_idle_exit_common(oldval); local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(rcu_irq_enter); #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 918fd1e8509c..3d0190282204 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -278,7 +278,7 @@ static int rcu_boost(void) rcu_preempt_ctrlblk.exp_tasks == NULL) return 0; /* Nothing to boost. */ - raw_local_irq_save(flags); + local_irq_save(flags); /* * Recheck with irqs disabled: all tasks in need of boosting @@ -287,7 +287,7 @@ static int rcu_boost(void) */ if (rcu_preempt_ctrlblk.boost_tasks == NULL && rcu_preempt_ctrlblk.exp_tasks == NULL) { - raw_local_irq_restore(flags); + local_irq_restore(flags); return 0; } @@ -317,7 +317,7 @@ static int rcu_boost(void) t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&mtx, t); t->rcu_boost_mutex = &mtx; - raw_local_irq_restore(flags); + local_irq_restore(flags); rt_mutex_lock(&mtx); rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ @@ -991,9 +991,9 @@ static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) { unsigned long flags; - raw_local_irq_save(flags); + local_irq_save(flags); rcp->qlen -= n; - raw_local_irq_restore(flags); + local_irq_restore(flags); } /* diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 25b15033c61f..aaa7b9f3532a 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -53,10 +53,11 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@fre static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ static int nfakewriters = 4; /* # fake writer threads */ -static int stat_interval; /* Interval between stats, in seconds. */ - /* Defaults to "only at end of test". */ +static int stat_interval = 60; /* Interval between stats, in seconds. */ + /* Zero means "only at end of test". */ static bool verbose; /* Print more debug info. */ -static bool test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ +static bool test_no_idle_hz = true; + /* Test RCU support for tickless idle CPUs. */ static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ static int stutter = 5; /* Start/stop testing interval (in sec) */ static int irqreader = 1; /* RCU readers from irq (timers). */ @@ -119,11 +120,11 @@ MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); #define TORTURE_FLAG "-torture:" #define PRINTK_STRING(s) \ - do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) + do { pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) #define VERBOSE_PRINTK_STRING(s) \ - do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) + do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) #define VERBOSE_PRINTK_ERRSTRING(s) \ - do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) + do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) static char printk_buf[4096]; @@ -176,8 +177,14 @@ static long n_rcu_torture_boosts; static long n_rcu_torture_timers; static long n_offline_attempts; static long n_offline_successes; +static unsigned long sum_offline; +static int min_offline = -1; +static int max_offline; static long n_online_attempts; static long n_online_successes; +static unsigned long sum_online; +static int min_online = -1; +static int max_online; static long n_barrier_attempts; static long n_barrier_successes; static struct list_head rcu_torture_removed; @@ -235,7 +242,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1, if (fullstop == FULLSTOP_DONTSTOP) fullstop = FULLSTOP_SHUTDOWN; else - printk(KERN_WARNING /* but going down anyway, so... */ + pr_warn(/* but going down anyway, so... */ "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); mutex_unlock(&fullstop_mutex); return NOTIFY_DONE; @@ -248,7 +255,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1, static void rcutorture_shutdown_absorb(char *title) { if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { - printk(KERN_NOTICE + pr_notice( "rcutorture thread %s parking due to system shutdown\n", title); schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); @@ -1214,11 +1221,13 @@ rcu_torture_printk(char *page) n_rcu_torture_boost_failure, n_rcu_torture_boosts, n_rcu_torture_timers); - cnt += sprintf(&page[cnt], "onoff: %ld/%ld:%ld/%ld ", - n_online_successes, - n_online_attempts, - n_offline_successes, - n_offline_attempts); + cnt += sprintf(&page[cnt], + "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", + n_online_successes, n_online_attempts, + n_offline_successes, n_offline_attempts, + min_online, max_online, + min_offline, max_offline, + sum_online, sum_offline, HZ); cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld", n_barrier_successes, n_barrier_attempts, @@ -1267,7 +1276,7 @@ rcu_torture_stats_print(void) int cnt; cnt = rcu_torture_printk(printk_buf); - printk(KERN_ALERT "%s", printk_buf); + pr_alert("%s", printk_buf); } /* @@ -1380,20 +1389,20 @@ rcu_torture_stutter(void *arg) static inline void rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) { - printk(KERN_ALERT "%s" TORTURE_FLAG - "--- %s: nreaders=%d nfakewriters=%d " - "stat_interval=%d verbose=%d test_no_idle_hz=%d " - "shuffle_interval=%d stutter=%d irqreader=%d " - "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " - "test_boost=%d/%d test_boost_interval=%d " - "test_boost_duration=%d shutdown_secs=%d " - "onoff_interval=%d onoff_holdoff=%d\n", - torture_type, tag, nrealreaders, nfakewriters, - stat_interval, verbose, test_no_idle_hz, shuffle_interval, - stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, - test_boost, cur_ops->can_boost, - test_boost_interval, test_boost_duration, shutdown_secs, - onoff_interval, onoff_holdoff); + pr_alert("%s" TORTURE_FLAG + "--- %s: nreaders=%d nfakewriters=%d " + "stat_interval=%d verbose=%d test_no_idle_hz=%d " + "shuffle_interval=%d stutter=%d irqreader=%d " + "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " + "test_boost=%d/%d test_boost_interval=%d " + "test_boost_duration=%d shutdown_secs=%d " + "onoff_interval=%d onoff_holdoff=%d\n", + torture_type, tag, nrealreaders, nfakewriters, + stat_interval, verbose, test_no_idle_hz, shuffle_interval, + stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, + test_boost, cur_ops->can_boost, + test_boost_interval, test_boost_duration, shutdown_secs, + onoff_interval, onoff_holdoff); } static struct notifier_block rcutorture_shutdown_nb = { @@ -1460,9 +1469,9 @@ rcu_torture_shutdown(void *arg) !kthread_should_stop()) { delta = shutdown_time - jiffies_snap; if (verbose) - printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_shutdown task: %lu jiffies remaining\n", - torture_type, delta); + pr_alert("%s" TORTURE_FLAG + "rcu_torture_shutdown task: %lu jiffies remaining\n", + torture_type, delta); schedule_timeout_interruptible(delta); jiffies_snap = ACCESS_ONCE(jiffies); } @@ -1490,8 +1499,10 @@ static int __cpuinit rcu_torture_onoff(void *arg) { int cpu; + unsigned long delta; int maxcpu = -1; DEFINE_RCU_RANDOM(rand); + unsigned long starttime; VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); for_each_online_cpu(cpu) @@ -1506,29 +1517,51 @@ rcu_torture_onoff(void *arg) cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { if (verbose) - printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: offlining %d\n", - torture_type, cpu); + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: offlining %d\n", + torture_type, cpu); + starttime = jiffies; n_offline_attempts++; if (cpu_down(cpu) == 0) { if (verbose) - printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: offlined %d\n", - torture_type, cpu); + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: offlined %d\n", + torture_type, cpu); n_offline_successes++; + delta = jiffies - starttime; + sum_offline += delta; + if (min_offline < 0) { + min_offline = delta; + max_offline = delta; + } + if (min_offline > delta) + min_offline = delta; + if (max_offline < delta) + max_offline = delta; } } else if (cpu_is_hotpluggable(cpu)) { if (verbose) - printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: onlining %d\n", - torture_type, cpu); + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: onlining %d\n", + torture_type, cpu); + starttime = jiffies; n_online_attempts++; if (cpu_up(cpu) == 0) { if (verbose) - printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: onlined %d\n", - torture_type, cpu); + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: onlined %d\n", + torture_type, cpu); n_online_successes++; + delta = jiffies - starttime; + sum_online += delta; + if (min_online < 0) { + min_online = delta; + max_online = delta; + } + if (min_online > delta) + min_online = delta; + if (max_online < delta) + max_online = delta; } } schedule_timeout_interruptible(onoff_interval * HZ); @@ -1593,14 +1626,14 @@ static int __cpuinit rcu_torture_stall(void *args) if (!kthread_should_stop()) { stop_at = get_seconds() + stall_cpu; /* RCU CPU stall is expected behavior in following code. */ - printk(KERN_ALERT "rcu_torture_stall start.\n"); + pr_alert("rcu_torture_stall start.\n"); rcu_read_lock(); preempt_disable(); while (ULONG_CMP_LT(get_seconds(), stop_at)) continue; /* Induce RCU CPU stall warning. */ preempt_enable(); rcu_read_unlock(); - printk(KERN_ALERT "rcu_torture_stall end.\n"); + pr_alert("rcu_torture_stall end.\n"); } rcutorture_shutdown_absorb("rcu_torture_stall"); while (!kthread_should_stop()) @@ -1716,12 +1749,12 @@ static int rcu_torture_barrier_init(void) if (n_barrier_cbs == 0) return 0; if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { - printk(KERN_ALERT "%s" TORTURE_FLAG - " Call or barrier ops missing for %s,\n", - torture_type, cur_ops->name); - printk(KERN_ALERT "%s" TORTURE_FLAG - " RCU barrier testing omitted from run.\n", - torture_type); + pr_alert("%s" TORTURE_FLAG + " Call or barrier ops missing for %s,\n", + torture_type, cur_ops->name); + pr_alert("%s" TORTURE_FLAG + " RCU barrier testing omitted from run.\n", + torture_type); return 0; } atomic_set(&barrier_cbs_count, 0); @@ -1814,7 +1847,7 @@ rcu_torture_cleanup(void) mutex_lock(&fullstop_mutex); rcutorture_record_test_transition(); if (fullstop == FULLSTOP_SHUTDOWN) { - printk(KERN_WARNING /* but going down anyway, so... */ + pr_warn(/* but going down anyway, so... */ "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); mutex_unlock(&fullstop_mutex); schedule_timeout_uninterruptible(10); @@ -1938,17 +1971,17 @@ rcu_torture_init(void) break; } if (i == ARRAY_SIZE(torture_ops)) { - printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n", - torture_type); - printk(KERN_ALERT "rcu-torture types:"); + pr_alert("rcu-torture: invalid torture type: \"%s\"\n", + torture_type); + pr_alert("rcu-torture types:"); for (i = 0; i < ARRAY_SIZE(torture_ops); i++) - printk(KERN_ALERT " %s", torture_ops[i]->name); - printk(KERN_ALERT "\n"); + pr_alert(" %s", torture_ops[i]->name); + pr_alert("\n"); mutex_unlock(&fullstop_mutex); return -EINVAL; } if (cur_ops->fqs == NULL && fqs_duration != 0) { - printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); + pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); fqs_duration = 0; } if (cur_ops->init) @@ -1996,14 +2029,15 @@ rcu_torture_init(void) /* Start up the kthreads. */ VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); - writer_task = kthread_run(rcu_torture_writer, NULL, - "rcu_torture_writer"); + writer_task = kthread_create(rcu_torture_writer, NULL, + "rcu_torture_writer"); if (IS_ERR(writer_task)) { firsterr = PTR_ERR(writer_task); VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); writer_task = NULL; goto unwind; } + wake_up_process(writer_task); fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), GFP_KERNEL); if (fakewriter_tasks == NULL) { @@ -2118,14 +2152,15 @@ rcu_torture_init(void) } if (shutdown_secs > 0) { shutdown_time = jiffies + shutdown_secs * HZ; - shutdown_task = kthread_run(rcu_torture_shutdown, NULL, - "rcu_torture_shutdown"); + shutdown_task = kthread_create(rcu_torture_shutdown, NULL, + "rcu_torture_shutdown"); if (IS_ERR(shutdown_task)) { firsterr = PTR_ERR(shutdown_task); VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); shutdown_task = NULL; goto unwind; } + wake_up_process(shutdown_task); } i = rcu_torture_onoff_init(); if (i != 0) { diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 11a4fdca1df7..7387e46009d9 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -52,6 +52,7 @@ #include <linux/prefetch.h> #include <linux/delay.h> #include <linux/stop_machine.h> +#include <linux/random.h> #include "rcutree.h" #include <trace/events/rcu.h> @@ -61,6 +62,7 @@ /* Data structures. */ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; +static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; #define RCU_STATE_INITIALIZER(sname, cr) { \ .level = { &sname##_state.node[0] }, \ @@ -72,7 +74,6 @@ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; .orphan_nxttail = &sname##_state.orphan_nxtlist, \ .orphan_donetail = &sname##_state.orphan_donelist, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ - .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \ .name = #sname, \ } @@ -88,7 +89,7 @@ LIST_HEAD(rcu_struct_flavors); /* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF; -module_param(rcu_fanout_leaf, int, 0); +module_param(rcu_fanout_leaf, int, 0444); int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ NUM_RCU_LVL_0, @@ -174,8 +175,6 @@ void rcu_sched_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); - rdp->passed_quiesce_gpnum = rdp->gpnum; - barrier(); if (rdp->passed_quiesce == 0) trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); rdp->passed_quiesce = 1; @@ -185,8 +184,6 @@ void rcu_bh_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); - rdp->passed_quiesce_gpnum = rdp->gpnum; - barrier(); if (rdp->passed_quiesce == 0) trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); rdp->passed_quiesce = 1; @@ -215,9 +212,9 @@ static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ static int qhimark = 10000; /* If this many pending, ignore blimit. */ static int qlowmark = 100; /* Once only this many pending, use blimit. */ -module_param(blimit, int, 0); -module_param(qhimark, int, 0); -module_param(qlowmark, int, 0); +module_param(blimit, int, 0444); +module_param(qhimark, int, 0444); +module_param(qlowmark, int, 0444); int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; @@ -225,7 +222,14 @@ int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_suppress, int, 0644); module_param(rcu_cpu_stall_timeout, int, 0644); -static void force_quiescent_state(struct rcu_state *rsp, int relaxed); +static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; +static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; + +module_param(jiffies_till_first_fqs, ulong, 0644); +module_param(jiffies_till_next_fqs, ulong, 0644); + +static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); +static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(int cpu); /* @@ -251,7 +255,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); */ void rcu_bh_force_quiescent_state(void) { - force_quiescent_state(&rcu_bh_state, 0); + force_quiescent_state(&rcu_bh_state); } EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); @@ -285,7 +289,7 @@ EXPORT_SYMBOL_GPL(rcutorture_record_progress); */ void rcu_sched_force_quiescent_state(void) { - force_quiescent_state(&rcu_sched_state, 0); + force_quiescent_state(&rcu_sched_state); } EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); @@ -304,7 +308,9 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) static int cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { - return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp); + return *rdp->nxttail[RCU_DONE_TAIL + + ACCESS_ONCE(rsp->completed) != rdp->completed] && + !rcu_gp_in_progress(rsp); } /* @@ -316,35 +322,6 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) } /* - * If the specified CPU is offline, tell the caller that it is in - * a quiescent state. Otherwise, whack it with a reschedule IPI. - * Grace periods can end up waiting on an offline CPU when that - * CPU is in the process of coming online -- it will be added to the - * rcu_node bitmasks before it actually makes it online. The same thing - * can happen while a CPU is in the process of coming online. Because this - * race is quite rare, we check for it after detecting that the grace - * period has been delayed rather than checking each and every CPU - * each and every time we start a new grace period. - */ -static int rcu_implicit_offline_qs(struct rcu_data *rdp) -{ - /* - * If the CPU is offline for more than a jiffy, it is in a quiescent - * state. We can trust its state not to change because interrupts - * are disabled. The reason for the jiffy's worth of slack is to - * handle CPUs initializing on the way up and finding their way - * to the idle loop on the way down. - */ - if (cpu_is_offline(rdp->cpu) && - ULONG_CMP_LT(rdp->rsp->gp_start + 2, jiffies)) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); - rdp->offline_fqs++; - return 1; - } - return 0; -} - -/* * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle * * If the new value of the ->dynticks_nesting counter now is zero, @@ -672,7 +649,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) * Return true if the specified CPU has passed through a quiescent * state by virtue of being in or having passed through an dynticks * idle state since the last call to dyntick_save_progress_counter() - * for this same CPU. + * for this same CPU, or by virtue of having been offline. */ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) { @@ -696,8 +673,26 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return 1; } - /* Go check for the CPU being offline. */ - return rcu_implicit_offline_qs(rdp); + /* + * Check for the CPU being offline, but only if the grace period + * is old enough. We don't need to worry about the CPU changing + * state: If we see it offline even once, it has been through a + * quiescent state. + * + * The reason for insisting that the grace period be at least + * one jiffy old is that CPUs that are not quite online and that + * have just gone offline can still execute RCU read-side critical + * sections. + */ + if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies)) + return 0; /* Grace period is not old enough. */ + barrier(); + if (cpu_is_offline(rdp->cpu)) { + trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); + rdp->offline_fqs++; + return 1; + } + return 0; } static int jiffies_till_stall_check(void) @@ -754,14 +749,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp) rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave(&rnp->lock, flags); ndetected += rcu_print_task_stall(rnp); + if (rnp->qsmask != 0) { + for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) + if (rnp->qsmask & (1UL << cpu)) { + print_cpu_stall_info(rsp, + rnp->grplo + cpu); + ndetected++; + } + } raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (rnp->qsmask == 0) - continue; - for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) - if (rnp->qsmask & (1UL << cpu)) { - print_cpu_stall_info(rsp, rnp->grplo + cpu); - ndetected++; - } } /* @@ -781,11 +777,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp) else if (!trigger_all_cpu_backtrace()) dump_stack(); - /* If so configured, complain about tasks blocking the grace period. */ + /* Complain about tasks blocking the grace period. */ rcu_print_detail_task_stall(rsp); - force_quiescent_state(rsp, 0); /* Kick them all. */ + force_quiescent_state(rsp); /* Kick them all. */ } static void print_cpu_stall(struct rcu_state *rsp) @@ -826,7 +822,8 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) j = ACCESS_ONCE(jiffies); js = ACCESS_ONCE(rsp->jiffies_stall); rnp = rdp->mynode; - if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { + if (rcu_gp_in_progress(rsp) && + (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(rsp); @@ -888,12 +885,8 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct */ rdp->gpnum = rnp->gpnum; trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); - if (rnp->qsmask & rdp->grpmask) { - rdp->qs_pending = 1; - rdp->passed_quiesce = 0; - } else { - rdp->qs_pending = 0; - } + rdp->passed_quiesce = 0; + rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); } } @@ -973,10 +966,13 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat * our behalf. Catch up with this state to avoid noting * spurious new grace periods. If another grace period * has started, then rnp->gpnum will have advanced, so - * we will detect this later on. + * we will detect this later on. Of course, any quiescent + * states we found for the old GP are now invalid. */ - if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) + if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) { rdp->gpnum = rdp->completed; + rdp->passed_quiesce = 0; + } /* * If RCU does not need a quiescent state from this CPU, @@ -1020,97 +1016,56 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat /* Prior grace period ended, so advance callbacks for current CPU. */ __rcu_process_gp_end(rsp, rnp, rdp); - /* - * Because this CPU just now started the new grace period, we know - * that all of its callbacks will be covered by this upcoming grace - * period, even the ones that were registered arbitrarily recently. - * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL. - * - * Other CPUs cannot be sure exactly when the grace period started. - * Therefore, their recently registered callbacks must pass through - * an additional RCU_NEXT_READY stage, so that they will be handled - * by the next RCU grace period. - */ - rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - /* Set state so that this CPU will detect the next quiescent state. */ __note_new_gpnum(rsp, rnp, rdp); } /* - * Start a new RCU grace period if warranted, re-initializing the hierarchy - * in preparation for detecting the next grace period. The caller must hold - * the root node's ->lock, which is released before return. Hard irqs must - * be disabled. - * - * Note that it is legal for a dying CPU (which is marked as offline) to - * invoke this function. This can happen when the dying CPU reports its - * quiescent state. + * Initialize a new grace period. */ -static void -rcu_start_gp(struct rcu_state *rsp, unsigned long flags) - __releases(rcu_get_root(rsp)->lock) +static int rcu_gp_init(struct rcu_state *rsp) { - struct rcu_data *rdp = this_cpu_ptr(rsp->rda); + struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); - if (!rcu_scheduler_fully_active || - !cpu_needs_another_gp(rsp, rdp)) { - /* - * Either the scheduler hasn't yet spawned the first - * non-idle task or this CPU does not need another - * grace period. Either way, don't start a new grace - * period. - */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } + raw_spin_lock_irq(&rnp->lock); + rsp->gp_flags = 0; /* Clear all flags: New grace period. */ - if (rsp->fqs_active) { - /* - * This CPU needs a grace period, but force_quiescent_state() - * is running. Tell it to start one on this CPU's behalf. - */ - rsp->fqs_need_gp = 1; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; + if (rcu_gp_in_progress(rsp)) { + /* Grace period already in progress, don't start another. */ + raw_spin_unlock_irq(&rnp->lock); + return 0; } /* Advance to a new grace period and initialize state. */ rsp->gpnum++; trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); - WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT); - rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */ - rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; record_gp_stall_check_time(rsp); - raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ + raw_spin_unlock_irq(&rnp->lock); /* Exclude any concurrent CPU-hotplug operations. */ - raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ + get_online_cpus(); /* * Set the quiescent-state-needed bits in all the rcu_node - * structures for all currently online CPUs in breadth-first - * order, starting from the root rcu_node structure. This - * operation relies on the layout of the hierarchy within the - * rsp->node[] array. Note that other CPUs will access only - * the leaves of the hierarchy, which still indicate that no + * structures for all currently online CPUs in breadth-first order, + * starting from the root rcu_node structure, relying on the layout + * of the tree within the rsp->node[] array. Note that other CPUs + * will access only the leaves of the hierarchy, thus seeing that no * grace period is in progress, at least until the corresponding * leaf node has been initialized. In addition, we have excluded * CPU-hotplug operations. * - * Note that the grace period cannot complete until we finish - * the initialization process, as there will be at least one - * qsmask bit set in the root node until that time, namely the - * one corresponding to this CPU, due to the fact that we have - * irqs disabled. + * The grace period cannot complete until the initialization + * process finishes, because this kthread handles both. */ rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock_irq(&rnp->lock); + rdp = this_cpu_ptr(rsp->rda); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; rnp->gpnum = rsp->gpnum; + WARN_ON_ONCE(rnp->completed != rsp->completed); rnp->completed = rsp->completed; if (rnp == rdp->mynode) rcu_start_gp_per_cpu(rsp, rnp, rdp); @@ -1118,37 +1073,54 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) trace_rcu_grace_period_init(rsp->name, rnp->gpnum, rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_irq(&rnp->lock); +#ifdef CONFIG_PROVE_RCU_DELAY + if ((random32() % (rcu_num_nodes * 8)) == 0) + schedule_timeout_uninterruptible(2); +#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ + cond_resched(); } - rnp = rcu_get_root(rsp); - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); + put_online_cpus(); + return 1; } /* - * Report a full set of quiescent states to the specified rcu_state - * data structure. This involves cleaning up after the prior grace - * period and letting rcu_start_gp() start up the next grace period - * if one is needed. Note that the caller must hold rnp->lock, as - * required by rcu_start_gp(), which will release it. + * Do one round of quiescent-state forcing. */ -static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) - __releases(rcu_get_root(rsp)->lock) +int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) { - unsigned long gp_duration; + int fqs_state = fqs_state_in; struct rcu_node *rnp = rcu_get_root(rsp); - struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); + rsp->n_force_qs++; + if (fqs_state == RCU_SAVE_DYNTICK) { + /* Collect dyntick-idle snapshots. */ + force_qs_rnp(rsp, dyntick_save_progress_counter); + fqs_state = RCU_FORCE_QS; + } else { + /* Handle dyntick-idle and offline CPUs. */ + force_qs_rnp(rsp, rcu_implicit_dynticks_qs); + } + /* Clear flag to prevent immediate re-entry. */ + if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { + raw_spin_lock_irq(&rnp->lock); + rsp->gp_flags &= ~RCU_GP_FLAG_FQS; + raw_spin_unlock_irq(&rnp->lock); + } + return fqs_state; +} - /* - * Ensure that all grace-period and pre-grace-period activity - * is seen before the assignment to rsp->completed. - */ - smp_mb(); /* See above block comment. */ +/* + * Clean up after the old grace period. + */ +static void rcu_gp_cleanup(struct rcu_state *rsp) +{ + unsigned long gp_duration; + struct rcu_data *rdp; + struct rcu_node *rnp = rcu_get_root(rsp); + + raw_spin_lock_irq(&rnp->lock); gp_duration = jiffies - rsp->gp_start; if (gp_duration > rsp->gp_max) rsp->gp_max = gp_duration; @@ -1160,35 +1132,149 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) * they can do to advance the grace period. It is therefore * safe for us to drop the lock in order to mark the grace * period as completed in all of the rcu_node structures. - * - * But if this CPU needs another grace period, it will take - * care of this while initializing the next grace period. - * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL - * because the callbacks have not yet been advanced: Those - * callbacks are waiting on the grace period that just now - * completed. */ - if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_irq(&rnp->lock); - /* - * Propagate new ->completed value to rcu_node structures - * so that other CPUs don't have to wait until the start - * of the next grace period to process their callbacks. - */ - rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rnp->completed = rsp->gpnum; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - } - rnp = rcu_get_root(rsp); - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + /* + * Propagate new ->completed value to rcu_node structures so + * that other CPUs don't have to wait until the start of the next + * grace period to process their callbacks. This also avoids + * some nasty RCU grace-period initialization races by forcing + * the end of the current grace period to be completely recorded in + * all of the rcu_node structures before the beginning of the next + * grace period is recorded in any of the rcu_node structures. + */ + rcu_for_each_node_breadth_first(rsp, rnp) { + raw_spin_lock_irq(&rnp->lock); + rnp->completed = rsp->gpnum; + raw_spin_unlock_irq(&rnp->lock); + cond_resched(); } + rnp = rcu_get_root(rsp); + raw_spin_lock_irq(&rnp->lock); - rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ + rsp->completed = rsp->gpnum; /* Declare grace period done. */ trace_rcu_grace_period(rsp->name, rsp->completed, "end"); rsp->fqs_state = RCU_GP_IDLE; - rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ + rdp = this_cpu_ptr(rsp->rda); + if (cpu_needs_another_gp(rsp, rdp)) + rsp->gp_flags = 1; + raw_spin_unlock_irq(&rnp->lock); +} + +/* + * Body of kthread that handles grace periods. + */ +static int __noreturn rcu_gp_kthread(void *arg) +{ + int fqs_state; + unsigned long j; + int ret; + struct rcu_state *rsp = arg; + struct rcu_node *rnp = rcu_get_root(rsp); + + for (;;) { + + /* Handle grace-period start. */ + for (;;) { + wait_event_interruptible(rsp->gp_wq, + rsp->gp_flags & + RCU_GP_FLAG_INIT); + if ((rsp->gp_flags & RCU_GP_FLAG_INIT) && + rcu_gp_init(rsp)) + break; + cond_resched(); + flush_signals(current); + } + + /* Handle quiescent-state forcing. */ + fqs_state = RCU_SAVE_DYNTICK; + j = jiffies_till_first_fqs; + if (j > HZ) { + j = HZ; + jiffies_till_first_fqs = HZ; + } + for (;;) { + rsp->jiffies_force_qs = jiffies + j; + ret = wait_event_interruptible_timeout(rsp->gp_wq, + (rsp->gp_flags & RCU_GP_FLAG_FQS) || + (!ACCESS_ONCE(rnp->qsmask) && + !rcu_preempt_blocked_readers_cgp(rnp)), + j); + /* If grace period done, leave loop. */ + if (!ACCESS_ONCE(rnp->qsmask) && + !rcu_preempt_blocked_readers_cgp(rnp)) + break; + /* If time for quiescent-state forcing, do it. */ + if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) { + fqs_state = rcu_gp_fqs(rsp, fqs_state); + cond_resched(); + } else { + /* Deal with stray signal. */ + cond_resched(); + flush_signals(current); + } + j = jiffies_till_next_fqs; + if (j > HZ) { + j = HZ; + jiffies_till_next_fqs = HZ; + } else if (j < 1) { + j = 1; + jiffies_till_next_fqs = 1; + } + } + + /* Handle grace-period end. */ + rcu_gp_cleanup(rsp); + } +} + +/* + * Start a new RCU grace period if warranted, re-initializing the hierarchy + * in preparation for detecting the next grace period. The caller must hold + * the root node's ->lock, which is released before return. Hard irqs must + * be disabled. + * + * Note that it is legal for a dying CPU (which is marked as offline) to + * invoke this function. This can happen when the dying CPU reports its + * quiescent state. + */ +static void +rcu_start_gp(struct rcu_state *rsp, unsigned long flags) + __releases(rcu_get_root(rsp)->lock) +{ + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); + struct rcu_node *rnp = rcu_get_root(rsp); + + if (!rsp->gp_kthread || + !cpu_needs_another_gp(rsp, rdp)) { + /* + * Either we have not yet spawned the grace-period + * task or this CPU does not need another grace period. + * Either way, don't start a new grace period. + */ + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + + rsp->gp_flags = RCU_GP_FLAG_INIT; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + wake_up(&rsp->gp_wq); +} + +/* + * Report a full set of quiescent states to the specified rcu_state + * data structure. This involves cleaning up after the prior grace + * period and letting rcu_start_gp() start up the next grace period + * if one is needed. Note that the caller must hold rnp->lock, as + * required by rcu_start_gp(), which will release it. + */ +static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) + __releases(rcu_get_root(rsp)->lock) +{ + WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); + raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); + wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ } /* @@ -1257,7 +1343,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, * based on quiescent states detected in an earlier grace period! */ static void -rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastgp) +rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; unsigned long mask; @@ -1265,7 +1351,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); - if (lastgp != rnp->gpnum || rnp->completed == rnp->gpnum) { + if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || + rnp->completed == rnp->gpnum) { /* * The grace period in which this quiescent state was @@ -1324,7 +1411,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Tell RCU we are done (but rcu_report_qs_rdp() will be the * judge of that). */ - rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesce_gpnum); + rcu_report_qs_rdp(rdp->cpu, rsp, rdp); } #ifdef CONFIG_HOTPLUG_CPU @@ -1389,17 +1476,6 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) int i; struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); - /* - * If there is an rcu_barrier() operation in progress, then - * only the task doing that operation is permitted to adopt - * callbacks. To do otherwise breaks rcu_barrier() and friends - * by causing them to fail to wait for the callbacks in the - * orphanage. - */ - if (rsp->rcu_barrier_in_progress && - rsp->rcu_barrier_in_progress != current) - return; - /* Do the accounting first. */ rdp->qlen_lazy += rsp->qlen_lazy; rdp->qlen += rsp->qlen; @@ -1454,9 +1530,8 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) * The CPU has been completely removed, and some other CPU is reporting * this fact from process context. Do the remainder of the cleanup, * including orphaning the outgoing CPU's RCU callbacks, and also - * adopting them, if there is no _rcu_barrier() instance running. - * There can only be one CPU hotplug operation at a time, so no other - * CPU can be attempting to update rcu_cpu_kthread_task. + * adopting them. There can only be one CPU hotplug operation at a time, + * so no other CPU can be attempting to update rcu_cpu_kthread_task. */ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { @@ -1513,14 +1588,13 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", cpu, rdp->qlen, rdp->nxtlist); + init_callback_list(rdp); + /* Disallow further callbacks on this CPU. */ + rdp->nxttail[RCU_NEXT_TAIL] = NULL; } #else /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) -{ -} - static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { } @@ -1685,6 +1759,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) struct rcu_node *rnp; rcu_for_each_leaf_node(rsp, rnp) { + cond_resched(); mask = 0; raw_spin_lock_irqsave(&rnp->lock, flags); if (!rcu_gp_in_progress(rsp)) { @@ -1721,72 +1796,39 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) * Force quiescent states on reluctant CPUs, and also detect which * CPUs are in dyntick-idle mode. */ -static void force_quiescent_state(struct rcu_state *rsp, int relaxed) +static void force_quiescent_state(struct rcu_state *rsp) { unsigned long flags; - struct rcu_node *rnp = rcu_get_root(rsp); - - trace_rcu_utilization("Start fqs"); - if (!rcu_gp_in_progress(rsp)) { - trace_rcu_utilization("End fqs"); - return; /* No grace period in progress, nothing to force. */ - } - if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { - rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ - trace_rcu_utilization("End fqs"); - return; /* Someone else is already on the job. */ - } - if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) - goto unlock_fqs_ret; /* no emergency and done recently. */ - rsp->n_force_qs++; - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; - if(!rcu_gp_in_progress(rsp)) { - rsp->n_force_qs_ngp++; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ - goto unlock_fqs_ret; /* no GP in progress, time updated. */ - } - rsp->fqs_active = 1; - switch (rsp->fqs_state) { - case RCU_GP_IDLE: - case RCU_GP_INIT: - - break; /* grace period idle or initializing, ignore. */ - - case RCU_SAVE_DYNTICK: - - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ - - /* Record dyntick-idle state. */ - force_qs_rnp(rsp, dyntick_save_progress_counter); - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - if (rcu_gp_in_progress(rsp)) - rsp->fqs_state = RCU_FORCE_QS; - break; - - case RCU_FORCE_QS: - - /* Check dyntick-idle state, send IPI to laggarts. */ - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ - force_qs_rnp(rsp, rcu_implicit_dynticks_qs); - - /* Leave state in case more forcing is required. */ - - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - break; + bool ret; + struct rcu_node *rnp; + struct rcu_node *rnp_old = NULL; + + /* Funnel through hierarchy to reduce memory contention. */ + rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; + for (; rnp != NULL; rnp = rnp->parent) { + ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) || + !raw_spin_trylock(&rnp->fqslock); + if (rnp_old != NULL) + raw_spin_unlock(&rnp_old->fqslock); + if (ret) { + rsp->n_force_qs_lh++; + return; + } + rnp_old = rnp; } - rsp->fqs_active = 0; - if (rsp->fqs_need_gp) { - raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ - rsp->fqs_need_gp = 0; - rcu_start_gp(rsp, flags); /* releases rnp->lock */ - trace_rcu_utilization("End fqs"); - return; + /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ + + /* Reached the root of the rcu_node tree, acquire lock. */ + raw_spin_lock_irqsave(&rnp_old->lock, flags); + raw_spin_unlock(&rnp_old->fqslock); + if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { + rsp->n_force_qs_lh++; + raw_spin_unlock_irqrestore(&rnp_old->lock, flags); + return; /* Someone beat us to it. */ } - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ -unlock_fqs_ret: - raw_spin_unlock_irqrestore(&rsp->fqslock, flags); - trace_rcu_utilization("End fqs"); + rsp->gp_flags |= RCU_GP_FLAG_FQS; + raw_spin_unlock_irqrestore(&rnp_old->lock, flags); + wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ } /* @@ -1803,13 +1845,6 @@ __rcu_process_callbacks(struct rcu_state *rsp) WARN_ON_ONCE(rdp->beenonline == 0); /* - * If an RCU GP has gone long enough, go check for dyntick - * idle CPUs and, if needed, send resched IPIs. - */ - if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) - force_quiescent_state(rsp, 1); - - /* * Advance callbacks in response to end of earlier grace * period that some other CPU ended. */ @@ -1836,6 +1871,8 @@ static void rcu_process_callbacks(struct softirq_action *unused) { struct rcu_state *rsp; + if (cpu_is_offline(smp_processor_id())) + return; trace_rcu_utilization("Start RCU core"); for_each_rcu_flavor(rsp) __rcu_process_callbacks(rsp); @@ -1907,12 +1944,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, rdp->blimit = LONG_MAX; if (rsp->n_force_qs == rdp->n_force_qs_snap && *rdp->nxttail[RCU_DONE_TAIL] != head) - force_quiescent_state(rsp, 0); + force_quiescent_state(rsp); rdp->n_force_qs_snap = rsp->n_force_qs; rdp->qlen_last_fqs_check = rdp->qlen; } - } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) - force_quiescent_state(rsp, 1); + } } static void @@ -1927,8 +1963,6 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), head->func = func; head->next = NULL; - smp_mb(); /* Ensure RCU update seen before callback registry. */ - /* * Opportunistically note grace-period endings and beginnings. * Note that we might see a beginning right after we see an @@ -1939,6 +1973,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rdp = this_cpu_ptr(rsp->rda); /* Add the callback to our list. */ + if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL)) { + /* _call_rcu() is illegal on offline CPU; leak the callback. */ + WARN_ON_ONCE(1); + local_irq_restore(flags); + return; + } ACCESS_ONCE(rdp->qlen)++; if (lazy) rdp->qlen_lazy++; @@ -2193,17 +2233,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && rdp->qs_pending && !rdp->passed_quiesce) { - - /* - * If force_quiescent_state() coming soon and this CPU - * needs a quiescent state, and this is either RCU-sched - * or RCU-bh, force a local reschedule. - */ rdp->n_rp_qs_pending++; - if (!rdp->preemptible && - ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, - jiffies)) - set_need_resched(); } else if (rdp->qs_pending && rdp->passed_quiesce) { rdp->n_rp_report_qs++; return 1; @@ -2233,13 +2263,6 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) return 1; } - /* Has an RCU GP gone long enough to send resched IPIs &c? */ - if (rcu_gp_in_progress(rsp) && - ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) { - rdp->n_rp_need_fqs++; - return 1; - } - /* nothing to do */ rdp->n_rp_need_nothing++; return 0; @@ -2324,13 +2347,10 @@ static void rcu_barrier_func(void *type) static void _rcu_barrier(struct rcu_state *rsp) { int cpu; - unsigned long flags; struct rcu_data *rdp; - struct rcu_data rd; unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done); unsigned long snap_done; - init_rcu_head_on_stack(&rd.barrier_head); _rcu_barrier_trace(rsp, "Begin", -1, snap); /* Take mutex to serialize concurrent rcu_barrier() requests. */ @@ -2370,70 +2390,30 @@ static void _rcu_barrier(struct rcu_state *rsp) /* * Initialize the count to one rather than to zero in order to * avoid a too-soon return to zero in case of a short grace period - * (or preemption of this task). Also flag this task as doing - * an rcu_barrier(). This will prevent anyone else from adopting - * orphaned callbacks, which could cause otherwise failure if a - * CPU went offline and quickly came back online. To see this, - * consider the following sequence of events: - * - * 1. We cause CPU 0 to post an rcu_barrier_callback() callback. - * 2. CPU 1 goes offline, orphaning its callbacks. - * 3. CPU 0 adopts CPU 1's orphaned callbacks. - * 4. CPU 1 comes back online. - * 5. We cause CPU 1 to post an rcu_barrier_callback() callback. - * 6. Both rcu_barrier_callback() callbacks are invoked, awakening - * us -- but before CPU 1's orphaned callbacks are invoked!!! + * (or preemption of this task). Exclude CPU-hotplug operations + * to ensure that no offline CPU has callbacks queued. */ init_completion(&rsp->barrier_completion); atomic_set(&rsp->barrier_cpu_count, 1); - raw_spin_lock_irqsave(&rsp->onofflock, flags); - rsp->rcu_barrier_in_progress = current; - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); + get_online_cpus(); /* - * Force every CPU with callbacks to register a new callback - * that will tell us when all the preceding callbacks have - * been invoked. If an offline CPU has callbacks, wait for - * it to either come back online or to finish orphaning those - * callbacks. + * Force each CPU with callbacks to register a new callback. + * When that callback is invoked, we will know that all of the + * corresponding CPU's preceding callbacks have been invoked. */ - for_each_possible_cpu(cpu) { - preempt_disable(); + for_each_online_cpu(cpu) { rdp = per_cpu_ptr(rsp->rda, cpu); - if (cpu_is_offline(cpu)) { - _rcu_barrier_trace(rsp, "Offline", cpu, - rsp->n_barrier_done); - preempt_enable(); - while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) - schedule_timeout_interruptible(1); - } else if (ACCESS_ONCE(rdp->qlen)) { + if (ACCESS_ONCE(rdp->qlen)) { _rcu_barrier_trace(rsp, "OnlineQ", cpu, rsp->n_barrier_done); smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); - preempt_enable(); } else { _rcu_barrier_trace(rsp, "OnlineNQ", cpu, rsp->n_barrier_done); - preempt_enable(); } } - - /* - * Now that all online CPUs have rcu_barrier_callback() callbacks - * posted, we can adopt all of the orphaned callbacks and place - * an rcu_barrier_callback() callback after them. When that is done, - * we are guaranteed to have an rcu_barrier_callback() callback - * following every callback that could possibly have been - * registered before _rcu_barrier() was called. - */ - raw_spin_lock_irqsave(&rsp->onofflock, flags); - rcu_adopt_orphan_cbs(rsp); - rsp->rcu_barrier_in_progress = NULL; - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); - atomic_inc(&rsp->barrier_cpu_count); - smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ - rd.rsp = rsp; - rsp->call(&rd.barrier_head, rcu_barrier_callback); + put_online_cpus(); /* * Now that we have an rcu_barrier_callback() callback on each @@ -2454,8 +2434,6 @@ static void _rcu_barrier(struct rcu_state *rsp) /* Other rcu_barrier() invocations can now safely proceed. */ mutex_unlock(&rsp->barrier_mutex); - - destroy_rcu_head_on_stack(&rd.barrier_head); } /** @@ -2521,6 +2499,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; + init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; atomic_set(&rdp->dynticks->dynticks, (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); @@ -2553,7 +2532,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->completed = rnp->completed; rdp->passed_quiesce = 0; rdp->qs_pending = 0; - rdp->passed_quiesce_gpnum = rnp->gpnum - 1; trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); } raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ @@ -2623,6 +2601,28 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, } /* + * Spawn the kthread that handles this RCU flavor's grace periods. + */ +static int __init rcu_spawn_gp_kthread(void) +{ + unsigned long flags; + struct rcu_node *rnp; + struct rcu_state *rsp; + struct task_struct *t; + + for_each_rcu_flavor(rsp) { + t = kthread_run(rcu_gp_kthread, rsp, rsp->name); + BUG_ON(IS_ERR(t)); + rnp = rcu_get_root(rsp); + raw_spin_lock_irqsave(&rnp->lock, flags); + rsp->gp_kthread = t; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } + return 0; +} +early_initcall(rcu_spawn_gp_kthread); + +/* * This function is invoked towards the end of the scheduler's initialization * process. Before this is called, the idle task might contain * RCU read-side critical sections (during which time, this idle @@ -2657,7 +2657,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) int cprv; int i; - cprv = NR_CPUS; + cprv = nr_cpu_ids; for (i = rcu_num_lvls - 1; i >= 0; i--) { ccur = rsp->levelcnt[i]; rsp->levelspread[i] = (cprv + ccur - 1) / ccur; @@ -2672,10 +2672,14 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) static void __init rcu_init_one(struct rcu_state *rsp, struct rcu_data __percpu *rda) { - static char *buf[] = { "rcu_node_level_0", - "rcu_node_level_1", - "rcu_node_level_2", - "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */ + static char *buf[] = { "rcu_node_0", + "rcu_node_1", + "rcu_node_2", + "rcu_node_3" }; /* Match MAX_RCU_LVLS */ + static char *fqs[] = { "rcu_node_fqs_0", + "rcu_node_fqs_1", + "rcu_node_fqs_2", + "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ int cpustride = 1; int i; int j; @@ -2700,7 +2704,11 @@ static void __init rcu_init_one(struct rcu_state *rsp, raw_spin_lock_init(&rnp->lock); lockdep_set_class_and_name(&rnp->lock, &rcu_node_class[i], buf[i]); - rnp->gpnum = 0; + raw_spin_lock_init(&rnp->fqslock); + lockdep_set_class_and_name(&rnp->fqslock, + &rcu_fqs_class[i], fqs[i]); + rnp->gpnum = rsp->gpnum; + rnp->completed = rsp->completed; rnp->qsmask = 0; rnp->qsmaskinit = 0; rnp->grplo = j * cpustride; @@ -2723,6 +2731,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, } rsp->rda = rda; + init_waitqueue_head(&rsp->gp_wq); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) @@ -2746,7 +2755,8 @@ static void __init rcu_init_geometry(void) int rcu_capacity[MAX_RCU_LVLS + 1]; /* If the compile-time values are accurate, just leave. */ - if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF) + if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && + nr_cpu_ids == NR_CPUS) return; /* diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 1224d4c05382..7576fd4d8ce6 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -196,6 +196,7 @@ struct rcu_node { /* Refused to boost: not sure why, though. */ /* This can happen due to race conditions. */ #endif /* #ifdef CONFIG_RCU_BOOST */ + raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; } ____cacheline_internodealigned_in_smp; /* @@ -239,8 +240,6 @@ struct rcu_data { /* in order to detect GP end. */ unsigned long gpnum; /* Highest gp number that this CPU */ /* is aware of having started. */ - unsigned long passed_quiesce_gpnum; - /* gpnum at time of quiescent state. */ bool passed_quiesce; /* User-mode/idle loop etc. */ bool qs_pending; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ @@ -306,11 +305,13 @@ struct rcu_data { unsigned long n_rp_cpu_needs_gp; unsigned long n_rp_gp_completed; unsigned long n_rp_gp_started; - unsigned long n_rp_need_fqs; unsigned long n_rp_need_nothing; - /* 6) _rcu_barrier() callback. */ + /* 6) _rcu_barrier() and OOM callbacks. */ struct rcu_head barrier_head; +#ifdef CONFIG_RCU_FAST_NO_HZ + struct rcu_head oom_head; +#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ int cpu; struct rcu_state *rsp; @@ -369,20 +370,17 @@ struct rcu_state { u8 fqs_state ____cacheline_internodealigned_in_smp; /* Force QS state. */ - u8 fqs_active; /* force_quiescent_state() */ - /* is running. */ - u8 fqs_need_gp; /* A CPU was prevented from */ - /* starting a new grace */ - /* period because */ - /* force_quiescent_state() */ - /* was running. */ u8 boost; /* Subject to priority boost. */ unsigned long gpnum; /* Current gp number. */ unsigned long completed; /* # of last completed gp. */ + struct task_struct *gp_kthread; /* Task for grace periods. */ + wait_queue_head_t gp_wq; /* Where GP task waits. */ + int gp_flags; /* Commands for GP task. */ /* End of fields guarded by root rcu_node's lock. */ - raw_spinlock_t onofflock; /* exclude on/offline and */ + raw_spinlock_t onofflock ____cacheline_internodealigned_in_smp; + /* exclude on/offline and */ /* starting new GP. */ struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ /* need a grace period. */ @@ -392,16 +390,11 @@ struct rcu_state { struct rcu_head **orphan_donetail; /* Tail of above. */ long qlen_lazy; /* Number of lazy callbacks. */ long qlen; /* Total number of callbacks. */ - struct task_struct *rcu_barrier_in_progress; - /* Task doing rcu_barrier(), */ - /* or NULL if no barrier. */ struct mutex barrier_mutex; /* Guards barrier fields. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ struct completion barrier_completion; /* Wake at barrier end. */ unsigned long n_barrier_done; /* ++ at start and end of */ /* _rcu_barrier(). */ - raw_spinlock_t fqslock; /* Only one task forcing */ - /* quiescent states. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ unsigned long n_force_qs; /* Number of calls to */ @@ -420,6 +413,10 @@ struct rcu_state { struct list_head flavors; /* List of RCU flavors. */ }; +/* Values for rcu_state structure's gp_flags field. */ +#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ +#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ + extern struct list_head rcu_struct_flavors; #define for_each_rcu_flavor(rsp) \ list_for_each_entry((rsp), &rcu_struct_flavors, flavors) diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c1961aed1213..9c71c1b18e03 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -25,6 +25,7 @@ */ #include <linux/delay.h> +#include <linux/oom.h> #include <linux/smpboot.h> #define RCU_KTHREAD_PRIO 1 @@ -119,7 +120,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed); */ void rcu_force_quiescent_state(void) { - force_quiescent_state(&rcu_preempt_state, 0); + force_quiescent_state(&rcu_preempt_state); } EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); @@ -137,8 +138,6 @@ static void rcu_preempt_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); - rdp->passed_quiesce_gpnum = rdp->gpnum; - barrier(); if (rdp->passed_quiesce == 0) trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); rdp->passed_quiesce = 1; @@ -423,9 +422,11 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) unsigned long flags; struct task_struct *t; - if (!rcu_preempt_blocked_readers_cgp(rnp)) - return; raw_spin_lock_irqsave(&rnp->lock, flags); + if (!rcu_preempt_blocked_readers_cgp(rnp)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } t = list_entry(rnp->gp_tasks, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) @@ -585,17 +586,23 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ } + rnp->gp_tasks = NULL; + rnp->exp_tasks = NULL; #ifdef CONFIG_RCU_BOOST - /* In case root is being boosted and leaf is not. */ + rnp->boost_tasks = NULL; + /* + * In case root is being boosted and leaf was not. Make sure + * that we boost the tasks blocking the current grace period + * in this case. + */ raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ if (rnp_root->boost_tasks != NULL && - rnp_root->boost_tasks != rnp_root->gp_tasks) + rnp_root->boost_tasks != rnp_root->gp_tasks && + rnp_root->boost_tasks != rnp_root->exp_tasks) rnp_root->boost_tasks = rnp_root->gp_tasks; raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ #endif /* #ifdef CONFIG_RCU_BOOST */ - rnp->gp_tasks = NULL; - rnp->exp_tasks = NULL; return retval; } @@ -677,7 +684,7 @@ void synchronize_rcu(void) EXPORT_SYMBOL_GPL(synchronize_rcu); static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); -static long sync_rcu_preempt_exp_count; +static unsigned long sync_rcu_preempt_exp_count; static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); /* @@ -792,7 +799,7 @@ void synchronize_rcu_expedited(void) unsigned long flags; struct rcu_node *rnp; struct rcu_state *rsp = &rcu_preempt_state; - long snap; + unsigned long snap; int trycount = 0; smp_mb(); /* Caller's modifications seen first by other CPUs. */ @@ -800,33 +807,47 @@ void synchronize_rcu_expedited(void) smp_mb(); /* Above access cannot bleed into critical section. */ /* + * Block CPU-hotplug operations. This means that any CPU-hotplug + * operation that finds an rcu_node structure with tasks in the + * process of being boosted will know that all tasks blocking + * this expedited grace period will already be in the process of + * being boosted. This simplifies the process of moving tasks + * from leaf to root rcu_node structures. + */ + get_online_cpus(); + + /* * Acquire lock, falling back to synchronize_rcu() if too many * lock-acquisition failures. Of course, if someone does the * expedited grace period for us, just leave. */ while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { + if (ULONG_CMP_LT(snap, + ACCESS_ONCE(sync_rcu_preempt_exp_count))) { + put_online_cpus(); + goto mb_ret; /* Others did our work for us. */ + } if (trycount++ < 10) { udelay(trycount * num_online_cpus()); } else { + put_online_cpus(); synchronize_rcu(); return; } - if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) - goto mb_ret; /* Others did our work for us. */ } - if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) + if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) { + put_online_cpus(); goto unlock_mb_ret; /* Others did our work for us. */ + } /* force all RCU readers onto ->blkd_tasks lists. */ synchronize_sched_expedited(); - raw_spin_lock_irqsave(&rsp->onofflock, flags); - /* Initialize ->expmask for all non-leaf rcu_node structures. */ rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock_irqsave(&rnp->lock, flags); rnp->expmask = rnp->qsmaskinit; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* Snapshot current state of ->blkd_tasks lists. */ @@ -835,7 +856,7 @@ void synchronize_rcu_expedited(void) if (NUM_RCU_NODES > 1) sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); + put_online_cpus(); /* Wait for snapshotted ->blkd_tasks lists to drain. */ rnp = rcu_get_root(rsp); @@ -1192,9 +1213,9 @@ static int rcu_boost_kthread(void *arg) * kthread to start boosting them. If there is an expedited grace * period in progress, it is always time to boost. * - * The caller must hold rnp->lock, which this function releases, - * but irqs remain disabled. The ->boost_kthread_task is immortal, - * so we don't need to worry about it going away. + * The caller must hold rnp->lock, which this function releases. + * The ->boost_kthread_task is immortal, so we don't need to worry + * about it going away. */ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) { @@ -1814,16 +1835,16 @@ static void rcu_prepare_for_idle(int cpu) #ifdef CONFIG_TREE_PREEMPT_RCU if (per_cpu(rcu_preempt_data, cpu).nxtlist) { rcu_preempt_qs(cpu); - force_quiescent_state(&rcu_preempt_state, 0); + force_quiescent_state(&rcu_preempt_state); } #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ if (per_cpu(rcu_sched_data, cpu).nxtlist) { rcu_sched_qs(cpu); - force_quiescent_state(&rcu_sched_state, 0); + force_quiescent_state(&rcu_sched_state); } if (per_cpu(rcu_bh_data, cpu).nxtlist) { rcu_bh_qs(cpu); - force_quiescent_state(&rcu_bh_state, 0); + force_quiescent_state(&rcu_bh_state); } /* @@ -1851,6 +1872,88 @@ static void rcu_idle_count_callbacks_posted(void) __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); } +/* + * Data for flushing lazy RCU callbacks at OOM time. + */ +static atomic_t oom_callback_count; +static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq); + +/* + * RCU OOM callback -- decrement the outstanding count and deliver the + * wake-up if we are the last one. + */ +static void rcu_oom_callback(struct rcu_head *rhp) +{ + if (atomic_dec_and_test(&oom_callback_count)) + wake_up(&oom_callback_wq); +} + +/* + * Post an rcu_oom_notify callback on the current CPU if it has at + * least one lazy callback. This will unnecessarily post callbacks + * to CPUs that already have a non-lazy callback at the end of their + * callback list, but this is an infrequent operation, so accept some + * extra overhead to keep things simple. + */ +static void rcu_oom_notify_cpu(void *unused) +{ + struct rcu_state *rsp; + struct rcu_data *rdp; + + for_each_rcu_flavor(rsp) { + rdp = __this_cpu_ptr(rsp->rda); + if (rdp->qlen_lazy != 0) { + atomic_inc(&oom_callback_count); + rsp->call(&rdp->oom_head, rcu_oom_callback); + } + } +} + +/* + * If low on memory, ensure that each CPU has a non-lazy callback. + * This will wake up CPUs that have only lazy callbacks, in turn + * ensuring that they free up the corresponding memory in a timely manner. + * Because an uncertain amount of memory will be freed in some uncertain + * timeframe, we do not claim to have freed anything. + */ +static int rcu_oom_notify(struct notifier_block *self, + unsigned long notused, void *nfreed) +{ + int cpu; + + /* Wait for callbacks from earlier instance to complete. */ + wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0); + + /* + * Prevent premature wakeup: ensure that all increments happen + * before there is a chance of the counter reaching zero. + */ + atomic_set(&oom_callback_count, 1); + + get_online_cpus(); + for_each_online_cpu(cpu) { + smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); + cond_resched(); + } + put_online_cpus(); + + /* Unconditionally decrement: no need to wake ourselves up. */ + atomic_dec(&oom_callback_count); + + return NOTIFY_OK; +} + +static struct notifier_block rcu_oom_nb = { + .notifier_call = rcu_oom_notify +}; + +static int __init rcu_register_oom_notifier(void) +{ + register_oom_notifier(&rcu_oom_nb); + return 0; +} +early_initcall(rcu_register_oom_notifier); + #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ #ifdef CONFIG_RCU_CPU_STALL_INFO @@ -1861,11 +1964,15 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); struct timer_list *tltp = &rdtp->idle_gp_timer; + char c; - sprintf(cp, "drain=%d %c timer=%lu", - rdtp->dyntick_drain, - rdtp->dyntick_holdoff == jiffies ? 'H' : '.', - timer_pending(tltp) ? tltp->expires - jiffies : -1); + c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.'; + if (timer_pending(tltp)) + sprintf(cp, "drain=%d %c timer=%lu", + rdtp->dyntick_drain, c, tltp->expires - jiffies); + else + sprintf(cp, "drain=%d %c timer not pending", + rdtp->dyntick_drain, c); } #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ @@ -1933,11 +2040,10 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp) /* Increment ->ticks_this_gp for all flavors of RCU. */ static void increment_cpu_stall_ticks(void) { - __get_cpu_var(rcu_sched_data).ticks_this_gp++; - __get_cpu_var(rcu_bh_data).ticks_this_gp++; -#ifdef CONFIG_TREE_PREEMPT_RCU - __get_cpu_var(rcu_preempt_data).ticks_this_gp++; -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + __this_cpu_ptr(rsp->rda)->ticks_this_gp++; } #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 31968931f146..693513bc50e6 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -51,8 +51,8 @@ static int show_rcubarrier(struct seq_file *m, void *unused) struct rcu_state *rsp; for_each_rcu_flavor(rsp) - seq_printf(m, "%s: %c bcc: %d nbd: %lu\n", - rsp->name, rsp->rcu_barrier_in_progress ? 'B' : '.', + seq_printf(m, "%s: bcc: %d nbd: %lu\n", + rsp->name, atomic_read(&rsp->barrier_cpu_count), rsp->n_barrier_done); return 0; @@ -86,12 +86,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) return; - seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pgp=%lu qp=%d", + seq_printf(m, "%3d%cc=%lu g=%lu pq=%d qp=%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', rdp->completed, rdp->gpnum, - rdp->passed_quiesce, rdp->passed_quiesce_gpnum, - rdp->qs_pending); + rdp->passed_quiesce, rdp->qs_pending); seq_printf(m, " dt=%d/%llx/%d df=%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, @@ -149,12 +148,11 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) return; - seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d", + seq_printf(m, "%d,%s,%lu,%lu,%d,%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", rdp->completed, rdp->gpnum, - rdp->passed_quiesce, rdp->passed_quiesce_gpnum, - rdp->qs_pending); + rdp->passed_quiesce, rdp->qs_pending); seq_printf(m, ",%d,%llx,%d,%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, @@ -185,7 +183,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) int cpu; struct rcu_state *rsp; - seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); + seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pq\","); seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); #ifdef CONFIG_RCU_BOOST @@ -385,10 +383,9 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) rdp->n_rp_report_qs, rdp->n_rp_cb_ready, rdp->n_rp_cpu_needs_gp); - seq_printf(m, "gpc=%ld gps=%ld nf=%ld nn=%ld\n", + seq_printf(m, "gpc=%ld gps=%ld nn=%ld\n", rdp->n_rp_gp_completed, rdp->n_rp_gp_started, - rdp->n_rp_need_fqs, rdp->n_rp_need_nothing); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 82ad284f823b..8c38b5e7ce47 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3142,6 +3142,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) #endif +static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) +{ + u64 temp = (__force u64) rtime; + + temp *= (__force u64) utime; + + if (sizeof(cputime_t) == 4) + temp = div_u64(temp, (__force u32) total); + else + temp = div64_u64(temp, (__force u64) total); + + return (__force cputime_t) temp; +} + void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { cputime_t rtime, utime = p->utime, total = utime + p->stime; @@ -3151,13 +3165,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) */ rtime = nsecs_to_cputime(p->se.sum_exec_runtime); - if (total) { - u64 temp = (__force u64) rtime; - - temp *= (__force u64) utime; - do_div(temp, (__force u32) total); - utime = (__force cputime_t) temp; - } else + if (total) + utime = scale_utime(utime, rtime, total); + else utime = rtime; /* @@ -3184,13 +3194,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) total = cputime.utime + cputime.stime; rtime = nsecs_to_cputime(cputime.sum_exec_runtime); - if (total) { - u64 temp = (__force u64) rtime; - - temp *= (__force u64) cputime.utime; - do_div(temp, (__force u32) total); - utime = (__force cputime_t) temp; - } else + if (total) + utime = scale_utime(cputime.utime, rtime, total); + else utime = rtime; sig->prev_utime = max(sig->prev_utime, utime); @@ -5298,27 +5304,17 @@ void idle_task_exit(void) } /* - * While a dead CPU has no uninterruptible tasks queued at this point, - * it might still have a nonzero ->nr_uninterruptible counter, because - * for performance reasons the counter is not stricly tracking tasks to - * their home CPUs. So we just add the counter to another CPU's counter, - * to keep the global sum constant after CPU-down: - */ -static void migrate_nr_uninterruptible(struct rq *rq_src) -{ - struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); - - rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; - rq_src->nr_uninterruptible = 0; -} - -/* - * remove the tasks which were accounted by rq from calc_load_tasks. + * Since this CPU is going 'away' for a while, fold any nr_active delta + * we might have. Assumes we're called after migrate_tasks() so that the + * nr_active count is stable. + * + * Also see the comment "Global load-average calculations". */ -static void calc_global_load_remove(struct rq *rq) +static void calc_load_migrate(struct rq *rq) { - atomic_long_sub(rq->calc_load_active, &calc_load_tasks); - rq->calc_load_active = 0; + long delta = calc_load_fold_active(rq); + if (delta) + atomic_long_add(delta, &calc_load_tasks); } /* @@ -5611,9 +5607,18 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) migrate_tasks(cpu); BUG_ON(rq->nr_running != 1); /* the migration thread */ raw_spin_unlock_irqrestore(&rq->lock, flags); + break; - migrate_nr_uninterruptible(rq); - calc_global_load_remove(rq); + case CPU_DEAD: + { + struct rq *dest_rq; + + local_irq_save(flags); + dest_rq = cpu_rq(smp_processor_id()); + raw_spin_lock(&dest_rq->lock); + calc_load_migrate(rq); + raw_spin_unlock_irqrestore(&dest_rq->lock, flags); + } break; #endif } @@ -7246,6 +7251,7 @@ int in_sched_functions(unsigned long addr) #ifdef CONFIG_CGROUP_SCHED struct task_group root_task_group; +LIST_HEAD(task_groups); #endif DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d0cc03b3e70b..c219bf8d704c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3387,6 +3387,14 @@ static int tg_load_down(struct task_group *tg, void *data) static void update_h_load(long cpu) { + struct rq *rq = cpu_rq(cpu); + unsigned long now = jiffies; + + if (rq->h_load_throttle == now) + return; + + rq->h_load_throttle = now; + rcu_read_lock(); walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); rcu_read_unlock(); @@ -4293,11 +4301,10 @@ redo: env.src_rq = busiest; env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); + update_h_load(env.src_cpu); more_balance: local_irq_save(flags); double_rq_lock(this_rq, busiest); - if (!env.loop) - update_h_load(env.src_cpu); /* * cur_ld_moved - load moved in current iteration diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 573e1ca01102..944cb68420e9 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -788,6 +788,19 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) const struct cpumask *span; span = sched_rt_period_mask(); +#ifdef CONFIG_RT_GROUP_SCHED + /* + * FIXME: isolated CPUs should really leave the root task group, + * whether they are isolcpus or were isolated via cpusets, lest + * the timer run on a CPU which does not service all runqueues, + * potentially leaving other CPUs indefinitely throttled. If + * isolation is really required, the user will turn the throttle + * off to kill the perturbations it causes anyway. Meanwhile, + * this maintains functionality for boot and/or troubleshooting. + */ + if (rt_b == &root_task_group.rt_bandwidth) + span = cpu_online_mask; +#endif for_each_cpu(i, span) { int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c35a1a7dd4d6..f6714d009e77 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -80,7 +80,7 @@ extern struct mutex sched_domains_mutex; struct cfs_rq; struct rt_rq; -static LIST_HEAD(task_groups); +extern struct list_head task_groups; struct cfs_bandwidth { #ifdef CONFIG_CFS_BANDWIDTH @@ -374,7 +374,11 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; -#endif +#ifdef CONFIG_SMP + unsigned long h_load_throttle; +#endif /* CONFIG_SMP */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ + #ifdef CONFIG_RT_GROUP_SCHED struct list_head leaf_rt_rq_list; #endif diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 7b386e86fd23..da5eb5bed84a 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -27,8 +27,10 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) { struct task_struct *stop = rq->stop; - if (stop && stop->on_rq) + if (stop && stop->on_rq) { + stop->se.exec_start = rq->clock_task; return stop; + } return NULL; } @@ -52,6 +54,21 @@ static void yield_task_stop(struct rq *rq) static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) { + struct task_struct *curr = rq->curr; + u64 delta_exec; + + delta_exec = rq->clock_task - curr->se.exec_start; + if (unlikely((s64)delta_exec < 0)) + delta_exec = 0; + + schedstat_set(curr->se.statistics.exec_max, + max(curr->se.statistics.exec_max, delta_exec)); + + curr->se.sum_exec_runtime += delta_exec; + account_group_exec_runtime(curr, delta_exec); + + curr->se.exec_start = rq->clock_task; + cpuacct_charge(curr, delta_exec); } static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) @@ -60,6 +77,9 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) static void set_curr_task_stop(struct rq *rq) { + struct task_struct *stop = rq->stop; + + stop->se.exec_start = rq->clock_task; } static void switched_to_stop(struct rq *rq, struct task_struct *p) diff --git a/kernel/task_work.c b/kernel/task_work.c index 91d4e1742a0c..d320d44903bd 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -75,6 +75,7 @@ void task_work_run(void) p = q->next; q->func(q); q = p; + cond_resched(); } } } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 024540f97f74..4b1785a7bb83 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -436,7 +436,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (unlikely(local_softirq_pending() && cpu_online(cpu))) { static int ratelimit; - if (ratelimit < 10) { + if (ratelimit < 10 && + (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", (unsigned int) local_softirq_pending()); ratelimit++; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e16af197a2bc..34e5eac81424 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -115,6 +115,7 @@ static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) { tk->xtime_sec += ts->tv_sec; tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift; + tk_normalize_xtime(tk); } static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) @@ -276,7 +277,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) tk->xtime_nsec += cycle_delta * tk->mult; /* If arch requires, add in gettimeoffset() */ - tk->xtime_nsec += arch_gettimeoffset() << tk->shift; + tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift; tk_normalize_xtime(tk); @@ -427,7 +428,7 @@ int do_settimeofday(const struct timespec *tv) struct timespec ts_delta, xt; unsigned long flags; - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) + if (!timespec_valid_strict(tv)) return -EINVAL; write_seqlock_irqsave(&tk->lock, flags); @@ -463,6 +464,8 @@ int timekeeping_inject_offset(struct timespec *ts) { struct timekeeper *tk = &timekeeper; unsigned long flags; + struct timespec tmp; + int ret = 0; if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; @@ -471,10 +474,17 @@ int timekeeping_inject_offset(struct timespec *ts) timekeeping_forward_now(tk); + /* Make sure the proposed value is valid */ + tmp = timespec_add(tk_xtime(tk), *ts); + if (!timespec_valid_strict(&tmp)) { + ret = -EINVAL; + goto error; + } tk_xtime_add(tk, ts); tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); +error: /* even if we error out, we forwarded the time, so call update */ timekeeping_update(tk, true); write_sequnlock_irqrestore(&tk->lock, flags); @@ -482,7 +492,7 @@ int timekeeping_inject_offset(struct timespec *ts) /* signal hrtimers about time change */ clock_was_set(); - return 0; + return ret; } EXPORT_SYMBOL(timekeeping_inject_offset); @@ -649,7 +659,20 @@ void __init timekeeping_init(void) struct timespec now, boot, tmp; read_persistent_clock(&now); + if (!timespec_valid_strict(&now)) { + pr_warn("WARNING: Persistent clock returned invalid value!\n" + " Check your CMOS/BIOS settings.\n"); + now.tv_sec = 0; + now.tv_nsec = 0; + } + read_boot_clock(&boot); + if (!timespec_valid_strict(&boot)) { + pr_warn("WARNING: Boot clock returned invalid value!\n" + " Check your CMOS/BIOS settings.\n"); + boot.tv_sec = 0; + boot.tv_nsec = 0; + } seqlock_init(&tk->lock); @@ -690,7 +713,7 @@ static struct timespec timekeeping_suspend_time; static void __timekeeping_inject_sleeptime(struct timekeeper *tk, struct timespec *delta) { - if (!timespec_valid(delta)) { + if (!timespec_valid_strict(delta)) { printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " "sleep delta value!\n"); return; @@ -1129,6 +1152,10 @@ static void update_wall_time(void) offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #endif + /* Check if there's really nothing to do */ + if (offset < tk->cycle_interval) + goto out; + /* * With NO_HZ we may have to accumulate many cycle_intervals * (think "ticks") worth of time at once. To do this efficiently, @@ -1161,9 +1188,9 @@ static void update_wall_time(void) * the vsyscall implementations are converted to use xtime_nsec * (shifted nanoseconds), this can be killed. */ - remainder = tk->xtime_nsec & ((1 << tk->shift) - 1); + remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); tk->xtime_nsec -= remainder; - tk->xtime_nsec += 1 << tk->shift; + tk->xtime_nsec += 1ULL << tk->shift; tk->ntp_error += remainder << tk->ntp_error_shift; /* diff --git a/kernel/timer.c b/kernel/timer.c index a61c09374eba..8c5e7b908c68 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1407,13 +1407,6 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds) #endif -#ifndef __alpha__ - -/* - * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this - * should be moved into arch/i386 instead? - */ - /** * sys_getpid - return the thread group id of the current process * @@ -1469,8 +1462,6 @@ SYSCALL_DEFINE0(getegid) return from_kgid_munged(current_user_ns(), current_egid()); } -#endif - static void process_timeout(unsigned long __data) { wake_up_process((struct task_struct *)__data); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 60e4d7875672..6b245f64c8dd 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -506,6 +506,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) int size; syscall_nr = syscall_get_nr(current, regs); + if (syscall_nr < 0) + return; if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) return; @@ -580,6 +582,8 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) int size; syscall_nr = syscall_get_nr(current, regs); + if (syscall_nr < 0) + return; if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) return; |