Diffstat (limited to 'kernel')
34 files changed, 2155 insertions, 965 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 909a7d31ffd3..86cb5c6e8932 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1215,6 +1215,41 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_free_root(root); } +/* + * look up cgroup associated with current task's cgroup namespace on the + * specified hierarchy + */ +static struct cgroup * +current_cgns_cgroup_from_root(struct cgroup_root *root) +{ + struct cgroup *res = NULL; + struct css_set *cset; + + lockdep_assert_held(&css_set_lock); + + rcu_read_lock(); + + cset = current->nsproxy->cgroup_ns->root_cset; + if (cset == &init_css_set) { + res = &root->cgrp; + } else { + struct cgrp_cset_link *link; + + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + struct cgroup *c = link->cgrp; + + if (c->root == root) { + res = c; + break; + } + } + } + rcu_read_unlock(); + + BUG_ON(!res); + return res; +} + /* look up cgroup associated with given css_set on the specified hierarchy */ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) @@ -1593,6 +1628,33 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) return 0; } +static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, + struct kernfs_root *kf_root) +{ + int len = 0; + char *buf = NULL; + struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root); + struct cgroup *ns_cgroup; + + buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + spin_lock_bh(&css_set_lock); + ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot); + len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX); + spin_unlock_bh(&css_set_lock); + + if (len >= PATH_MAX) + len = -ERANGE; + else if (len > 0) { + seq_escape(sf, buf, " \t\n\\"); + len = 0; + } + kfree(buf); + return len; +} + static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) { @@ -5433,6 +5495,7 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .rename = cgroup_rename, + .show_path = cgroup_show_path, }; static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) diff --git a/kernel/cpu.c b/kernel/cpu.c index 3e3f6e49eabb..d948e44c471e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -703,21 +703,6 @@ static int takedown_cpu(unsigned int cpu) struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int err; - /* - * By now we've cleared cpu_active_mask, wait for all preempt-disabled - * and RCU users of this state to go away such that all new such users - * will observe it. - * - * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might - * not imply sync_sched(), so wait for both. - * - * Do sync before park smpboot threads to take care the rcu boost case. - */ - if (IS_ENABLED(CONFIG_PREEMPT)) - synchronize_rcu_mult(call_rcu, call_rcu_sched); - else - synchronize_rcu(); - /* Park the smpboot threads */ kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread); smpboot_park_threads(cpu); @@ -923,8 +908,6 @@ void cpuhp_online_idle(enum cpuhp_state state) st->state = CPUHP_AP_ONLINE_IDLE; - /* The cpu is marked online, set it active now */ - set_cpu_active(cpu, true); /* Unpark the stopper thread and the hotplug thread of this cpu */ stop_machine_unpark(cpu); kthread_unpark(st->thread); @@ -1236,6 +1219,12 @@ static struct cpuhp_step cpuhp_ap_states[] = { .name = "ap:offline", .cant_stop = true, }, + /* First state is scheduler control. 
Interrupts are disabled */ + [CPUHP_AP_SCHED_STARTING] = { + .name = "sched:starting", + .startup = sched_cpu_starting, + .teardown = sched_cpu_dying, + }, /* * Low level startup/teardown notifiers. Run with interrupts * disabled. Will be removed once the notifiers are converted to @@ -1274,6 +1263,15 @@ static struct cpuhp_step cpuhp_ap_states[] = { * The dynamically registered state space is here */ +#ifdef CONFIG_SMP + /* Last state is scheduler control setting the cpu active */ + [CPUHP_AP_ACTIVE] = { + .name = "sched:active", + .startup = sched_cpu_activate, + .teardown = sched_cpu_deactivate, + }, +#endif + /* CPU is fully up and running. */ [CPUHP_ONLINE] = { .name = "online", diff --git a/kernel/fork.c b/kernel/fork.c index d277e83ed3e0..3e8451527cbe 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1494,7 +1494,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, * sigaltstack should be cleared when sharing the same VM */ if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM) - p->sas_ss_sp = p->sas_ss_size = 0; + sas_ss_reset(p); /* * Syscall tracing and stepping should be turned off in the diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 78c1c0ee6dc1..81f1a7107c0e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -45,6 +45,7 @@ #include <linux/bitops.h> #include <linux/gfp.h> #include <linux/kmemcheck.h> +#include <linux/random.h> #include <asm/sections.h> @@ -708,7 +709,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) * yet. Otherwise we look it up. We cache the result in the lock object * itself, so actual lookup of the hash should be once per lock object. */ -static inline struct lock_class * +static struct lock_class * register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) { struct lockdep_subclass_key *key; @@ -3585,7 +3586,35 @@ static int __lock_is_held(struct lockdep_map *lock) return 0; } -static void __lock_pin_lock(struct lockdep_map *lock) +static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock) +{ + struct pin_cookie cookie = NIL_COOKIE; + struct task_struct *curr = current; + int i; + + if (unlikely(!debug_locks)) + return cookie; + + for (i = 0; i < curr->lockdep_depth; i++) { + struct held_lock *hlock = curr->held_locks + i; + + if (match_held_lock(hlock, lock)) { + /* + * Grab 16bits of randomness; this is sufficient to not + * be guessable and still allows some pin nesting in + * our u32 pin_count. 
+ */ + cookie.val = 1 + (prandom_u32() >> 16); + hlock->pin_count += cookie.val; + return cookie; + } + } + + WARN(1, "pinning an unheld lock\n"); + return cookie; +} + +static void __lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie) { struct task_struct *curr = current; int i; @@ -3597,7 +3626,7 @@ static void __lock_pin_lock(struct lockdep_map *lock) struct held_lock *hlock = curr->held_locks + i; if (match_held_lock(hlock, lock)) { - hlock->pin_count++; + hlock->pin_count += cookie.val; return; } } @@ -3605,7 +3634,7 @@ static void __lock_pin_lock(struct lockdep_map *lock) WARN(1, "pinning an unheld lock\n"); } -static void __lock_unpin_lock(struct lockdep_map *lock) +static void __lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) { struct task_struct *curr = current; int i; @@ -3620,7 +3649,11 @@ static void __lock_unpin_lock(struct lockdep_map *lock) if (WARN(!hlock->pin_count, "unpinning an unpinned lock\n")) return; - hlock->pin_count--; + hlock->pin_count -= cookie.val; + + if (WARN((int)hlock->pin_count < 0, "pin count corrupted\n")) + hlock->pin_count = 0; + return; } } @@ -3751,24 +3784,44 @@ int lock_is_held(struct lockdep_map *lock) } EXPORT_SYMBOL_GPL(lock_is_held); -void lock_pin_lock(struct lockdep_map *lock) +struct pin_cookie lock_pin_lock(struct lockdep_map *lock) { + struct pin_cookie cookie = NIL_COOKIE; unsigned long flags; if (unlikely(current->lockdep_recursion)) - return; + return cookie; raw_local_irq_save(flags); check_flags(flags); current->lockdep_recursion = 1; - __lock_pin_lock(lock); + cookie = __lock_pin_lock(lock); current->lockdep_recursion = 0; raw_local_irq_restore(flags); + + return cookie; } EXPORT_SYMBOL_GPL(lock_pin_lock); -void lock_unpin_lock(struct lockdep_map *lock) +void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie) +{ + unsigned long flags; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + + current->lockdep_recursion = 1; + __lock_repin_lock(lock, cookie); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_repin_lock); + +void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) { unsigned long flags; @@ -3779,7 +3832,7 @@ void lock_unpin_lock(struct lockdep_map *lock) check_flags(flags); current->lockdep_recursion = 1; - __lock_unpin_lock(lock); + __lock_unpin_lock(lock, cookie); current->lockdep_recursion = 0; raw_local_irq_restore(flags); } diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 8ef1919d63b2..f8c5af52a131 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -75,12 +75,7 @@ struct lock_stress_stats { long n_lock_acquired; }; -#if defined(MODULE) -#define LOCKTORTURE_RUNNABLE_INIT 1 -#else -#define LOCKTORTURE_RUNNABLE_INIT 0 -#endif -int torture_runnable = LOCKTORTURE_RUNNABLE_INIT; +int torture_runnable = IS_ENABLED(MODULE); module_param(torture_runnable, int, 0444); MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init"); @@ -394,12 +389,12 @@ static void torture_rtmutex_boost(struct torture_random_state *trsp) if (!rt_task(current)) { /* - * (1) Boost priority once every ~50k operations. When the + * Boost priority once every ~50k operations. When the * task tries to take the lock, the rtmutex it will account * for the new priority, and do any corresponding pi-dance. 
*/ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * factor))) { + if (trsp && !(torture_random(trsp) % + (cxt.nrealwriters_stress * factor))) { policy = SCHED_FIFO; param.sched_priority = MAX_RT_PRIO - 1; } else /* common case, do nothing */ @@ -748,6 +743,15 @@ static void lock_torture_cleanup(void) if (torture_cleanup_begin()) return; + /* + * Indicates early cleanup, meaning that the test has not run, + * such as when passing bogus args when loading the module. As + * such, only perform the underlying torture-specific cleanups, + * and avoid anything related to locktorture. + */ + if (!cxt.lwsa) + goto end; + if (writer_tasks) { for (i = 0; i < cxt.nrealwriters_stress; i++) torture_stop_kthread(lock_torture_writer, @@ -776,6 +780,7 @@ static void lock_torture_cleanup(void) else lock_torture_print_module_parms(cxt.cur_ops, "End of test: SUCCESS"); +end: torture_cleanup_end(); } @@ -870,6 +875,7 @@ static int __init lock_torture_init(void) VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory"); firsterr = -ENOMEM; kfree(cxt.lwsa); + cxt.lwsa = NULL; goto unwind; } @@ -878,6 +884,7 @@ static int __init lock_torture_init(void) cxt.lrsa[i].n_lock_acquired = 0; } } + lock_torture_print_module_parms(cxt.cur_ops, "Start of test"); /* Prepare torture context. */ diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index d734b7502001..22e025309845 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -191,8 +191,6 @@ static ssize_t qstat_write(struct file *file, const char __user *user_buf, for (i = 0 ; i < qstat_num; i++) WRITE_ONCE(ptr[i], 0); - for (i = 0 ; i < qstat_num; i++) - WRITE_ONCE(ptr[i], 0); } return count; } @@ -214,10 +212,8 @@ static int __init init_qspinlock_stat(void) struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL); int i; - if (!d_qstat) { - pr_warn("Could not create 'qlockstat' debugfs directory\n"); - return 0; - } + if (!d_qstat) + goto out; /* * Create the debugfs files @@ -227,12 +223,20 @@ static int __init init_qspinlock_stat(void) * performance. 
*/ for (i = 0; i < qstat_num; i++) - debugfs_create_file(qstat_names[i], 0400, d_qstat, - (void *)(long)i, &fops_qstat); + if (!debugfs_create_file(qstat_names[i], 0400, d_qstat, + (void *)(long)i, &fops_qstat)) + goto fail_undo; + + if (!debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat, + (void *)(long)qstat_reset_cnts, &fops_qstat)) + goto fail_undo; - debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat, - (void *)(long)qstat_reset_cnts, &fops_qstat); return 0; +fail_undo: + debugfs_remove_recursive(d_qstat); +out: + pr_warn("Could not create 'qlockstat' debugfs entries\n"); + return -ENOMEM; } fs_initcall(init_qspinlock_stat); diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 3a5048572065..1591f6b3539f 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -191,11 +191,12 @@ int __down_read_trylock(struct rw_semaphore *sem) /* * get a write lock on the semaphore */ -void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) +int __sched __down_write_common(struct rw_semaphore *sem, int state) { struct rwsem_waiter waiter; struct task_struct *tsk; unsigned long flags; + int ret = 0; raw_spin_lock_irqsave(&sem->wait_lock, flags); @@ -215,21 +216,33 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) */ if (sem->count == 0) break; - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (signal_pending_state(state, current)) { + ret = -EINTR; + goto out; + } + set_task_state(tsk, state); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); schedule(); raw_spin_lock_irqsave(&sem->wait_lock, flags); } /* got the lock */ sem->count = -1; +out: list_del(&waiter.list); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + return ret; } void __sched __down_write(struct rw_semaphore *sem) { - __down_write_nested(sem, 0); + __down_write_common(sem, TASK_UNINTERRUPTIBLE); +} + +int __sched __down_write_killable(struct rw_semaphore *sem) +{ + return __down_write_common(sem, TASK_KILLABLE); } /* diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index a4d4de05b2d1..09e30c6225e5 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -433,12 +433,13 @@ static inline bool rwsem_has_spinner(struct rw_semaphore *sem) /* * Wait until we successfully acquire the write lock */ -__visible -struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) +static inline struct rw_semaphore * +__rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) { long count; bool waiting = true; /* any queued threads before us */ struct rwsem_waiter waiter; + struct rw_semaphore *ret = sem; /* undo write bias from down_write operation, stop active locking */ count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); @@ -478,7 +479,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); /* wait until we successfully acquire the lock */ - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(state); while (true) { if (rwsem_try_write_lock(count, sem)) break; @@ -486,21 +487,48 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) /* Block until there are no active lockers. 
*/ do { + if (signal_pending_state(state, current)) + goto out_nolock; + schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(state); } while ((count = sem->count) & RWSEM_ACTIVE_MASK); raw_spin_lock_irq(&sem->wait_lock); } __set_current_state(TASK_RUNNING); + list_del(&waiter.list); + raw_spin_unlock_irq(&sem->wait_lock); + return ret; + +out_nolock: + __set_current_state(TASK_RUNNING); + raw_spin_lock_irq(&sem->wait_lock); list_del(&waiter.list); + if (list_empty(&sem->wait_list)) + rwsem_atomic_update(-RWSEM_WAITING_BIAS, sem); + else + __rwsem_do_wake(sem, RWSEM_WAKE_ANY); raw_spin_unlock_irq(&sem->wait_lock); - return sem; + return ERR_PTR(-EINTR); +} + +__visible struct rw_semaphore * __sched +rwsem_down_write_failed(struct rw_semaphore *sem) +{ + return __rwsem_down_write_failed_common(sem, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(rwsem_down_write_failed); +__visible struct rw_semaphore * __sched +rwsem_down_write_failed_killable(struct rw_semaphore *sem) +{ + return __rwsem_down_write_failed_common(sem, TASK_KILLABLE); +} +EXPORT_SYMBOL(rwsem_down_write_failed_killable); + /* * handle waking up a waiter on the semaphore * - up_read/up_write has decremented the active part of count if we come here diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 205be0ce34de..c817216c1615 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -55,6 +55,25 @@ void __sched down_write(struct rw_semaphore *sem) EXPORT_SYMBOL(down_write); /* + * lock for writing + */ +int __sched down_write_killable(struct rw_semaphore *sem) +{ + might_sleep(); + rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { + rwsem_release(&sem->dep_map, 1, _RET_IP_); + return -EINTR; + } + + rwsem_set_owner(sem); + return 0; +} + +EXPORT_SYMBOL(down_write_killable); + +/* * trylock for writing -- returns 1 if successful, 0 if contention */ int down_write_trylock(struct rw_semaphore *sem) diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 032b2c015beb..18dfc485225c 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -5,6 +5,7 @@ KCOV_INSTRUMENT := n obj-y += update.o sync.o obj-$(CONFIG_SRCU) += srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o +obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_PREEMPT_RCU) += tree.o obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c new file mode 100644 index 000000000000..3cee0d8393ed --- /dev/null +++ b/kernel/rcu/rcuperf.c @@ -0,0 +1,655 @@ +/* + * Read-Copy Update module-based performance-test facility + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2015 + * + * Authors: Paul E. 
McKenney <paulmck@us.ibm.com> + */ +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/err.h> +#include <linux/spinlock.h> +#include <linux/smp.h> +#include <linux/rcupdate.h> +#include <linux/interrupt.h> +#include <linux/sched.h> +#include <linux/atomic.h> +#include <linux/bitops.h> +#include <linux/completion.h> +#include <linux/moduleparam.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/reboot.h> +#include <linux/freezer.h> +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/stat.h> +#include <linux/srcu.h> +#include <linux/slab.h> +#include <asm/byteorder.h> +#include <linux/torture.h> +#include <linux/vmalloc.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); + +#define PERF_FLAG "-perf:" +#define PERFOUT_STRING(s) \ + pr_alert("%s" PERF_FLAG s "\n", perf_type) +#define VERBOSE_PERFOUT_STRING(s) \ + do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0) +#define VERBOSE_PERFOUT_ERRSTRING(s) \ + do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) + +torture_param(bool, gp_exp, true, "Use expedited GP wait primitives"); +torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); +torture_param(int, nreaders, -1, "Number of RCU reader threads"); +torture_param(int, nwriters, -1, "Number of RCU updater threads"); +torture_param(bool, shutdown, false, "Shutdown at end of performance tests."); +torture_param(bool, verbose, true, "Enable verbose debugging printk()s"); + +static char *perf_type = "rcu"; +module_param(perf_type, charp, 0444); +MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, ...)"); + +static int nrealreaders; +static int nrealwriters; +static struct task_struct **writer_tasks; +static struct task_struct **reader_tasks; +static struct task_struct *shutdown_task; + +static u64 **writer_durations; +static int *writer_n_durations; +static atomic_t n_rcu_perf_reader_started; +static atomic_t n_rcu_perf_writer_started; +static atomic_t n_rcu_perf_writer_finished; +static wait_queue_head_t shutdown_wq; +static u64 t_rcu_perf_writer_started; +static u64 t_rcu_perf_writer_finished; +static unsigned long b_rcu_perf_writer_started; +static unsigned long b_rcu_perf_writer_finished; + +static int rcu_perf_writer_state; +#define RTWS_INIT 0 +#define RTWS_EXP_SYNC 1 +#define RTWS_SYNC 2 +#define RTWS_IDLE 2 +#define RTWS_STOPPING 3 + +#define MAX_MEAS 10000 +#define MIN_MEAS 100 + +#if defined(MODULE) || defined(CONFIG_RCU_PERF_TEST_RUNNABLE) +#define RCUPERF_RUNNABLE_INIT 1 +#else +#define RCUPERF_RUNNABLE_INIT 0 +#endif +static int perf_runnable = RCUPERF_RUNNABLE_INIT; +module_param(perf_runnable, int, 0444); +MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot"); + +/* + * Operations vector for selecting different types of tests. + */ + +struct rcu_perf_ops { + int ptype; + void (*init)(void); + void (*cleanup)(void); + int (*readlock)(void); + void (*readunlock)(int idx); + unsigned long (*started)(void); + unsigned long (*completed)(void); + unsigned long (*exp_completed)(void); + void (*sync)(void); + void (*exp_sync)(void); + const char *name; +}; + +static struct rcu_perf_ops *cur_ops; + +/* + * Definitions for rcu perf testing. 
+ */ + +static int rcu_perf_read_lock(void) __acquires(RCU) +{ + rcu_read_lock(); + return 0; +} + +static void rcu_perf_read_unlock(int idx) __releases(RCU) +{ + rcu_read_unlock(); +} + +static unsigned long __maybe_unused rcu_no_completed(void) +{ + return 0; +} + +static void rcu_sync_perf_init(void) +{ +} + +static struct rcu_perf_ops rcu_ops = { + .ptype = RCU_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = rcu_perf_read_lock, + .readunlock = rcu_perf_read_unlock, + .started = rcu_batches_started, + .completed = rcu_batches_completed, + .exp_completed = rcu_exp_batches_completed, + .sync = synchronize_rcu, + .exp_sync = synchronize_rcu_expedited, + .name = "rcu" +}; + +/* + * Definitions for rcu_bh perf testing. + */ + +static int rcu_bh_perf_read_lock(void) __acquires(RCU_BH) +{ + rcu_read_lock_bh(); + return 0; +} + +static void rcu_bh_perf_read_unlock(int idx) __releases(RCU_BH) +{ + rcu_read_unlock_bh(); +} + +static struct rcu_perf_ops rcu_bh_ops = { + .ptype = RCU_BH_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = rcu_bh_perf_read_lock, + .readunlock = rcu_bh_perf_read_unlock, + .started = rcu_batches_started_bh, + .completed = rcu_batches_completed_bh, + .exp_completed = rcu_exp_batches_completed_sched, + .sync = synchronize_rcu_bh, + .exp_sync = synchronize_rcu_bh_expedited, + .name = "rcu_bh" +}; + +/* + * Definitions for srcu perf testing. + */ + +DEFINE_STATIC_SRCU(srcu_ctl_perf); +static struct srcu_struct *srcu_ctlp = &srcu_ctl_perf; + +static int srcu_perf_read_lock(void) __acquires(srcu_ctlp) +{ + return srcu_read_lock(srcu_ctlp); +} + +static void srcu_perf_read_unlock(int idx) __releases(srcu_ctlp) +{ + srcu_read_unlock(srcu_ctlp, idx); +} + +static unsigned long srcu_perf_completed(void) +{ + return srcu_batches_completed(srcu_ctlp); +} + +static void srcu_perf_synchronize(void) +{ + synchronize_srcu(srcu_ctlp); +} + +static void srcu_perf_synchronize_expedited(void) +{ + synchronize_srcu_expedited(srcu_ctlp); +} + +static struct rcu_perf_ops srcu_ops = { + .ptype = SRCU_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = srcu_perf_read_lock, + .readunlock = srcu_perf_read_unlock, + .started = NULL, + .completed = srcu_perf_completed, + .exp_completed = srcu_perf_completed, + .sync = srcu_perf_synchronize, + .exp_sync = srcu_perf_synchronize_expedited, + .name = "srcu" +}; + +/* + * Definitions for sched perf testing. + */ + +static int sched_perf_read_lock(void) +{ + preempt_disable(); + return 0; +} + +static void sched_perf_read_unlock(int idx) +{ + preempt_enable(); +} + +static struct rcu_perf_ops sched_ops = { + .ptype = RCU_SCHED_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = sched_perf_read_lock, + .readunlock = sched_perf_read_unlock, + .started = rcu_batches_started_sched, + .completed = rcu_batches_completed_sched, + .exp_completed = rcu_exp_batches_completed_sched, + .sync = synchronize_sched, + .exp_sync = synchronize_sched_expedited, + .name = "sched" +}; + +#ifdef CONFIG_TASKS_RCU + +/* + * Definitions for RCU-tasks perf testing. 
+ */ + +static int tasks_perf_read_lock(void) +{ + return 0; +} + +static void tasks_perf_read_unlock(int idx) +{ +} + +static struct rcu_perf_ops tasks_ops = { + .ptype = RCU_TASKS_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = tasks_perf_read_lock, + .readunlock = tasks_perf_read_unlock, + .started = rcu_no_completed, + .completed = rcu_no_completed, + .sync = synchronize_rcu_tasks, + .exp_sync = synchronize_rcu_tasks, + .name = "tasks" +}; + +#define RCUPERF_TASKS_OPS &tasks_ops, + +static bool __maybe_unused torturing_tasks(void) +{ + return cur_ops == &tasks_ops; +} + +#else /* #ifdef CONFIG_TASKS_RCU */ + +#define RCUPERF_TASKS_OPS + +static bool __maybe_unused torturing_tasks(void) +{ + return false; +} + +#endif /* #else #ifdef CONFIG_TASKS_RCU */ + +/* + * If performance tests complete, wait for shutdown to commence. + */ +static void rcu_perf_wait_shutdown(void) +{ + cond_resched_rcu_qs(); + if (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters) + return; + while (!torture_must_stop()) + schedule_timeout_uninterruptible(1); +} + +/* + * RCU perf reader kthread. Repeatedly does empty RCU read-side + * critical section, minimizing update-side interference. + */ +static int +rcu_perf_reader(void *arg) +{ + unsigned long flags; + int idx; + long me = (long)arg; + + VERBOSE_PERFOUT_STRING("rcu_perf_reader task started"); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + atomic_inc(&n_rcu_perf_reader_started); + + do { + local_irq_save(flags); + idx = cur_ops->readlock(); + cur_ops->readunlock(idx); + local_irq_restore(flags); + rcu_perf_wait_shutdown(); + } while (!torture_must_stop()); + torture_kthread_stopping("rcu_perf_reader"); + return 0; +} + +/* + * RCU perf writer kthread. Repeatedly does a grace period. 
+ */ +static int +rcu_perf_writer(void *arg) +{ + int i = 0; + int i_max; + long me = (long)arg; + struct sched_param sp; + bool started = false, done = false, alldone = false; + u64 t; + u64 *wdp; + u64 *wdpp = writer_durations[me]; + + VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); + WARN_ON(rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp); + WARN_ON(rcu_gp_is_normal() && gp_exp); + WARN_ON(!wdpp); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + sp.sched_priority = 1; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + + if (holdoff) + schedule_timeout_uninterruptible(holdoff * HZ); + + t = ktime_get_mono_fast_ns(); + if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { + t_rcu_perf_writer_started = t; + if (gp_exp) { + b_rcu_perf_writer_started = + cur_ops->exp_completed() / 2; + } else { + b_rcu_perf_writer_started = + cur_ops->completed(); + } + } + + do { + wdp = &wdpp[i]; + *wdp = ktime_get_mono_fast_ns(); + if (gp_exp) { + rcu_perf_writer_state = RTWS_EXP_SYNC; + cur_ops->exp_sync(); + } else { + rcu_perf_writer_state = RTWS_SYNC; + cur_ops->sync(); + } + rcu_perf_writer_state = RTWS_IDLE; + t = ktime_get_mono_fast_ns(); + *wdp = t - *wdp; + i_max = i; + if (!started && + atomic_read(&n_rcu_perf_writer_started) >= nrealwriters) + started = true; + if (!done && i >= MIN_MEAS) { + done = true; + sp.sched_priority = 0; + sched_setscheduler_nocheck(current, + SCHED_NORMAL, &sp); + pr_alert("%s" PERF_FLAG + "rcu_perf_writer %ld has %d measurements\n", + perf_type, me, MIN_MEAS); + if (atomic_inc_return(&n_rcu_perf_writer_finished) >= + nrealwriters) { + schedule_timeout_interruptible(10); + rcu_ftrace_dump(DUMP_ALL); + PERFOUT_STRING("Test complete"); + t_rcu_perf_writer_finished = t; + if (gp_exp) { + b_rcu_perf_writer_finished = + cur_ops->exp_completed() / 2; + } else { + b_rcu_perf_writer_finished = + cur_ops->completed(); + } + if (shutdown) { + smp_mb(); /* Assign before wake. 
*/ + wake_up(&shutdown_wq); + } + } + } + if (done && !alldone && + atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters) + alldone = true; + if (started && !alldone && i < MAX_MEAS - 1) + i++; + rcu_perf_wait_shutdown(); + } while (!torture_must_stop()); + rcu_perf_writer_state = RTWS_STOPPING; + writer_n_durations[me] = i_max; + torture_kthread_stopping("rcu_perf_writer"); + return 0; +} + +static inline void +rcu_perf_print_module_parms(struct rcu_perf_ops *cur_ops, const char *tag) +{ + pr_alert("%s" PERF_FLAG + "--- %s: nreaders=%d nwriters=%d verbose=%d shutdown=%d\n", + perf_type, tag, nrealreaders, nrealwriters, verbose, shutdown); +} + +static void +rcu_perf_cleanup(void) +{ + int i; + int j; + int ngps = 0; + u64 *wdp; + u64 *wdpp; + + if (torture_cleanup_begin()) + return; + + if (reader_tasks) { + for (i = 0; i < nrealreaders; i++) + torture_stop_kthread(rcu_perf_reader, + reader_tasks[i]); + kfree(reader_tasks); + } + + if (writer_tasks) { + for (i = 0; i < nrealwriters; i++) { + torture_stop_kthread(rcu_perf_writer, + writer_tasks[i]); + if (!writer_n_durations) + continue; + j = writer_n_durations[i]; + pr_alert("%s%s writer %d gps: %d\n", + perf_type, PERF_FLAG, i, j); + ngps += j; + } + pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n", + perf_type, PERF_FLAG, + t_rcu_perf_writer_started, t_rcu_perf_writer_finished, + t_rcu_perf_writer_finished - + t_rcu_perf_writer_started, + ngps, + b_rcu_perf_writer_finished - + b_rcu_perf_writer_started); + for (i = 0; i < nrealwriters; i++) { + if (!writer_durations) + break; + if (!writer_n_durations) + continue; + wdpp = writer_durations[i]; + if (!wdpp) + continue; + for (j = 0; j <= writer_n_durations[i]; j++) { + wdp = &wdpp[j]; + pr_alert("%s%s %4d writer-duration: %5d %llu\n", + perf_type, PERF_FLAG, + i, j, *wdp); + if (j % 100 == 0) + schedule_timeout_uninterruptible(1); + } + kfree(writer_durations[i]); + } + kfree(writer_tasks); + kfree(writer_durations); + kfree(writer_n_durations); + } + + /* Do flavor-specific cleanup operations. */ + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); + + torture_cleanup_end(); +} + +/* + * Return the number if non-negative. If -1, the number of CPUs. + * If less than -1, that much less than the number of CPUs, but + * at least one. + */ +static int compute_real(int n) +{ + int nr; + + if (n >= 0) { + nr = n; + } else { + nr = num_online_cpus() + 1 + n; + if (nr <= 0) + nr = 1; + } + return nr; +} + +/* + * RCU perf shutdown kthread. Just waits to be awakened, then shuts + * down system. + */ +static int +rcu_perf_shutdown(void *arg) +{ + do { + wait_event(shutdown_wq, + atomic_read(&n_rcu_perf_writer_finished) >= + nrealwriters); + } while (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters); + smp_mb(); /* Wake before output. */ + rcu_perf_cleanup(); + kernel_power_off(); + return -EINVAL; +} + +static int __init +rcu_perf_init(void) +{ + long i; + int firsterr = 0; + static struct rcu_perf_ops *perf_ops[] = { + &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, + RCUPERF_TASKS_OPS + }; + + if (!torture_init_begin(perf_type, verbose, &perf_runnable)) + return -EBUSY; + + /* Process args and tell the world that the perf'er is on the job. 
*/ + for (i = 0; i < ARRAY_SIZE(perf_ops); i++) { + cur_ops = perf_ops[i]; + if (strcmp(perf_type, cur_ops->name) == 0) + break; + } + if (i == ARRAY_SIZE(perf_ops)) { + pr_alert("rcu-perf: invalid perf type: \"%s\"\n", + perf_type); + pr_alert("rcu-perf types:"); + for (i = 0; i < ARRAY_SIZE(perf_ops); i++) + pr_alert(" %s", perf_ops[i]->name); + pr_alert("\n"); + firsterr = -EINVAL; + goto unwind; + } + if (cur_ops->init) + cur_ops->init(); + + nrealwriters = compute_real(nwriters); + nrealreaders = compute_real(nreaders); + atomic_set(&n_rcu_perf_reader_started, 0); + atomic_set(&n_rcu_perf_writer_started, 0); + atomic_set(&n_rcu_perf_writer_finished, 0); + rcu_perf_print_module_parms(cur_ops, "Start of test"); + + /* Start up the kthreads. */ + + if (shutdown) { + init_waitqueue_head(&shutdown_wq); + firsterr = torture_create_kthread(rcu_perf_shutdown, NULL, + shutdown_task); + if (firsterr) + goto unwind; + schedule_timeout_uninterruptible(1); + } + reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), + GFP_KERNEL); + if (reader_tasks == NULL) { + VERBOSE_PERFOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealreaders; i++) { + firsterr = torture_create_kthread(rcu_perf_reader, (void *)i, + reader_tasks[i]); + if (firsterr) + goto unwind; + } + while (atomic_read(&n_rcu_perf_reader_started) < nrealreaders) + schedule_timeout_uninterruptible(1); + writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]), + GFP_KERNEL); + writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), + GFP_KERNEL); + writer_n_durations = + kcalloc(nrealwriters, sizeof(*writer_n_durations), + GFP_KERNEL); + if (!writer_tasks || !writer_durations || !writer_n_durations) { + VERBOSE_PERFOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealwriters; i++) { + writer_durations[i] = + kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), + GFP_KERNEL); + if (!writer_durations[i]) + goto unwind; + firsterr = torture_create_kthread(rcu_perf_writer, (void *)i, + writer_tasks[i]); + if (firsterr) + goto unwind; + } + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + rcu_perf_cleanup(); + return firsterr; +} + +module_init(rcu_perf_init); +module_exit(rcu_perf_cleanup); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 250ea67c1615..084a28a732eb 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -130,8 +130,8 @@ static struct rcu_torture __rcu *rcu_torture_current; static unsigned long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = { 0 }; -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count); +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch); static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; static atomic_t n_rcu_torture_alloc; static atomic_t n_rcu_torture_alloc_fail; @@ -916,7 +916,7 @@ rcu_torture_fqs(void *arg) static int rcu_torture_writer(void *arg) { - bool can_expedite = !rcu_gp_is_expedited(); + bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal(); int expediting = 0; unsigned long gp_snap; bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; @@ -932,7 +932,7 @@ rcu_torture_writer(void *arg) 
VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); if (!can_expedite) { pr_alert("%s" TORTURE_FLAG - " Grace periods expedited from boot/sysfs for %s,\n", + " GP expediting controlled from boot/sysfs for %s,\n", torture_type, cur_ops->name); pr_alert("%s" TORTURE_FLAG " Disabled dynamic grace-period expediting.\n", @@ -1082,17 +1082,6 @@ rcu_torture_fakewriter(void *arg) return 0; } -static void rcutorture_trace_dump(void) -{ - static atomic_t beenhere = ATOMIC_INIT(0); - - if (atomic_read(&beenhere)) - return; - if (atomic_xchg(&beenhere, 1) != 0) - return; - ftrace_dump(DUMP_ALL); -} - /* * RCU torture reader from timer handler. Dereferences rcu_torture_current, * incrementing the corresponding element of the pipeline array. The @@ -1142,7 +1131,7 @@ static void rcu_torture_timer(unsigned long unused) if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, started, completed); - rcutorture_trace_dump(); + rcu_ftrace_dump(DUMP_ALL); } __this_cpu_inc(rcu_torture_count[pipe_count]); completed = completed - started; @@ -1215,7 +1204,7 @@ rcu_torture_reader(void *arg) if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, started, completed); - rcutorture_trace_dump(); + rcu_ftrace_dump(DUMP_ALL); } __this_cpu_inc(rcu_torture_count[pipe_count]); completed = completed - started; @@ -1333,7 +1322,7 @@ rcu_torture_stats_print(void) rcu_torture_writer_state, gpnum, completed, flags); show_rcu_gp_kthreads(); - rcutorture_trace_dump(); + rcu_ftrace_dump(DUMP_ALL); } rtcv_snap = rcu_torture_current_version; } @@ -1489,7 +1478,9 @@ static int rcu_torture_barrier_cbs(void *arg) * The above smp_load_acquire() ensures barrier_phase load * is ordered before the folloiwng ->call(). */ + local_irq_disable(); /* Just to test no-irq call_rcu(). */ cur_ops->call(&rcu, rcu_torture_barrier_cbf); + local_irq_enable(); if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); } while (!torture_must_stop()); @@ -1596,7 +1587,7 @@ static int rcutorture_cpu_notify(struct notifier_block *self, { long cpu = (long)hcpu; - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: case CPU_DOWN_FAILED: (void)rcutorture_booster_init(cpu); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9a535a86e732..c7f1bc4f817c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -102,6 +102,8 @@ struct rcu_state sname##_state = { \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ + .exp_mutex = __MUTEX_INITIALIZER(sname##_state.exp_mutex), \ + .exp_wake_mutex = __MUTEX_INITIALIZER(sname##_state.exp_wake_mutex), \ } RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); @@ -370,6 +372,21 @@ void rcu_all_qs(void) rcu_momentary_dyntick_idle(); local_irq_restore(flags); } + if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) { + /* + * Yes, we just checked a per-CPU variable with preemption + * enabled, so we might be migrated to some other CPU at + * this point. That is OK because in that case, the + * migration will supply the needed quiescent state. + * We might end up needlessly disabling preemption and + * invoking rcu_sched_qs() on the destination CPU, but + * the probability and cost are both quite low, so this + * should not be a problem in practice. + */ + preempt_disable(); + rcu_sched_qs(); + preempt_enable(); + } this_cpu_inc(rcu_qs_ctr); barrier(); /* Avoid RCU read-side critical sections leaking up. 
*/ } @@ -385,9 +402,11 @@ module_param(qlowmark, long, 0444); static ulong jiffies_till_first_fqs = ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; +static bool rcu_kick_kthreads; module_param(jiffies_till_first_fqs, ulong, 0644); module_param(jiffies_till_next_fqs, ulong, 0644); +module_param(rcu_kick_kthreads, bool, 0644); /* * How long the grace period must be before we start recruiting @@ -460,6 +479,28 @@ unsigned long rcu_batches_completed_bh(void) EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); /* + * Return the number of RCU expedited batches completed thus far for + * debug & stats. Odd numbers mean that a batch is in progress, even + * numbers mean idle. The value returned will thus be roughly double + * the cumulative batches since boot. + */ +unsigned long rcu_exp_batches_completed(void) +{ + return rcu_state_p->expedited_sequence; +} +EXPORT_SYMBOL_GPL(rcu_exp_batches_completed); + +/* + * Return the number of RCU-sched expedited batches completed thus far + * for debug & stats. Similar to rcu_exp_batches_completed(). + */ +unsigned long rcu_exp_batches_completed_sched(void) +{ + return rcu_sched_state.expedited_sequence; +} +EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched); + +/* * Force a quiescent state. */ void rcu_force_quiescent_state(void) @@ -637,7 +678,7 @@ static void rcu_eqs_enter_common(long long oldval, bool user) idle_task(smp_processor_id()); trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); - ftrace_dump(DUMP_ORIG); + rcu_ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ @@ -799,7 +840,7 @@ static void rcu_eqs_exit_common(long long oldval, int user) trace_rcu_dyntick(TPS("Error on exit: not idle task"), oldval, rdtp->dynticks_nesting); - ftrace_dump(DUMP_ORIG); + rcu_ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ @@ -1224,8 +1265,10 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) rsp->gp_flags, gp_state_getname(rsp->gp_state), rsp->gp_state, rsp->gp_kthread ? rsp->gp_kthread->state : ~0); - if (rsp->gp_kthread) + if (rsp->gp_kthread) { sched_show_task(rsp->gp_kthread); + wake_up_process(rsp->gp_kthread); + } } } @@ -1249,6 +1292,25 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) } } +/* + * If too much time has passed in the current grace period, and if + * so configured, go kick the relevant kthreads. + */ +static void rcu_stall_kick_kthreads(struct rcu_state *rsp) +{ + unsigned long j; + + if (!rcu_kick_kthreads) + return; + j = READ_ONCE(rsp->jiffies_kick_kthreads); + if (time_after(jiffies, j) && rsp->gp_kthread) { + WARN_ONCE(1, "Kicking %s grace-period kthread\n", rsp->name); + rcu_ftrace_dump(DUMP_ALL); + wake_up_process(rsp->gp_kthread); + WRITE_ONCE(rsp->jiffies_kick_kthreads, j + HZ); + } +} + static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) { int cpu; @@ -1260,6 +1322,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; + /* Kick and suppress, if so configured. */ + rcu_stall_kick_kthreads(rsp); + if (rcu_cpu_stall_suppress) + return; + /* Only let one CPU complain about others per time interval. 
*/ raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -1333,6 +1400,11 @@ static void print_cpu_stall(struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; + /* Kick and suppress, if so configured. */ + rcu_stall_kick_kthreads(rsp); + if (rcu_cpu_stall_suppress) + return; + /* * OK, time to rat on ourselves... * See Documentation/RCU/stallwarn.txt for info on how to debug @@ -1377,8 +1449,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) unsigned long js; struct rcu_node *rnp; - if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp)) + if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || + !rcu_gp_in_progress(rsp)) return; + rcu_stall_kick_kthreads(rsp); j = jiffies; /* @@ -2117,8 +2191,11 @@ static int __noreturn rcu_gp_kthread(void *arg) } ret = 0; for (;;) { - if (!ret) + if (!ret) { rsp->jiffies_force_qs = jiffies + j; + WRITE_ONCE(rsp->jiffies_kick_kthreads, + jiffies + 3 * j); + } trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqswait")); @@ -2144,6 +2221,15 @@ static int __noreturn rcu_gp_kthread(void *arg) TPS("fqsend")); cond_resched_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); + ret = 0; /* Force full wait till next FQS. */ + j = jiffies_till_next_fqs; + if (j > HZ) { + j = HZ; + jiffies_till_next_fqs = HZ; + } else if (j < 1) { + j = 1; + jiffies_till_next_fqs = 1; + } } else { /* Deal with stray signal. */ cond_resched_rcu_qs(); @@ -2152,14 +2238,12 @@ static int __noreturn rcu_gp_kthread(void *arg) trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqswaitsig")); - } - j = jiffies_till_next_fqs; - if (j > HZ) { - j = HZ; - jiffies_till_next_fqs = HZ; - } else if (j < 1) { - j = 1; - jiffies_till_next_fqs = 1; + ret = 1; /* Keep old FQS timing. */ + j = jiffies; + if (time_after(jiffies, rsp->jiffies_force_qs)) + j = 1; + else + j = rsp->jiffies_force_qs - j; } } @@ -3376,8 +3460,12 @@ static void rcu_exp_gp_seq_end(struct rcu_state *rsp) } static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) { + unsigned long s; + smp_mb(); /* Caller's modifications seen first by other CPUs. */ - return rcu_seq_snap(&rsp->expedited_sequence); + s = rcu_seq_snap(&rsp->expedited_sequence); + trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); + return s; } static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) { @@ -3469,7 +3557,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) * for the current expedited grace period. Works only for preemptible * RCU -- other RCU implementation use other means. * - * Caller must hold the root rcu_node's exp_funnel_mutex. + * Caller must hold the rcu_state's exp_mutex. */ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) { @@ -3485,8 +3573,8 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) * - * Caller must hold the root rcu_node's exp_funnel_mutex and the - * specified rcu_node structure's ->lock. + * Caller must hold the rcu_state's exp_mutex and the specified rcu_node + * structure's ->lock. */ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake, unsigned long flags) @@ -3523,7 +3611,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, * Report expedited quiescent state for specified node. This is a * lock-acquisition wrapper function for __rcu_report_exp_rnp(). * - * Caller must hold the root rcu_node's exp_funnel_mutex. 
+ * Caller must hold the rcu_state's exp_mutex. */ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake) @@ -3536,8 +3624,8 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, /* * Report expedited quiescent state for multiple CPUs, all covered by the - * specified leaf rcu_node structure. Caller must hold the root - * rcu_node's exp_funnel_mutex. + * specified leaf rcu_node structure. Caller must hold the rcu_state's + * exp_mutex. */ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, unsigned long mask, bool wake) @@ -3555,7 +3643,6 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, /* * Report expedited quiescent state for specified rcu_data (CPU). - * Caller must hold the root rcu_node's exp_funnel_mutex. */ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, bool wake) @@ -3564,15 +3651,11 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, } /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ -static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp, - atomic_long_t *stat, unsigned long s) +static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, + unsigned long s) { if (rcu_exp_gp_seq_done(rsp, s)) { - if (rnp) - mutex_unlock(&rnp->exp_funnel_mutex); - else if (rdp) - mutex_unlock(&rdp->exp_funnel_mutex); + trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); /* Ensure test happens before caller kfree(). */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(stat); @@ -3582,59 +3665,65 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, } /* - * Funnel-lock acquisition for expedited grace periods. Returns a - * pointer to the root rcu_node structure, or NULL if some other - * task did the expedited grace period for us. + * Funnel-lock acquisition for expedited grace periods. Returns true + * if some other task completed an expedited grace period that this task + * can piggy-back on, and with no mutex held. Otherwise, returns false + * with the mutex held, indicating that the caller must actually do the + * expedited grace period. */ -static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) +static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) { struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); - struct rcu_node *rnp0; - struct rcu_node *rnp1 = NULL; + struct rcu_node *rnp = rdp->mynode; + struct rcu_node *rnp_root = rcu_get_root(rsp); + + /* Low-contention fastpath. */ + if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && + (rnp == rnp_root || + ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && + !mutex_is_locked(&rsp->exp_mutex) && + mutex_trylock(&rsp->exp_mutex)) + goto fastpath; /* - * First try directly acquiring the root lock in order to reduce - * latency in the common case where expedited grace periods are - * rare. We check mutex_is_locked() to avoid pathological levels of - * memory contention on ->exp_funnel_mutex in the heavy-load case. + * Each pass through the following loop works its way up + * the rcu_node tree, returning if others have done the work or + * otherwise falls through to acquire rsp->exp_mutex. The mapping + * from CPU to rcu_node structure can be inexact, as it is just + * promoting locality and is not strictly needed for correctness. 
*/ - rnp0 = rcu_get_root(rsp); - if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { - if (mutex_trylock(&rnp0->exp_funnel_mutex)) { - if (sync_exp_work_done(rsp, rnp0, NULL, - &rdp->expedited_workdone0, s)) - return NULL; - return rnp0; + for (; rnp != NULL; rnp = rnp->parent) { + if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) + return true; + + /* Work not done, either wait here or go up. */ + spin_lock(&rnp->exp_lock); + if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) { + + /* Someone else doing GP, so wait for them. */ + spin_unlock(&rnp->exp_lock); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, + rnp->grplo, rnp->grphi, + TPS("wait")); + wait_event(rnp->exp_wq[(s >> 1) & 0x3], + sync_exp_work_done(rsp, + &rdp->exp_workdone2, s)); + return true; } + rnp->exp_seq_rq = s; /* Followers can wait on us. */ + spin_unlock(&rnp->exp_lock); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, + rnp->grphi, TPS("nxtlvl")); } - - /* - * Each pass through the following loop works its way - * up the rcu_node tree, returning if others have done the - * work or otherwise falls through holding the root rnp's - * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure - * can be inexact, as it is just promoting locality and is not - * strictly needed for correctness. - */ - if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s)) - return NULL; - mutex_lock(&rdp->exp_funnel_mutex); - rnp0 = rdp->mynode; - for (; rnp0 != NULL; rnp0 = rnp0->parent) { - if (sync_exp_work_done(rsp, rnp1, rdp, - &rdp->expedited_workdone2, s)) - return NULL; - mutex_lock(&rnp0->exp_funnel_mutex); - if (rnp1) - mutex_unlock(&rnp1->exp_funnel_mutex); - else - mutex_unlock(&rdp->exp_funnel_mutex); - rnp1 = rnp0; + mutex_lock(&rsp->exp_mutex); +fastpath: + if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { + mutex_unlock(&rsp->exp_mutex); + return true; } - if (sync_exp_work_done(rsp, rnp1, rdp, - &rdp->expedited_workdone3, s)) - return NULL; - return rnp1; + rcu_exp_gp_seq_start(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); + return false; } /* Invoked on each online non-idle CPU for expedited quiescent state. 
*/ @@ -3649,6 +3738,11 @@ static void sync_sched_exp_handler(void *data) if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) return; + if (rcu_is_cpu_rrupt_from_idle()) { + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), true); + return; + } __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); resched_cpu(smp_processor_id()); } @@ -3773,7 +3867,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) rsp->name); ndetected = 0; rcu_for_each_leaf_node(rsp, rnp) { - ndetected = rcu_print_task_exp_stall(rnp); + ndetected += rcu_print_task_exp_stall(rnp); mask = 1; for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { struct rcu_data *rdp; @@ -3783,7 +3877,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) ndetected++; rdp = per_cpu_ptr(rsp->rda, cpu); pr_cont(" %d-%c%c%c", cpu, - "O."[cpu_online(cpu)], + "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rnp->expmaskinit)], "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); } @@ -3792,7 +3886,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", jiffies - jiffies_start, rsp->expedited_sequence, rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); - if (!ndetected) { + if (ndetected) { pr_err("blocking rcu_node structures:"); rcu_for_each_node_breadth_first(rsp, rnp) { if (rnp == rnp_root) @@ -3818,6 +3912,41 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) } } +/* + * Wait for the current expedited grace period to complete, and then + * wake up everyone who piggybacked on the just-completed expedited + * grace period. Also update all the ->exp_seq_rq counters as needed + * in order to avoid counter-wrap problems. + */ +static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) +{ + struct rcu_node *rnp; + + synchronize_sched_expedited_wait(rsp); + rcu_exp_gp_seq_end(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); + + /* + * Switch over to wakeup mode, allowing the next GP, but -only- the + * next GP, to proceed. + */ + mutex_lock(&rsp->exp_wake_mutex); + mutex_unlock(&rsp->exp_mutex); + + rcu_for_each_node_breadth_first(rsp, rnp) { + if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { + spin_lock(&rnp->exp_lock); + /* Recheck, avoid hang in case someone just arrived. */ + if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) + rnp->exp_seq_rq = s; + spin_unlock(&rnp->exp_lock); + } + wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); + } + trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); + mutex_unlock(&rsp->exp_wake_mutex); +} + /** * synchronize_sched_expedited - Brute-force RCU-sched grace period * @@ -3837,7 +3966,6 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) void synchronize_sched_expedited(void) { unsigned long s; - struct rcu_node *rnp; struct rcu_state *rsp = &rcu_sched_state; /* If only one CPU, this is automatically a grace period. */ @@ -3852,17 +3980,14 @@ void synchronize_sched_expedited(void) /* Take a snapshot of the sequence number. */ s = rcu_exp_gp_seq_snap(rsp); - - rnp = exp_funnel_lock(rsp, s); - if (rnp == NULL) + if (exp_funnel_lock(rsp, s)) return; /* Someone else did our work for us. */ - rcu_exp_gp_seq_start(rsp); + /* Initialize the rcu_node tree in preparation for the wait. 
*/ sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); - synchronize_sched_expedited_wait(rsp); - rcu_exp_gp_seq_end(rsp); - mutex_unlock(&rnp->exp_funnel_mutex); + /* Wait and clean up, including waking everyone. */ + rcu_exp_wait_wake(rsp, s); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); @@ -4162,7 +4287,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); rdp->cpu = cpu; rdp->rsp = rsp; - mutex_init(&rdp->exp_funnel_mutex); rcu_boot_init_nocb_percpu_data(rdp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -4420,10 +4544,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) { static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT; - static const char * const exp[] = RCU_EXP_NAME_INIT; static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; - static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; static u8 fl_mask = 0x1; int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ @@ -4482,9 +4604,11 @@ static void __init rcu_init_one(struct rcu_state *rsp) rnp->level = i; INIT_LIST_HEAD(&rnp->blkd_tasks); rcu_init_one_nocb(rnp); - mutex_init(&rnp->exp_funnel_mutex); - lockdep_set_class_and_name(&rnp->exp_funnel_mutex, - &rcu_exp_class[i], exp[i]); + init_waitqueue_head(&rnp->exp_wq[0]); + init_waitqueue_head(&rnp->exp_wq[1]); + init_waitqueue_head(&rnp->exp_wq[2]); + init_waitqueue_head(&rnp->exp_wq[3]); + spin_lock_init(&rnp->exp_lock); } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index df668c0f9e64..e3959f5e6ddf 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -70,7 +70,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } # define RCU_NODE_NAME_INIT { "rcu_node_0" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" } #elif NR_CPUS <= RCU_FANOUT_2 # define RCU_NUM_LVLS 2 # define NUM_RCU_LVL_0 1 @@ -79,7 +78,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" } #elif NR_CPUS <= RCU_FANOUT_3 # define RCU_NUM_LVLS 3 # define NUM_RCU_LVL_0 1 @@ -89,7 +87,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" } #elif NR_CPUS <= RCU_FANOUT_4 # define RCU_NUM_LVLS 4 # define NUM_RCU_LVL_0 1 @@ -100,7 +97,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" } #else # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ @@ -252,7 +248,9 @@ struct rcu_node { /* Counts of upcoming no-CB GP requests. 
*/ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; - struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp; + spinlock_t exp_lock ____cacheline_internodealigned_in_smp; + unsigned long exp_seq_rq; + wait_queue_head_t exp_wq[4]; } ____cacheline_internodealigned_in_smp; /* @@ -387,11 +385,9 @@ struct rcu_data { #ifdef CONFIG_RCU_FAST_NO_HZ struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ - struct mutex exp_funnel_mutex; - atomic_long_t expedited_workdone0; /* # done by others #0. */ - atomic_long_t expedited_workdone1; /* # done by others #1. */ - atomic_long_t expedited_workdone2; /* # done by others #2. */ - atomic_long_t expedited_workdone3; /* # done by others #3. */ + atomic_long_t exp_workdone1; /* # done by others #1. */ + atomic_long_t exp_workdone2; /* # done by others #2. */ + atomic_long_t exp_workdone3; /* # done by others #3. */ /* 7) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU @@ -505,6 +501,8 @@ struct rcu_state { /* _rcu_barrier(). */ /* End of fields guarded by barrier_mutex. */ + struct mutex exp_mutex; /* Serialize expedited GP. */ + struct mutex exp_wake_mutex; /* Serialize wakeup. */ unsigned long expedited_sequence; /* Take a ticket. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ @@ -513,6 +511,8 @@ struct rcu_state { unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ + unsigned long jiffies_kick_kthreads; /* Time at which to kick */ + /* kthreads, if configured. */ unsigned long n_force_qs; /* Number of calls to */ /* force_quiescent_state(). */ unsigned long n_force_qs_lh; /* ~Number of calls leaving */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index efdf7b61ce12..ff1cd4e1188d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -722,18 +722,22 @@ static void sync_rcu_exp_handler(void *info) * synchronize_rcu_expedited - Brute-force RCU grace period * * Wait for an RCU-preempt grace period, but expedite it. The basic - * idea is to invoke synchronize_sched_expedited() to push all the tasks to - * the ->blkd_tasks lists and wait for this list to drain. This consumes - * significant time on all CPUs and is unfriendly to real-time workloads, - * so is thus not recommended for any sort of common-case code. - * In fact, if you are using synchronize_rcu_expedited() in a loop, - * please restructure your code to batch your updates, and then Use a - * single synchronize_rcu() instead. + * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler + * checks whether the CPU is in an RCU-preempt critical section, and + * if so, it sets a flag that causes the outermost rcu_read_unlock() + * to report the quiescent state. On the other hand, if the CPU is + * not in an RCU read-side critical section, the IPI handler reports + * the quiescent state immediately. + * + * Although this is a great improvement over previous expedited + * implementations, it is still unfriendly to real-time workloads, and is + * thus not recommended for any sort of common-case code. In fact, if + * you are using synchronize_rcu_expedited() in a loop, please restructure + * your code to batch your updates, and then use a single synchronize_rcu() + * instead. 
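 *
 * For example (illustrative only; update_element() stands in for
 * whatever publishes the caller's updates), instead of:
 *
 *	for (i = 0; i < n; i++) {
 *		update_element(i);
 *		synchronize_rcu_expedited();
 *	}
 *
 * batch the updates and pay for a single grace period:
 *
 *	for (i = 0; i < n; i++)
 *		update_element(i);
 *	synchronize_rcu();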
*/ void synchronize_rcu_expedited(void) { - struct rcu_node *rnp; - struct rcu_node *rnp_unlock; struct rcu_state *rsp = rcu_state_p; unsigned long s; @@ -744,23 +748,14 @@ void synchronize_rcu_expedited(void) } s = rcu_exp_gp_seq_snap(rsp); - - rnp_unlock = exp_funnel_lock(rsp, s); - if (rnp_unlock == NULL) + if (exp_funnel_lock(rsp, s)) return; /* Someone else did our work for us. */ - rcu_exp_gp_seq_start(rsp); - /* Initialize the rcu_node tree in preparation for the wait. */ sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); - /* Wait for snapshotted ->blkd_tasks lists to drain. */ - rnp = rcu_get_root(rsp); - synchronize_sched_expedited_wait(rsp); - - /* Clean up and exit. */ - rcu_exp_gp_seq_end(rsp); - mutex_unlock(&rnp_unlock->exp_funnel_mutex); + /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ + rcu_exp_wait_wake(rsp, s); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 1088e64f01ad..86782f9a4604 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -185,17 +185,16 @@ static int show_rcuexp(struct seq_file *m, void *v) int cpu; struct rcu_state *rsp = (struct rcu_state *)m->private; struct rcu_data *rdp; - unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; + unsigned long s1 = 0, s2 = 0, s3 = 0; for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(rsp->rda, cpu); - s0 += atomic_long_read(&rdp->expedited_workdone0); - s1 += atomic_long_read(&rdp->expedited_workdone1); - s2 += atomic_long_read(&rdp->expedited_workdone2); - s3 += atomic_long_read(&rdp->expedited_workdone3); + s1 += atomic_long_read(&rdp->exp_workdone1); + s2 += atomic_long_read(&rdp->exp_workdone2); + s3 += atomic_long_read(&rdp->exp_workdone3); } - seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", - rsp->expedited_sequence, s0, s1, s2, s3, + seq_printf(m, "s=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", + rsp->expedited_sequence, s1, s2, s3, atomic_long_read(&rsp->expedited_normal), atomic_read(&rsp->expedited_need_qs), rsp->expedited_sequence / 2); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index ca828b41c938..3ccdc8eebc5a 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -67,7 +67,7 @@ static int rcu_normal_after_boot; module_param(rcu_normal_after_boot, int, 0); #endif /* #ifndef CONFIG_TINY_RCU */ -#if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) +#ifdef CONFIG_DEBUG_LOCK_ALLOC /** * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? * @@ -111,7 +111,7 @@ int rcu_read_lock_sched_held(void) return 0; if (debug_locks) lockdep_opinion = lock_is_held(&rcu_sched_lock_map); - return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); + return lockdep_opinion || !preemptible(); } EXPORT_SYMBOL(rcu_read_lock_sched_held); #endif diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index fedb967a9841..e85a725e5c34 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -318,6 +318,7 @@ u64 sched_clock_cpu(int cpu) return clock; } +EXPORT_SYMBOL_GPL(sched_clock_cpu); void sched_clock_tick(void) { @@ -363,39 +364,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -/* - * As outlined at the top, provides a fast, high resolution, nanosecond - * time source that is monotonic per cpu argument and has bounded drift - * between cpus. 
- * - * ######################### BIG FAT WARNING ########################## - * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # - * # go backwards !! # - * #################################################################### - */ -u64 cpu_clock(int cpu) -{ - if (!sched_clock_stable()) - return sched_clock_cpu(cpu); - - return sched_clock(); -} - -/* - * Similar to cpu_clock() for the current cpu. Time will only be observed - * to be monotonic if care is taken to only compare timestampt taken on the - * same CPU. - * - * See cpu_clock(). - */ -u64 local_clock(void) -{ - if (!sched_clock_stable()) - return sched_clock_cpu(raw_smp_processor_id()); - - return sched_clock(); -} - #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ void sched_clock_init(void) @@ -410,22 +378,8 @@ u64 sched_clock_cpu(int cpu) return sched_clock(); } - -u64 cpu_clock(int cpu) -{ - return sched_clock(); -} - -u64 local_clock(void) -{ - return sched_clock(); -} - #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ -EXPORT_SYMBOL_GPL(cpu_clock); -EXPORT_SYMBOL_GPL(local_clock); - /* * Running clock - returns the time that has elapsed while a guest has been * running. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d1f7149f8704..404c0784b1fc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -33,7 +33,7 @@ #include <linux/init.h> #include <linux/uaccess.h> #include <linux/highmem.h> -#include <asm/mmu_context.h> +#include <linux/mmu_context.h> #include <linux/interrupt.h> #include <linux/capability.h> #include <linux/completion.h> @@ -170,6 +170,71 @@ static struct rq *this_rq_lock(void) return rq; } +/* + * __task_rq_lock - lock the rq @p resides on. + */ +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(rq->lock) +{ + struct rq *rq; + + lockdep_assert_held(&p->pi_lock); + + for (;;) { + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { + rf->cookie = lockdep_pin_lock(&rq->lock); + return rq; + } + raw_spin_unlock(&rq->lock); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); + } +} + +/* + * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. + */ +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(p->pi_lock) + __acquires(rq->lock) +{ + struct rq *rq; + + for (;;) { + raw_spin_lock_irqsave(&p->pi_lock, rf->flags); + rq = task_rq(p); + raw_spin_lock(&rq->lock); + /* + * move_queued_task() task_rq_lock() + * + * ACQUIRE (rq->lock) + * [S] ->on_rq = MIGRATING [L] rq = task_rq() + * WMB (__set_task_cpu()) ACQUIRE (rq->lock); + * [S] ->cpu = new_cpu [L] task_rq() + * [L] ->on_rq + * RELEASE (rq->lock) + * + * If we observe the old cpu in task_rq_lock, the acquire of + * the old rq->lock will fully serialize against the stores. + * + * If we observe the new cpu in task_rq_lock, the acquire will + * pair with the WMB to ensure we must then also see migrating. + */ + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { + rf->cookie = lockdep_pin_lock(&rq->lock); + return rq; + } + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); + } +} + #ifdef CONFIG_SCHED_HRTICK /* * Use HR-timers to deliver accurate preemption points. 
@@ -249,29 +314,6 @@ void hrtick_start(struct rq *rq, u64 delay) } } -static int -hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int cpu = (int)(long)hcpu; - - switch (action) { - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - hrtick_clear(cpu_rq(cpu)); - return NOTIFY_OK; - } - - return NOTIFY_DONE; -} - -static __init void init_hrtick(void) -{ - hotcpu_notifier(hotplug_hrtick, 0); -} #else /* * Called to set the hrtick timer state. @@ -288,10 +330,6 @@ void hrtick_start(struct rq *rq, u64 delay) hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL_PINNED); } - -static inline void init_hrtick(void) -{ -} #endif /* CONFIG_SMP */ static void init_rq_hrtick(struct rq *rq) @@ -315,10 +353,6 @@ static inline void hrtick_clear(struct rq *rq) static inline void init_rq_hrtick(struct rq *rq) { } - -static inline void init_hrtick(void) -{ -} #endif /* CONFIG_SCHED_HRTICK */ /* @@ -400,7 +434,7 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) * wakeup due to that. * * This cmpxchg() implies a full barrier, which pairs with the write - * barrier implied by the wakeup in wake_up_list(). + * barrier implied by the wakeup in wake_up_q(). */ if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) return; @@ -499,7 +533,10 @@ int get_nohz_timer_target(void) rcu_read_lock(); for_each_domain(cpu, sd) { for_each_cpu(i, sched_domain_span(sd)) { - if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) { + if (cpu == i) + continue; + + if (!idle_cpu(i) && is_housekeeping_cpu(i)) { cpu = i; goto unlock; } @@ -1085,12 +1122,20 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) static int __set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask, bool check) { - unsigned long flags; - struct rq *rq; + const struct cpumask *cpu_valid_mask = cpu_active_mask; unsigned int dest_cpu; + struct rq_flags rf; + struct rq *rq; int ret = 0; - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); + + if (p->flags & PF_KTHREAD) { + /* + * Kernel threads are allowed on online && !active CPUs + */ + cpu_valid_mask = cpu_online_mask; + } /* * Must re-check here, to close a race against __kthread_bind(), @@ -1104,22 +1149,32 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_equal(&p->cpus_allowed, new_mask)) goto out; - if (!cpumask_intersects(new_mask, cpu_active_mask)) { + if (!cpumask_intersects(new_mask, cpu_valid_mask)) { ret = -EINVAL; goto out; } do_set_cpus_allowed(p, new_mask); + if (p->flags & PF_KTHREAD) { + /* + * For kernel threads that do indeed end up on online && + * !active we want to ensure they are strict per-cpu threads. + */ + WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && + !cpumask_intersects(new_mask, cpu_active_mask) && + p->nr_cpus_allowed != 1); + } + /* Can the task run on the task's current CPU? If so, we're done */ if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); + dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. 
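 * (stop_one_cpu() runs migration_cpu_stop() on the task's CPU via the
 * stopper thread and returns only once the callback has completed, so
 * the migration is finished by the time execution resumes here.)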
*/ - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); tlb_migrate_finish(p->mm); return 0; @@ -1128,12 +1183,12 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, * OK, since we're going to drop the lock immediately * afterwards anyway. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, rf.cookie); rq = move_queued_task(rq, p, dest_cpu); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, rf.cookie); } out: - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return ret; } @@ -1317,8 +1372,8 @@ out: */ unsigned long wait_task_inactive(struct task_struct *p, long match_state) { - unsigned long flags; int running, queued; + struct rq_flags rf; unsigned long ncsw; struct rq *rq; @@ -1353,14 +1408,14 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * lock now, to be *sure*. If we're wrong, we'll * just go back and repeat. */ - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); trace_sched_wait_task(p); running = task_running(rq, p); queued = task_on_rq_queued(p); ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); /* * If it changed from the expected state, bail out now. @@ -1434,6 +1489,25 @@ EXPORT_SYMBOL_GPL(kick_process); /* * ->cpus_allowed is protected by both rq->lock and p->pi_lock + * + * A few notes on cpu_active vs cpu_online: + * + * - cpu_active must be a subset of cpu_online + * + * - on cpu-up we allow per-cpu kthreads on the online && !active cpu, + * see __set_cpus_allowed_ptr(). At this point the newly online + * cpu isn't yet part of the sched domains, and balancing will not + * see it. + * + * - on cpu-down we clear cpu_active() to mask the sched domains and + * prevent the load balancer from placing new tasks on the CPU being + * removed. Existing tasks will remain running there and will be taken + * off. + * + * This means that fallback selection must not select !active CPUs, + * and it can assume that any active CPU must be online. Conversely, + * select_task_rq() below may allow selection of !active CPUs in order + * to satisfy the above rules. */ static int select_fallback_rq(int cpu, struct task_struct *p) { @@ -1452,8 +1526,6 @@ static int select_fallback_rq(int cpu, struct task_struct *p) /* Look for allowed, online CPU in same node. */ for_each_cpu(dest_cpu, nodemask) { - if (!cpu_online(dest_cpu)) - continue; if (!cpu_active(dest_cpu)) continue; if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) @@ -1464,8 +1536,6 @@ static int select_fallback_rq(int cpu, struct task_struct *p) for (;;) { /* Any allowed, online CPU? */ for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { - if (!cpu_online(dest_cpu)) - continue; if (!cpu_active(dest_cpu)) continue; goto out; @@ -1515,8 +1585,10 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { lockdep_assert_held(&p->pi_lock); - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); + else + cpu = cpumask_any(tsk_cpus_allowed(p)); /* * In order not to call set_task_cpu() on a blocking task we need @@ -1604,8 +1676,8 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl /* * Mark the task runnable and perform wakeup-preemption. 
*/ -static void -ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) +static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, + struct pin_cookie cookie) { check_preempt_curr(rq, p, wake_flags); p->state = TASK_RUNNING; @@ -1617,9 +1689,9 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) * Our task @p is fully woken up and running; so its safe to * drop the rq->lock, hereafter rq is only used for statistics. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); p->sched_class->task_woken(rq, p); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, cookie); } if (rq->idle_stamp) { @@ -1637,17 +1709,23 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) } static void -ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, + struct pin_cookie cookie) { + int en_flags = ENQUEUE_WAKEUP; + lockdep_assert_held(&rq->lock); #ifdef CONFIG_SMP if (p->sched_contributes_to_load) rq->nr_uninterruptible--; + + if (wake_flags & WF_MIGRATED) + en_flags |= ENQUEUE_MIGRATED; #endif - ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); - ttwu_do_wakeup(rq, p, wake_flags); + ttwu_activate(rq, p, en_flags); + ttwu_do_wakeup(rq, p, wake_flags, cookie); } /* @@ -1658,17 +1736,18 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) */ static int ttwu_remote(struct task_struct *p, int wake_flags) { + struct rq_flags rf; struct rq *rq; int ret = 0; - rq = __task_rq_lock(p); + rq = __task_rq_lock(p, &rf); if (task_on_rq_queued(p)) { /* check_preempt_curr() may use rq clock */ update_rq_clock(rq); - ttwu_do_wakeup(rq, p, wake_flags); + ttwu_do_wakeup(rq, p, wake_flags, rf.cookie); ret = 1; } - __task_rq_unlock(rq); + __task_rq_unlock(rq, &rf); return ret; } @@ -1678,6 +1757,7 @@ void sched_ttwu_pending(void) { struct rq *rq = this_rq(); struct llist_node *llist = llist_del_all(&rq->wake_list); + struct pin_cookie cookie; struct task_struct *p; unsigned long flags; @@ -1685,15 +1765,19 @@ void sched_ttwu_pending(void) return; raw_spin_lock_irqsave(&rq->lock, flags); - lockdep_pin_lock(&rq->lock); + cookie = lockdep_pin_lock(&rq->lock); while (llist) { p = llist_entry(llist, struct task_struct, wake_entry); llist = llist_next(llist); - ttwu_do_activate(rq, p, 0); + /* + * See ttwu_queue(); we only call ttwu_queue_remote() when + * it's a cross-CPU wakeup. 
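 * These wakeups are therefore all remote and are treated as migrated:
 * WF_MIGRATED is passed below and ttwu_do_activate() turns it into
 * ENQUEUE_MIGRATED.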
+ */ + ttwu_do_activate(rq, p, WF_MIGRATED, cookie); } - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -1777,9 +1861,10 @@ bool cpus_share_cache(int this_cpu, int that_cpu) } #endif /* CONFIG_SMP */ -static void ttwu_queue(struct task_struct *p, int cpu) +static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); + struct pin_cookie cookie; #if defined(CONFIG_SMP) if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { @@ -1790,9 +1875,9 @@ static void ttwu_queue(struct task_struct *p, int cpu) #endif raw_spin_lock(&rq->lock); - lockdep_pin_lock(&rq->lock); - ttwu_do_activate(rq, p, 0); - lockdep_unpin_lock(&rq->lock); + cookie = lockdep_pin_lock(&rq->lock); + ttwu_do_activate(rq, p, wake_flags, cookie); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock(&rq->lock); } @@ -1961,9 +2046,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; - if (p->sched_class->task_waking) - p->sched_class->task_waking(p); - cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; @@ -1971,7 +2053,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) } #endif /* CONFIG_SMP */ - ttwu_queue(p, cpu); + ttwu_queue(p, cpu, wake_flags); stat: if (schedstat_enabled()) ttwu_stat(p, cpu, wake_flags); @@ -1989,7 +2071,7 @@ out: * ensure that this_rq() is locked, @p is bound to this_rq() and not * the current task. */ -static void try_to_wake_up_local(struct task_struct *p) +static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie) { struct rq *rq = task_rq(p); @@ -2006,11 +2088,11 @@ static void try_to_wake_up_local(struct task_struct *p) * disabled avoiding further scheduler activity on it and we've * not yet picked a replacement task. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock(&rq->lock); raw_spin_lock(&p->pi_lock); raw_spin_lock(&rq->lock); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, cookie); } if (!(p->state & TASK_NORMAL)) @@ -2021,7 +2103,7 @@ static void try_to_wake_up_local(struct task_struct *p) if (!task_on_rq_queued(p)) ttwu_activate(rq, p, ENQUEUE_WAKEUP); - ttwu_do_wakeup(rq, p, 0); + ttwu_do_wakeup(rq, p, 0, cookie); if (schedstat_enabled()) ttwu_stat(p, smp_processor_id(), 0); out: @@ -2381,7 +2463,8 @@ static int dl_overflow(struct task_struct *p, int policy, u64 new_bw = dl_policy(policy) ? 
to_ratio(period, runtime) : 0; int cpus, err = -1; - if (new_bw == p->dl.dl_bw) + /* !deadline task may carry old deadline bandwidth */ + if (new_bw == p->dl.dl_bw && task_has_dl_policy(p)) return 0; /* @@ -2420,12 +2503,12 @@ extern void init_dl_bw(struct dl_bw *dl_b); */ void wake_up_new_task(struct task_struct *p) { - unsigned long flags; + struct rq_flags rf; struct rq *rq; - raw_spin_lock_irqsave(&p->pi_lock, flags); /* Initialize new task's runnable average */ init_entity_runnable_average(&p->se); + raw_spin_lock_irqsave(&p->pi_lock, rf.flags); #ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: @@ -2434,8 +2517,10 @@ void wake_up_new_task(struct task_struct *p) */ set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif + /* Post initialize new task's util average when its cfs_rq is set */ + post_init_entity_util_avg(&p->se); - rq = __task_rq_lock(p); + rq = __task_rq_lock(p, &rf); activate_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); @@ -2446,12 +2531,12 @@ void wake_up_new_task(struct task_struct *p) * Nothing relies on rq->lock after this, so its fine to * drop it. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, rf.cookie); p->sched_class->task_woken(rq, p); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, rf.cookie); } #endif - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); } #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -2713,7 +2798,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) */ static __always_inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next) + struct task_struct *next, struct pin_cookie cookie) { struct mm_struct *mm, *oldmm; @@ -2733,7 +2818,7 @@ context_switch(struct rq *rq, struct task_struct *prev, atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else - switch_mm(oldmm, mm, next); + switch_mm_irqs_off(oldmm, mm, next); if (!prev->mm) { prev->active_mm = NULL; @@ -2745,7 +2830,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * of the scheduler it's an obvious special-case), so we * do an early lockdep release here: */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); spin_release(&rq->lock.dep_map, 1, _THIS_IP_); /* Here we just switch the register state and the stack. */ @@ -2867,7 +2952,7 @@ EXPORT_PER_CPU_SYMBOL(kernel_cpustat); */ unsigned long long task_sched_runtime(struct task_struct *p) { - unsigned long flags; + struct rq_flags rf; struct rq *rq; u64 ns; @@ -2887,7 +2972,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) return p->se.sum_exec_runtime; #endif - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); /* * Must be ->curr _and_ ->on_rq. 
If dequeued, we would * project cycles that may never be accounted to this * @@ -2898,7 +2983,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) p->sched_class->update_curr(rq); } ns = p->se.sum_exec_runtime; - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return ns; } @@ -2918,7 +3003,7 @@ void scheduler_tick(void) raw_spin_lock(&rq->lock); update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); - update_cpu_load_active(rq); + cpu_load_update_active(rq); calc_global_load_tick(rq); raw_spin_unlock(&rq->lock); @@ -2961,6 +3046,20 @@ u64 scheduler_tick_max_deferment(void) #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_PREEMPT_TRACER)) +/* + * If the value passed in is equal to the current preempt count + * then we just disabled preemption. Start timing the latency. + */ +static inline void preempt_latency_start(int val) +{ + if (preempt_count() == val) { + unsigned long ip = get_lock_parent_ip(); +#ifdef CONFIG_DEBUG_PREEMPT + current->preempt_disable_ip = ip; +#endif + trace_preempt_off(CALLER_ADDR0, ip); + } +} void preempt_count_add(int val) { @@ -2979,17 +3078,21 @@ void preempt_count_add(int val) DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); #endif - if (preempt_count() == val) { - unsigned long ip = get_lock_parent_ip(); -#ifdef CONFIG_DEBUG_PREEMPT - current->preempt_disable_ip = ip; -#endif - trace_preempt_off(CALLER_ADDR0, ip); - } + preempt_latency_start(val); } EXPORT_SYMBOL(preempt_count_add); NOKPROBE_SYMBOL(preempt_count_add); +/* + * If the value passed in is equal to the current preempt count + * then we just enabled preemption. Stop timing the latency. + */ +static inline void preempt_latency_stop(int val) +{ + if (preempt_count() == val) + trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); +} + void preempt_count_sub(int val) { #ifdef CONFIG_DEBUG_PREEMPT @@ -3006,13 +3109,15 @@ void preempt_count_sub(int val) return; #endif - if (preempt_count() == val) - trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); + preempt_latency_stop(val); __preempt_count_sub(val); } EXPORT_SYMBOL(preempt_count_sub); NOKPROBE_SYMBOL(preempt_count_sub); +#else +static inline void preempt_latency_start(int val) { } +static inline void preempt_latency_stop(int val) { } #endif /* @@ -3065,7 +3170,7 @@ static inline void schedule_debug(struct task_struct *prev) * Pick up the highest-prio task: */ static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev) +pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { const struct sched_class *class = &fair_sched_class; struct task_struct *p; @@ -3076,20 +3181,20 @@ pick_next_task(struct rq *rq, struct task_struct *prev) */ if (likely(prev->sched_class == class && rq->nr_running == rq->cfs.h_nr_running)) { - p = fair_sched_class.pick_next_task(rq, prev); + p = fair_sched_class.pick_next_task(rq, prev, cookie); if (unlikely(p == RETRY_TASK)) goto again; /* assumes fair_sched_class->next == idle_sched_class */ if (unlikely(!p)) - p = idle_sched_class.pick_next_task(rq, prev); + p = idle_sched_class.pick_next_task(rq, prev, cookie); return p; } again: for_each_class(class) { - p = class->pick_next_task(rq, prev); + p = class->pick_next_task(rq, prev, cookie); if (p) { if (unlikely(p == RETRY_TASK)) goto again; @@ -3143,6 +3248,7 @@ static void __sched notrace __schedule(bool preempt) { struct task_struct *prev, *next; unsigned long *switch_count; + struct pin_cookie cookie; struct rq *rq; int cpu; 
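The lockdep pin/repin conversions threaded through this file follow one pattern: task_rq_lock() now returns with rq->lock pinned and the cookie stored in struct rq_flags, and any path that must legitimately drop the lock unpins first, then repins with the same cookie afterwards. A minimal sketch of the pattern, using only helpers visible in these hunks:

	struct rq_flags rf;
	struct rq *rq;

	rq = task_rq_lock(p, &rf);	/* acquires and pins rq->lock; cookie in rf.cookie */

	/* work that relies on rq->lock staying held runs while pinned */

	lockdep_unpin_lock(&rq->lock, rf.cookie);	/* announce a legitimate unlock */
	raw_spin_unlock(&rq->lock);
	/* lock-free work */
	raw_spin_lock(&rq->lock);
	lockdep_repin_lock(&rq->lock, rf.cookie);	/* pinned again under the old cookie */

	task_rq_unlock(rq, p, &rf);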
@@ -3176,7 +3282,7 @@ static void __sched notrace __schedule(bool preempt) */ smp_mb__before_spinlock(); raw_spin_lock(&rq->lock); - lockdep_pin_lock(&rq->lock); + cookie = lockdep_pin_lock(&rq->lock); rq->clock_skip_update <<= 1; /* promote REQ to ACT */ @@ -3198,7 +3304,7 @@ static void __sched notrace __schedule(bool preempt) to_wakeup = wq_worker_sleeping(prev); if (to_wakeup) - try_to_wake_up_local(to_wakeup); + try_to_wake_up_local(to_wakeup, cookie); } } switch_count = &prev->nvcsw; @@ -3207,7 +3313,7 @@ static void __sched notrace __schedule(bool preempt) if (task_on_rq_queued(prev)) update_rq_clock(rq); - next = pick_next_task(rq, prev); + next = pick_next_task(rq, prev, cookie); clear_tsk_need_resched(prev); clear_preempt_need_resched(); rq->clock_skip_update = 0; @@ -3218,9 +3324,9 @@ static void __sched notrace __schedule(bool preempt) ++*switch_count; trace_sched_switch(preempt, prev, next); - rq = context_switch(rq, prev, next); /* unlocks the rq */ + rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */ } else { - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock_irq(&rq->lock); } @@ -3287,8 +3393,23 @@ void __sched schedule_preempt_disabled(void) static void __sched notrace preempt_schedule_common(void) { do { + /* + * Because the function tracer can trace preempt_count_sub() + * and it also uses preempt_enable/disable_notrace(), if + * NEED_RESCHED is set, the preempt_enable_notrace() called + * by the function tracer will call this function again and + * cause infinite recursion. + * + * Preemption must be disabled here before the function + * tracer can trace. Break up preempt_disable() into two + * calls. One to disable preemption without fear of being + * traced. The other to still record the preemption latency, + * which can also be traced by the function tracer. + */ preempt_disable_notrace(); + preempt_latency_start(1); __schedule(true); + preempt_latency_stop(1); preempt_enable_no_resched_notrace(); /* @@ -3340,7 +3461,21 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) return; do { + /* + * Because the function tracer can trace preempt_count_sub() + * and it also uses preempt_enable/disable_notrace(), if + * NEED_RESCHED is set, the preempt_enable_notrace() called + * by the function tracer will call this function again and + * cause infinite recursion. + * + * Preemption must be disabled here before the function + * tracer can trace. Break up preempt_disable() into two + * calls. One to disable preemption without fear of being + * traced. The other to still record the preemption latency, + * which can also be traced by the function tracer. + */ preempt_disable_notrace(); + preempt_latency_start(1); /* * Needs preempt disabled in case user_exit() is traced * and the tracer calls preempt_enable_notrace() causing @@ -3350,6 +3485,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) __schedule(true); exception_exit(prev_ctx); + preempt_latency_stop(1); preempt_enable_no_resched_notrace(); } while (need_resched()); } @@ -3406,12 +3542,13 @@ EXPORT_SYMBOL(default_wake_function); void rt_mutex_setprio(struct task_struct *p, int prio) { int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; - struct rq *rq; const struct sched_class *prev_class; + struct rq_flags rf; + struct rq *rq; BUG_ON(prio > MAX_PRIO); - rq = __task_rq_lock(p); + rq = __task_rq_lock(p, &rf); /* * Idle task boosting is a nono in general. 
There is one @@ -3487,7 +3624,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) check_class_changed(rq, p, prev_class, oldprio); out_unlock: preempt_disable(); /* avoid rq from going away on us */ - __task_rq_unlock(rq); + __task_rq_unlock(rq, &rf); balance_callback(rq); preempt_enable(); @@ -3497,7 +3634,7 @@ out_unlock: void set_user_nice(struct task_struct *p, long nice) { int old_prio, delta, queued; - unsigned long flags; + struct rq_flags rf; struct rq *rq; if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) @@ -3506,7 +3643,7 @@ void set_user_nice(struct task_struct *p, long nice) * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. */ - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@ -3537,7 +3674,7 @@ void set_user_nice(struct task_struct *p, long nice) resched_curr(rq); } out_unlock: - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); } EXPORT_SYMBOL(set_user_nice); @@ -3834,11 +3971,11 @@ static int __sched_setscheduler(struct task_struct *p, MAX_RT_PRIO - 1 - attr->sched_priority; int retval, oldprio, oldpolicy = -1, queued, running; int new_effective_prio, policy = attr->sched_policy; - unsigned long flags; const struct sched_class *prev_class; - struct rq *rq; + struct rq_flags rf; int reset_on_fork; int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; + struct rq *rq; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); @@ -3933,13 +4070,13 @@ recheck: * To be able to change p->policy safely, the appropriate * runqueue lock must be held. */ - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); /* * Changing the policy of the stop threads its a very bad idea */ if (p == rq->stop) { - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return -EINVAL; } @@ -3956,7 +4093,7 @@ recheck: goto change; p->sched_reset_on_fork = reset_on_fork; - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return 0; } change: @@ -3970,7 +4107,7 @@ change: if (rt_bandwidth_enabled() && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0 && !task_group_is_autogroup(task_group(p))) { - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return -EPERM; } #endif @@ -3985,7 +4122,7 @@ change: */ if (!cpumask_subset(span, &p->cpus_allowed) || rq->rd->dl_bw.bw == 0) { - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return -EPERM; } } @@ -3995,7 +4132,7 @@ change: /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); goto recheck; } @@ -4005,7 +4142,7 @@ change: * is available. 
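 * (The admission test is utilization-based: to_ratio() converts each
 * task's runtime/period pair into a fixed-point utilization, and
 * dl_overflow() roughly checks that the summed utilization of all
 * deadline tasks stays within the root domain's dl_bw budget.)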
*/ if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return -EBUSY; } @@ -4050,7 +4187,7 @@ change: check_class_changed(rq, p, prev_class, oldprio); preempt_disable(); /* avoid rq from going away on us */ - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); if (pi) rt_mutex_adjust_pi(p); @@ -4903,10 +5040,10 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, { struct task_struct *p; unsigned int time_slice; - unsigned long flags; + struct rq_flags rf; + struct timespec t; struct rq *rq; int retval; - struct timespec t; if (pid < 0) return -EINVAL; @@ -4921,11 +5058,11 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, if (retval) goto out_unlock; - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); time_slice = 0; if (p->sched_class->get_rr_interval) time_slice = p->sched_class->get_rr_interval(rq, p); - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); rcu_read_unlock(); jiffies_to_timespec(time_slice, &t); @@ -5001,7 +5138,8 @@ void show_state_filter(unsigned long state_filter) touch_all_softlockup_watchdogs(); #ifdef CONFIG_SCHED_DEBUG - sysrq_sched_debug_show(); + if (!state_filter) + sysrq_sched_debug_show(); #endif rcu_read_unlock(); /* @@ -5163,6 +5301,8 @@ out: #ifdef CONFIG_SMP +static bool sched_smp_initialized __read_mostly; + #ifdef CONFIG_NUMA_BALANCING /* Migrate current task p to target_cpu */ int migrate_task_to(struct task_struct *p, int target_cpu) @@ -5188,11 +5328,11 @@ int migrate_task_to(struct task_struct *p, int target_cpu) */ void sched_setnuma(struct task_struct *p, int nid) { - struct rq *rq; - unsigned long flags; bool queued, running; + struct rq_flags rf; + struct rq *rq; - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); queued = task_on_rq_queued(p); running = task_current(rq, p); @@ -5207,7 +5347,7 @@ void sched_setnuma(struct task_struct *p, int nid) p->sched_class->set_curr_task(rq); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE); - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); } #endif /* CONFIG_NUMA_BALANCING */ @@ -5223,7 +5363,7 @@ void idle_task_exit(void) BUG_ON(cpu_online(smp_processor_id())); if (mm != &init_mm) { - switch_mm(mm, &init_mm, current); + switch_mm_irqs_off(mm, &init_mm, current); finish_arch_post_lock_switch(); } mmdrop(mm); @@ -5271,6 +5411,7 @@ static void migrate_tasks(struct rq *dead_rq) { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; + struct pin_cookie cookie; int dest_cpu; /* @@ -5302,8 +5443,8 @@ static void migrate_tasks(struct rq *dead_rq) /* * pick_next_task assumes pinned rq->lock. */ - lockdep_pin_lock(&rq->lock); - next = pick_next_task(rq, &fake_task); + cookie = lockdep_pin_lock(&rq->lock); + next = pick_next_task(rq, &fake_task, cookie); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); @@ -5316,7 +5457,7 @@ static void migrate_tasks(struct rq *dead_rq) * because !cpu_active at this point, which means load-balance * will not interfere. Also, stop-machine. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock(&rq->lock); raw_spin_lock(&next->pi_lock); raw_spin_lock(&rq->lock); @@ -5377,127 +5518,13 @@ static void set_rq_offline(struct rq *rq) } } -/* - * migration_call - callback that gets triggered when a CPU is added. - * Here we can start up the necessary migration thread for the new CPU. 
- */ -static int -migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) +static void set_cpu_rq_start_time(unsigned int cpu) { - int cpu = (long)hcpu; - unsigned long flags; struct rq *rq = cpu_rq(cpu); - switch (action & ~CPU_TASKS_FROZEN) { - - case CPU_UP_PREPARE: - rq->calc_load_update = calc_load_update; - account_reset_rq(rq); - break; - - case CPU_ONLINE: - /* Update our root-domain */ - raw_spin_lock_irqsave(&rq->lock, flags); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - - set_rq_online(rq); - } - raw_spin_unlock_irqrestore(&rq->lock, flags); - break; - -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DYING: - sched_ttwu_pending(); - /* Update our root-domain */ - raw_spin_lock_irqsave(&rq->lock, flags); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_offline(rq); - } - migrate_tasks(rq); - BUG_ON(rq->nr_running != 1); /* the migration thread */ - raw_spin_unlock_irqrestore(&rq->lock, flags); - break; - - case CPU_DEAD: - calc_load_migrate(rq); - break; -#endif - } - - update_max_interval(); - - return NOTIFY_OK; -} - -/* - * Register at high priority so that task migration (migrate_all_tasks) - * happens before everything else. This has to be lower priority than - * the notifier in the perf_event subsystem, though. - */ -static struct notifier_block migration_notifier = { - .notifier_call = migration_call, - .priority = CPU_PRI_MIGRATION, -}; - -static void set_cpu_rq_start_time(void) -{ - int cpu = smp_processor_id(); - struct rq *rq = cpu_rq(cpu); rq->age_stamp = sched_clock_cpu(cpu); } -static int sched_cpu_active(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_STARTING: - set_cpu_rq_start_time(); - return NOTIFY_OK; - - case CPU_DOWN_FAILED: - set_cpu_active(cpu, true); - return NOTIFY_OK; - - default: - return NOTIFY_DONE; - } -} - -static int sched_cpu_inactive(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_PREPARE: - set_cpu_active((long)hcpu, false); - return NOTIFY_OK; - default: - return NOTIFY_DONE; - } -} - -static int __init migration_init(void) -{ - void *cpu = (void *)(long)smp_processor_id(); - int err; - - /* Initialize migration for the boot CPU */ - err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); - BUG_ON(err == NOTIFY_BAD); - migration_call(&migration_notifier, CPU_ONLINE, cpu); - register_cpu_notifier(&migration_notifier); - - /* Register cpu active notifiers */ - cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); - cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); - - return 0; -} -early_initcall(migration_init); - static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ #ifdef CONFIG_SCHED_DEBUG @@ -6645,10 +6672,10 @@ static void sched_init_numa(void) init_numa_topology_type(); } -static void sched_domains_numa_masks_set(int cpu) +static void sched_domains_numa_masks_set(unsigned int cpu) { - int i, j; int node = cpu_to_node(cpu); + int i, j; for (i = 0; i < sched_domains_numa_levels; i++) { for (j = 0; j < nr_node_ids; j++) { @@ -6658,51 +6685,20 @@ static void sched_domains_numa_masks_set(int cpu) } } -static void sched_domains_numa_masks_clear(int cpu) +static void sched_domains_numa_masks_clear(unsigned int cpu) { int i, j; + for (i = 0; i < sched_domains_numa_levels; i++) { for (j = 0; j < nr_node_ids; j++) cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); } } -/* - * Update 
sched_domains_numa_masks[level][node] array when new cpus - * are onlined. - */ -static int sched_domains_numa_masks_update(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - sched_domains_numa_masks_set(cpu); - break; - - case CPU_DEAD: - sched_domains_numa_masks_clear(cpu); - break; - - default: - return NOTIFY_DONE; - } - - return NOTIFY_OK; -} #else -static inline void sched_init_numa(void) -{ -} - -static int sched_domains_numa_masks_update(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - return 0; -} +static inline void sched_init_numa(void) { } +static void sched_domains_numa_masks_set(unsigned int cpu) { } +static void sched_domains_numa_masks_clear(unsigned int cpu) { } #endif /* CONFIG_NUMA */ static int __sdt_alloc(const struct cpumask *cpu_map) @@ -7092,13 +7088,9 @@ static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ * If we come here as part of a suspend/resume, don't touch cpusets because we * want to restore it back to its original state upon resume anyway. */ -static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, - void *hcpu) +static void cpuset_cpu_active(void) { - switch (action) { - case CPU_ONLINE_FROZEN: - case CPU_DOWN_FAILED_FROZEN: - + if (cpuhp_tasks_frozen) { /* * num_cpus_frozen tracks how many CPUs are involved in suspend * resume sequence. As long as this is not the last online @@ -7108,35 +7100,25 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, num_cpus_frozen--; if (likely(num_cpus_frozen)) { partition_sched_domains(1, NULL, NULL); - break; + return; } - /* * This is the last CPU online operation. So fall through and * restore the original sched domains by considering the * cpuset configurations. */ - - case CPU_ONLINE: - cpuset_update_active_cpus(true); - break; - default: - return NOTIFY_DONE; } - return NOTIFY_OK; + cpuset_update_active_cpus(true); } -static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, - void *hcpu) +static int cpuset_cpu_inactive(unsigned int cpu) { unsigned long flags; - long cpu = (long)hcpu; struct dl_bw *dl_b; bool overflow; int cpus; - switch (action) { - case CPU_DOWN_PREPARE: + if (!cpuhp_tasks_frozen) { rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); @@ -7148,19 +7130,120 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, rcu_read_unlock_sched(); if (overflow) - return notifier_from_errno(-EBUSY); + return -EBUSY; cpuset_update_active_cpus(false); - break; - case CPU_DOWN_PREPARE_FROZEN: + } else { num_cpus_frozen++; partition_sched_domains(1, NULL, NULL); - break; - default: - return NOTIFY_DONE; } - return NOTIFY_OK; + return 0; } +int sched_cpu_activate(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + set_cpu_active(cpu, true); + + if (sched_smp_initialized) { + sched_domains_numa_masks_set(cpu); + cpuset_cpu_active(); + } + + /* + * Put the rq online, if not already. This happens: + * + * 1) In the early boot process, because we build the real domains + * after all cpus have been brought up. + * + * 2) At runtime, if cpuset_cpu_active() fails to rebuild the + * domains. 
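 *
 * (In the early-boot case sched_smp_initialized is still false, so the
 * numa-mask and cpuset updates above are skipped and only the rq is
 * put online here.)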
*/ + raw_spin_lock_irqsave(&rq->lock, flags); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_online(rq); + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + + update_max_interval(); + + return 0; +} + +int sched_cpu_deactivate(unsigned int cpu) +{ + int ret; + + set_cpu_active(cpu, false); + /* + * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU + * users of this state to go away such that all new such users will + * observe it. + * + * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might + * not imply sync_sched(), so wait for both. + * + * Do the sync before parking smpboot threads to take care of the rcu boost case. + */ + if (IS_ENABLED(CONFIG_PREEMPT)) + synchronize_rcu_mult(call_rcu, call_rcu_sched); + else + synchronize_rcu(); + + if (!sched_smp_initialized) + return 0; + + ret = cpuset_cpu_inactive(cpu); + if (ret) { + set_cpu_active(cpu, true); + return ret; + } + sched_domains_numa_masks_clear(cpu); + return 0; +} + +static void sched_rq_cpu_starting(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + rq->calc_load_update = calc_load_update; + account_reset_rq(rq); + update_max_interval(); +} + +int sched_cpu_starting(unsigned int cpu) +{ + set_cpu_rq_start_time(cpu); + sched_rq_cpu_starting(cpu); + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +int sched_cpu_dying(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + /* Handle pending wakeups and then migrate everything off */ + sched_ttwu_pending(); + raw_spin_lock_irqsave(&rq->lock, flags); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + migrate_tasks(rq); + BUG_ON(rq->nr_running != 1); + raw_spin_unlock_irqrestore(&rq->lock, flags); + calc_load_migrate(rq); + update_max_interval(); + nohz_balance_exit_idle(cpu); + hrtick_clear(rq); + return 0; +} +#endif + void __init sched_init_smp(void) { cpumask_var_t non_isolated_cpus; @@ -7182,12 +7265,6 @@ void __init sched_init_smp(void) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); - hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); - hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); - hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); - - init_hrtick(); - /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) BUG(); @@ -7196,7 +7273,16 @@ void __init sched_init_smp(void) init_sched_rt_class(); init_sched_dl_class(); + sched_smp_initialized = true; +} + +static int __init migration_init(void) +{ + sched_rq_cpu_starting(smp_processor_id()); + return 0; } +early_initcall(migration_init); + #else void __init sched_init_smp(void) { @@ -7331,8 +7417,6 @@ void __init sched_init(void) for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; - rq->last_load_update_tick = jiffies; - #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; @@ -7351,12 +7435,13 @@ void __init sched_init(void) rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ_COMMON + rq->last_load_update_tick = jiffies; rq->nohz_flags = 0; #endif #ifdef CONFIG_NO_HZ_FULL rq->last_sched_tick = 0; #endif -#endif +#endif /* CONFIG_SMP */ init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); } @@ -7394,7 +7479,7 @@ void __init sched_init(void) if (cpu_isolated_map == NULL) zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); idle_thread_set_boot_cpu(); - set_cpu_rq_start_time(); + set_cpu_rq_start_time(smp_processor_id()); #endif init_sched_fair_class(); @@ -7639,10 +7724,10 @@ 
void sched_move_task(struct task_struct *tsk) { struct task_group *tg; int queued, running; - unsigned long flags; + struct rq_flags rf; struct rq *rq; - rq = task_rq_lock(tsk, &flags); + rq = task_rq_lock(tsk, &rf); running = task_current(rq, tsk); queued = task_on_rq_queued(tsk); @@ -7674,7 +7759,7 @@ void sched_move_task(struct task_struct *tsk) if (queued) enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); - task_rq_unlock(rq, tsk, &flags); + task_rq_unlock(rq, tsk, &rf); } #endif /* CONFIG_CGROUP_SCHED */ @@ -7894,7 +7979,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) static int sched_rt_global_constraints(void) { unsigned long flags; - int i, ret = 0; + int i; raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { @@ -7906,7 +7991,7 @@ static int sched_rt_global_constraints(void) } raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); - return ret; + return 0; } #endif /* CONFIG_RT_GROUP_SCHED */ diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 4a811203c04a..41f85c4d0938 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -25,11 +25,22 @@ enum cpuacct_stat_index { CPUACCT_STAT_NSTATS, }; +enum cpuacct_usage_index { + CPUACCT_USAGE_USER, /* ... user mode */ + CPUACCT_USAGE_SYSTEM, /* ... kernel mode */ + + CPUACCT_USAGE_NRUSAGE, +}; + +struct cpuacct_usage { + u64 usages[CPUACCT_USAGE_NRUSAGE]; +}; + /* track cpu usage of a group of tasks and its child groups */ struct cpuacct { struct cgroup_subsys_state css; /* cpuusage holds pointer to a u64-type object on every cpu */ - u64 __percpu *cpuusage; + struct cpuacct_usage __percpu *cpuusage; struct kernel_cpustat __percpu *cpustat; }; @@ -49,7 +60,7 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca) return css_ca(ca->css.parent); } -static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); +static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage); static struct cpuacct root_cpuacct = { .cpustat = &kernel_cpustat, .cpuusage = &root_cpuacct_cpuusage, @@ -68,7 +79,7 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) if (!ca) goto out; - ca->cpuusage = alloc_percpu(u64); + ca->cpuusage = alloc_percpu(struct cpuacct_usage); if (!ca->cpuusage) goto out_free_ca; @@ -96,20 +107,37 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css) kfree(ca); } -static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, + enum cpuacct_usage_index index) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); u64 data; + /* + * We allow index == CPUACCT_USAGE_NRUSAGE here to read + * the sum of usages. + */ + BUG_ON(index > CPUACCT_USAGE_NRUSAGE); + #ifndef CONFIG_64BIT /* * Take rq->lock to make 64-bit read safe on 32-bit platforms. 
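 * A u64 load is two 32-bit loads there, so an unlocked reader could
 * race with a writer and observe a torn value, half old and half new.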
*/ raw_spin_lock_irq(&cpu_rq(cpu)->lock); - data = *cpuusage; +#endif + + if (index == CPUACCT_USAGE_NRUSAGE) { + int i = 0; + + data = 0; + for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) + data += cpuusage->usages[i]; + } else { + data = cpuusage->usages[index]; + } + +#ifndef CONFIG_64BIT raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - data = *cpuusage; #endif return data; @@ -117,69 +145,103 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + int i; #ifndef CONFIG_64BIT /* * Take rq->lock to make 64-bit write safe on 32-bit platforms. */ raw_spin_lock_irq(&cpu_rq(cpu)->lock); - *cpuusage = val; +#endif + + for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) + cpuusage->usages[i] = val; + +#ifndef CONFIG_64BIT raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - *cpuusage = val; #endif } /* return total cpu usage (in nanoseconds) of a group */ -static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) +static u64 __cpuusage_read(struct cgroup_subsys_state *css, + enum cpuacct_usage_index index) { struct cpuacct *ca = css_ca(css); u64 totalcpuusage = 0; int i; - for_each_present_cpu(i) - totalcpuusage += cpuacct_cpuusage_read(ca, i); + for_each_possible_cpu(i) + totalcpuusage += cpuacct_cpuusage_read(ca, i, index); return totalcpuusage; } +static u64 cpuusage_user_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return __cpuusage_read(css, CPUACCT_USAGE_USER); +} + +static u64 cpuusage_sys_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return __cpuusage_read(css, CPUACCT_USAGE_SYSTEM); +} + +static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + return __cpuusage_read(css, CPUACCT_USAGE_NRUSAGE); +} + static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { struct cpuacct *ca = css_ca(css); - int err = 0; - int i; + int cpu; /* * Only allow '0' here to do a reset. 
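 * For example, "echo 0 > cpuacct.usage" clears both the user and the
 * system counter on every possible CPU; any nonzero value is rejected
 * with -EINVAL.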
*/ - if (val) { - err = -EINVAL; - goto out; - } + if (val) + return -EINVAL; - for_each_present_cpu(i) - cpuacct_cpuusage_write(ca, i, 0); + for_each_possible_cpu(cpu) + cpuacct_cpuusage_write(ca, cpu, 0); -out: - return err; + return 0; } -static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) +static int __cpuacct_percpu_seq_show(struct seq_file *m, + enum cpuacct_usage_index index) { struct cpuacct *ca = css_ca(seq_css(m)); u64 percpu; int i; - for_each_present_cpu(i) { - percpu = cpuacct_cpuusage_read(ca, i); + for_each_possible_cpu(i) { + percpu = cpuacct_cpuusage_read(ca, i, index); seq_printf(m, "%llu ", (unsigned long long) percpu); } seq_printf(m, "\n"); return 0; } +static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V) +{ + return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_USER); +} + +static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V) +{ + return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_SYSTEM); +} + +static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) +{ + return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE); +} + static const char * const cpuacct_stat_desc[] = { [CPUACCT_STAT_USER] = "user", [CPUACCT_STAT_SYSTEM] = "system", @@ -191,7 +253,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v) int cpu; s64 val = 0; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); val += kcpustat->cpustat[CPUTIME_USER]; val += kcpustat->cpustat[CPUTIME_NICE]; @@ -200,7 +262,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v) seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val); val = 0; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); val += kcpustat->cpustat[CPUTIME_SYSTEM]; val += kcpustat->cpustat[CPUTIME_IRQ]; @@ -220,10 +282,26 @@ static struct cftype files[] = { .write_u64 = cpuusage_write, }, { + .name = "usage_user", + .read_u64 = cpuusage_user_read, + }, + { + .name = "usage_sys", + .read_u64 = cpuusage_sys_read, + }, + { .name = "usage_percpu", .seq_show = cpuacct_percpu_seq_show, }, { + .name = "usage_percpu_user", + .seq_show = cpuacct_percpu_user_seq_show, + }, + { + .name = "usage_percpu_sys", + .seq_show = cpuacct_percpu_sys_seq_show, + }, + { .name = "stat", .seq_show = cpuacct_stats_show, }, @@ -238,10 +316,17 @@ static struct cftype files[] = { void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; + int index = CPUACCT_USAGE_SYSTEM; + struct pt_regs *regs = task_pt_regs(tsk); + + if (regs && user_mode(regs)) + index = CPUACCT_USAGE_USER; rcu_read_lock(); + for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) - *this_cpu_ptr(ca->cpuusage) += cputime; + this_cpu_ptr(ca->cpuusage)->usages[index] += cputime; + rcu_read_unlock(); } diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5a75b08cfd85..5be58820465c 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -103,10 +103,10 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, const struct sched_dl_entity *dl_se = &p->dl; if (later_mask && - cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { + cpumask_and(later_mask, cp->free_cpus, tsk_cpus_allowed(p))) { best_cpu = cpumask_any(later_mask); goto out; - } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && + } else if (cpumask_test_cpu(cpudl_maximum(cp), tsk_cpus_allowed(p)) && dl_time_before(dl_se->deadline, cp->elements[0].dl)) { 
best_cpu = cpudl_maximum(cp); if (later_mask) diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 981fcd7dc394..11e9705bf937 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -103,11 +103,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, if (skip) continue; - if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) + if (cpumask_any_and(tsk_cpus_allowed(p), vec->mask) >= nr_cpu_ids) continue; if (lowest_mask) { - cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); + cpumask_and(lowest_mask, tsk_cpus_allowed(p), vec->mask); /* * We have to ensure that we have at least one bit diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 686ec8adf952..fcb7f0217ff4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -134,7 +134,7 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) dl_rq->dl_nr_migratory++; update_dl_migration(dl_rq); @@ -144,7 +144,7 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) dl_rq->dl_nr_migratory--; update_dl_migration(dl_rq); @@ -591,10 +591,10 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) struct sched_dl_entity, dl_timer); struct task_struct *p = dl_task_of(dl_se); - unsigned long flags; + struct rq_flags rf; struct rq *rq; - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); /* * The task might have changed its scheduling policy to something @@ -670,14 +670,14 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * Nothing relies on rq->lock after this, so its safe to drop * rq->lock. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, rf.cookie); push_dl_task(rq); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, rf.cookie); } #endif unlock: - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); /* * This can free the task_struct, including this hrtimer, do not touch @@ -717,10 +717,6 @@ static void update_curr_dl(struct rq *rq) if (!dl_task(curr) || !on_dl_rq(dl_se)) return; - /* Kick cpufreq (see the comment in linux/cpufreq.h). */ - if (cpu_of(rq) == smp_processor_id()) - cpufreq_trigger_update(rq_clock(rq)); - /* * Consumed budget is computed considering the time as * observed by schedulable tasks (excluding time spent @@ -736,6 +732,10 @@ static void update_curr_dl(struct rq *rq) return; } + /* kick cpufreq (see the comment in linux/cpufreq.h). */ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_trigger_update(rq_clock(rq)); + schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -966,7 +966,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) enqueue_dl_entity(&p->dl, pi_se, flags); - if (!task_current(rq, p) && p->nr_cpus_allowed > 1) + if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1) enqueue_pushable_dl_task(rq, p); } @@ -1040,9 +1040,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) * try to make it stay here, it might be important. 
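All of these call-site conversions trade direct field access for accessor helpers, giving later patches a single point at which to redirect the affinity fields. The helpers live in include/linux/sched.h and are not part of this diff; the sketch below only shows the shape the call sites assume, not the committed definitions:

/*
 * Illustrative only -- the committed definitions are in
 * include/linux/sched.h, outside the hunks shown here.
 */
#define tsk_cpus_allowed(tsk)		(&(tsk)->cpus_allowed)
#define tsk_nr_cpus_allowed(tsk)	((tsk)->nr_cpus_allowed)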
*/ if (unlikely(dl_task(curr)) && - (curr->nr_cpus_allowed < 2 || + (tsk_nr_cpus_allowed(curr) < 2 || !dl_entity_preempt(&p->dl, &curr->dl)) && - (p->nr_cpus_allowed > 1)) { + (tsk_nr_cpus_allowed(p) > 1)) { int target = find_later_rq(p); if (target != -1 && @@ -1063,7 +1063,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * Current can't be migrated, useless to reschedule, * let's hope p can move out. */ - if (rq->curr->nr_cpus_allowed == 1 || + if (tsk_nr_cpus_allowed(rq->curr) == 1 || cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1) return; @@ -1071,7 +1071,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * p is migratable, so let's not schedule it and * see if it is pushed or pulled somewhere else. */ - if (p->nr_cpus_allowed != 1 && + if (tsk_nr_cpus_allowed(p) != 1 && cpudl_find(&rq->rd->cpudl, p, NULL) != -1) return; @@ -1125,7 +1125,8 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, return rb_entry(left, struct sched_dl_entity, rb_node); } -struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) +struct task_struct * +pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { struct sched_dl_entity *dl_se; struct task_struct *p; @@ -1140,9 +1141,9 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) * disabled avoiding further scheduler activity on it and we're * being very careful to re-start the picking loop. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); pull_dl_task(rq); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, cookie); /* * pull_rt_task() can drop (and re-acquire) rq->lock; this * means a stop task can slip in, in which case we need to @@ -1185,7 +1186,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p) { update_curr_dl(rq); - if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) + if (on_dl_rq(&p->dl) && tsk_nr_cpus_allowed(p) > 1) enqueue_pushable_dl_task(rq, p); } @@ -1286,7 +1287,7 @@ static int find_later_rq(struct task_struct *task) if (unlikely(!later_mask)) return -1; - if (task->nr_cpus_allowed == 1) + if (tsk_nr_cpus_allowed(task) == 1) return -1; /* @@ -1392,7 +1393,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) if (double_lock_balance(rq, later_rq)) { if (unlikely(task_rq(task) != rq || !cpumask_test_cpu(later_rq->cpu, - &task->cpus_allowed) || + tsk_cpus_allowed(task)) || task_running(rq, task) || !dl_task(task) || !task_on_rq_queued(task))) { @@ -1432,7 +1433,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); + BUG_ON(tsk_nr_cpus_allowed(p) <= 1); BUG_ON(!task_on_rq_queued(p)); BUG_ON(!dl_task(p)); @@ -1471,7 +1472,7 @@ retry: */ if (dl_task(rq->curr) && dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && - rq->curr->nr_cpus_allowed > 1) { + tsk_nr_cpus_allowed(rq->curr) > 1) { resched_curr(rq); return 0; } @@ -1618,9 +1619,9 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - p->nr_cpus_allowed > 1 && + tsk_nr_cpus_allowed(p) > 1 && dl_task(rq->curr) && - (rq->curr->nr_cpus_allowed < 2 || + (tsk_nr_cpus_allowed(rq->curr) < 2 || !dl_entity_preempt(&p->dl, &rq->curr->dl))) { push_dl_tasks(rq); } @@ -1724,7 +1725,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) if 
(task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) + if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) queue_push_tasks(rq); #else if (dl_task(rq->curr)) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4fbc3bd5ff60..cf905f655ba1 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -626,15 +626,16 @@ do { \ #undef P #undef PN -#ifdef CONFIG_SCHEDSTATS -#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); -#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); - #ifdef CONFIG_SMP +#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); P64(avg_idle); P64(max_idle_balance_cost); +#undef P64 #endif +#ifdef CONFIG_SCHEDSTATS +#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); + if (schedstat_enabled()) { P(yld_count); P(sched_count); @@ -644,7 +645,6 @@ do { \ } #undef P -#undef P64 #endif spin_lock_irqsave(&sched_debug_lock, flags); print_cfs_stats(m, cpu); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 40748dc8ea3e..218f8e83db73 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -204,7 +204,7 @@ static void __update_inv_weight(struct load_weight *lw) * OR * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT * - * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case + * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case * we're guaranteed shift stays positive because inv_weight is guaranteed to * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22. * @@ -682,17 +682,68 @@ void init_entity_runnable_average(struct sched_entity *se) sa->period_contrib = 1023; sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; - sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); - sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + /* + * At this point, util_avg won't be used in select_task_rq_fair anyway + */ + sa->util_avg = 0; + sa->util_sum = 0; /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } +/* + * With new tasks being created, their initial util_avgs are extrapolated + * based on the cfs_rq's current util_avg: + * + * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight + * + * However, in many cases, the above util_avg does not give a desired + * value. Moreover, the sum of the util_avgs may be divergent, such + * as when the series is a harmonic series. + * + * To solve this problem, we also cap the util_avg of successive tasks to + * only 1/2 of the left utilization budget: + * + * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n + * + * where n denotes the nth task. + * + * For example, a simplest series from the beginning would be like: + * + * task util_avg: 512, 256, 128, 64, 32, 16, 8, ... + * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ... + * + * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap) + * if util_avg > util_avg_cap. 
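A runnable check of the series quoted above: with SCHED_CAPACITY_SCALE at 1024, granting each new task half of the remaining utilization budget reproduces the 512, 256, 128, ... caps and a cfs_rq sum that converges on 1024 rather than diverging:

#include <stdio.h>

int main(void)
{
	long cfs_util = 0, cap;

	for (int n = 1; n <= 7; n++) {
		cap = (1024 - cfs_util) / 2;	/* half the leftover budget */
		cfs_util += cap;		/* nth task joins with 'cap' */
		printf("task %d: util_avg=%ld, cfs_rq util_avg=%ld\n",
		       n, cap, cfs_util);
	}
	return 0;
}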
+ */ +void post_init_entity_util_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + struct sched_avg *sa = &se->avg; + long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; + + if (cap > 0) { + if (cfs_rq->avg.util_avg != 0) { + sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; + sa->util_avg /= (cfs_rq->avg.load_avg + 1); + + if (sa->util_avg > cap) + sa->util_avg = cap; + } else { + sa->util_avg = cap; + } + sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + } +} + static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); #else void init_entity_runnable_average(struct sched_entity *se) { } +void post_init_entity_util_avg(struct sched_entity *se) +{ +} #endif /* @@ -2437,10 +2488,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); +#ifdef CONFIG_SMP if (entity_is_task(se)) { account_numa_dequeue(rq_of(cfs_rq), task_of(se)); list_del_init(&se->group_node); } +#endif cfs_rq->nr_running--; } @@ -2550,6 +2603,16 @@ static const u32 runnable_avg_yN_sum[] = { }; /* + * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to + * lower integers. See Documentation/scheduler/sched-avg.txt how these + * were generated: + */ +static const u32 __accumulated_sum_N32[] = { + 0, 23371, 35056, 40899, 43820, 45281, + 46011, 46376, 46559, 46650, 46696, 46719, +}; + +/* * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) */ @@ -2597,22 +2660,13 @@ static u32 __compute_runnable_contrib(u64 n) else if (unlikely(n >= LOAD_AVG_MAX_N)) return LOAD_AVG_MAX; - /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */ - do { - contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */ - contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD]; - - n -= LOAD_AVG_PERIOD; - } while (n > LOAD_AVG_PERIOD); - + /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */ + contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD]; + n %= LOAD_AVG_PERIOD; contrib = decay_load(contrib, n); return contrib + runnable_avg_yN_sum[n]; } -#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10 -#error "load tracking assumes 2^10 as unit" -#endif - #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) /* @@ -2821,23 +2875,54 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); +static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); + + if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) { + unsigned long max = rq->cpu_capacity_orig; + + /* + * There are a few boundary cases this might miss but it should + * get called often enough that that should (hopefully) not be + * a real problem -- added to that it only calls on the local + * CPU, so if we enqueue remotely we'll miss an update, but + * the next tick/schedule should update. + * + * It will not get called when we go idle, because the idle + * thread is a different class (!fair), nor will the utilization + * number include things like RT tasks. + * + * As is, the util number is not freq-invariant (we'd have to + * implement arch_scale_freq_capacity() for that). + * + * See cpu_util(). 
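The __accumulated_sum_N32[] table added above can be regenerated in a few lines: because y^32 == 1/2, every additional block of 32 periods halves the running sum and adds the sum of one full period, runnable_avg_yN_sum[LOAD_AVG_PERIOD] == 23371 (see Documentation/scheduler/sched-avg.txt):

#include <stdio.h>

int main(void)
{
	unsigned int sum = 0;

	for (int i = 0; i < 12; i++) {
		printf("%u,%c", sum, i % 6 == 5 ? '\n' : ' ');
		sum = sum / 2 + 23371;	/* y^32 = 1/2 per 32-period block */
	}
	return 0;
}

The output matches the table entry for entry: 0, 23371, 35056, 40899, 43820, 45281, 46011, 46376, 46559, 46650, 46696, 46719.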
+ */ + cpufreq_update_util(rq_clock(rq), + min(cfs_rq->avg.util_avg, max), max); + } +} + /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ -static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) { struct sched_avg *sa = &cfs_rq->avg; - int decayed, removed = 0; + int decayed, removed_load = 0, removed_util = 0; if (atomic_long_read(&cfs_rq->removed_load_avg)) { s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); sa->load_avg = max_t(long, sa->load_avg - r, 0); sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); - removed = 1; + removed_load = 1; } if (atomic_long_read(&cfs_rq->removed_util_avg)) { long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); sa->util_avg = max_t(long, sa->util_avg - r, 0); sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0); + removed_util = 1; } decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, @@ -2848,7 +2933,10 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif - return decayed || removed; + if (update_freq && (decayed || removed_util)) + cfs_rq_util_change(cfs_rq); + + return decayed || removed_load; } /* Update task and its cfs_rq load average */ @@ -2867,31 +2955,8 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); - if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) + if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg) update_tg_load_avg(cfs_rq, 0); - - if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) { - unsigned long max = rq->cpu_capacity_orig; - - /* - * There are a few boundary cases this might miss but it should - * get called often enough that that should (hopefully) not be - * a real problem -- added to that it only calls on the local - * CPU, so if we enqueue remotely we'll miss an update, but - * the next tick/schedule should update. - * - * It will not get called when we go idle, because the idle - * thread is a different class (!fair), nor will the utilization - * number include things like RT tasks. - * - * As is, the util number is not freq-invariant (we'd have to - * implement arch_scale_freq_capacity() for that). - * - * See cpu_util(). 
- */ - cpufreq_update_util(rq_clock(rq), - min(cfs_rq->avg.util_avg, max), max); - } } static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -2919,6 +2984,8 @@ skip_aging: cfs_rq->avg.load_sum += se->avg.load_sum; cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; + + cfs_rq_util_change(cfs_rq); } static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -2931,6 +2998,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); + + cfs_rq_util_change(cfs_rq); } /* Add the load generated by se into cfs_rq's load average */ @@ -2948,7 +3017,7 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->curr == se, NULL); } - decayed = update_cfs_rq_load_avg(now, cfs_rq); + decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated); cfs_rq->runnable_load_avg += sa->load_avg; cfs_rq->runnable_load_sum += sa->load_sum; @@ -3185,10 +3254,41 @@ static inline void check_schedstat_required(void) #endif } + +/* + * MIGRATION + * + * dequeue + * update_curr() + * update_min_vruntime() + * vruntime -= min_vruntime + * + * enqueue + * update_curr() + * update_min_vruntime() + * vruntime += min_vruntime + * + * this way the vruntime transition between RQs is done when both + * min_vruntime are up-to-date. + * + * WAKEUP (remote) + * + * ->migrate_task_rq_fair() (p->state == TASK_WAKING) + * vruntime -= min_vruntime + * + * enqueue + * update_curr() + * update_min_vruntime() + * vruntime += min_vruntime + * + * this way we don't have the most up-to-date min_vruntime on the originating + * CPU and an up-to-date min_vruntime on the destination CPU. + */ + static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING); + bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED); bool curr = cfs_rq->curr == se; /* @@ -3202,7 +3302,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* * Otherwise, renormalise after, such that we're placed at the current - * moment in time, instead of some random moment in the past. + * moment in time, instead of some random moment in the past. Being + * placed in the past could significantly boost this task to the + * fairness detriment of existing tasks. */ if (renorm && !curr) se->vruntime += cfs_rq->min_vruntime; @@ -4430,7 +4532,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP - +#ifdef CONFIG_NO_HZ_COMMON /* * per rq 'load' arrray crap; XXX kill this. */ @@ -4496,13 +4598,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) } return load; } +#endif /* CONFIG_NO_HZ_COMMON */ /** - * __update_cpu_load - update the rq->cpu_load[] statistics + * __cpu_load_update - update the rq->cpu_load[] statistics * @this_rq: The rq to update statistics for * @this_load: The current load * @pending_updates: The number of missed updates - * @active: !0 for NOHZ_FULL * * Update rq->cpu_load[] statistics. This function is usually called every * scheduler tick (TICK_NSEC). 
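The MIGRATION comment above reduces to keeping vruntime relative across the move: subtract the source runqueue's min_vruntime when leaving, add the destination's when arriving. A runnable toy with invented clock values:

#include <stdio.h>

int main(void)
{
	unsigned long long vruntime = 1000500;	/* task, queued on rq A */
	unsigned long long min_a = 1000000;	/* rq A's min_vruntime */
	unsigned long long min_b = 5000000;	/* rq B's min_vruntime */

	vruntime -= min_a;	/* dequeue from A: keep only the lag */
	printf("relative lag: %llu\n", vruntime);	/* 500 */

	vruntime += min_b;	/* enqueue on B: rebase on B's clock */
	printf("vruntime on B: %llu\n", vruntime);	/* 5000500 */
	return 0;
}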
@@ -4531,12 +4633,12 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) * load[i]_n = (1 - 1/2^i)^n * load[i]_0 * * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra - * term. See the @active paramter. + * term. */ -static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, - unsigned long pending_updates, int active) +static void cpu_load_update(struct rq *this_rq, unsigned long this_load, + unsigned long pending_updates) { - unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0; + unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0]; int i, scale; this_rq->nr_load_updates++; @@ -4549,6 +4651,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, /* scale is effectively 1 << i now, and >> i divides by scale */ old_load = this_rq->cpu_load[i]; +#ifdef CONFIG_NO_HZ_COMMON old_load = decay_load_missed(old_load, pending_updates - 1, i); if (tickless_load) { old_load -= decay_load_missed(tickless_load, pending_updates - 1, i); @@ -4559,6 +4662,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, */ old_load += tickless_load; } +#endif new_load = this_load; /* * Round up the averaging division if load is increasing. This @@ -4581,10 +4685,23 @@ static unsigned long weighted_cpuload(const int cpu) } #ifdef CONFIG_NO_HZ_COMMON -static void __update_cpu_load_nohz(struct rq *this_rq, - unsigned long curr_jiffies, - unsigned long load, - int active) +/* + * There is no sane way to deal with nohz on smp when using jiffies because the + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. + * + * Therefore we need to avoid the delta approach from the regular tick when + * possible since that would seriously skew the load calculation. This is why we + * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on + * jiffies deltas for updates happening while in nohz mode (idle ticks, idle + * loop exit, nohz_idle_balance, nohz full exit...) + * + * This means we might still be one tick off for nohz periods. + */ + +static void cpu_load_update_nohz(struct rq *this_rq, + unsigned long curr_jiffies, + unsigned long load) { unsigned long pending_updates; @@ -4596,28 +4713,15 @@ static void __update_cpu_load_nohz(struct rq *this_rq, * In the NOHZ_FULL case, we were non-idle, we should consider * its weighted load. */ - __update_cpu_load(this_rq, load, pending_updates, active); + cpu_load_update(this_rq, load, pending_updates); } } /* - * There is no sane way to deal with nohz on smp when using jiffies because the - * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading - * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. - * - * Therefore we cannot use the delta approach from the regular tick since that - * would seriously skew the load calculation. However we'll make do for those - * updates happening while idle (nohz_idle_balance) or coming out of idle - * (tick_nohz_idle_exit). - * - * This means we might still be one tick off for nohz periods. - */ - -/* * Called from nohz_idle_balance() to update the load ratings before doing the * idle balance. */ -static void update_cpu_load_idle(struct rq *this_rq) +static void cpu_load_update_idle(struct rq *this_rq) { /* * bail if there's load or we're actually up-to-date. 
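For the missed-update rule documented above, load[i]_n = (1 - 1/2^i)^n * load[i]_0, a floating-point illustration of how higher cpu_load[] indexes decay more slowly; the kernel itself does this with fixed-point lookup tables rather than pow():

#include <stdio.h>
#include <math.h>

int main(void)
{
	double load0 = 1024.0;

	for (int i = 1; i <= 4; i++)		/* cpu_load[] index */
		for (int n = 1; n <= 3; n++)	/* missed updates */
			printf("load[%d] after %d missed ticks: %.1f\n",
			       i, n, load0 * pow(1.0 - 1.0 / (1 << i), n));
	return 0;
}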
@@ -4625,38 +4729,71 @@ static void update_cpu_load_idle(struct rq *this_rq) if (weighted_cpuload(cpu_of(this_rq))) return; - __update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0); + cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0); } /* - * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. + * Record CPU load on nohz entry so we know the tickless load to account + * on nohz exit. cpu_load[0] happens then to be updated more frequently + * than other cpu_load[idx] but it should be fine as cpu_load readers + * shouldn't rely into synchronized cpu_load[*] updates. */ -void update_cpu_load_nohz(int active) +void cpu_load_update_nohz_start(void) { struct rq *this_rq = this_rq(); + + /* + * This is all lockless but should be fine. If weighted_cpuload changes + * concurrently we'll exit nohz. And cpu_load write can race with + * cpu_load_update_idle() but both updater would be writing the same. + */ + this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq)); +} + +/* + * Account the tickless load in the end of a nohz frame. + */ +void cpu_load_update_nohz_stop(void) +{ unsigned long curr_jiffies = READ_ONCE(jiffies); - unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0; + struct rq *this_rq = this_rq(); + unsigned long load; if (curr_jiffies == this_rq->last_load_update_tick) return; + load = weighted_cpuload(cpu_of(this_rq)); raw_spin_lock(&this_rq->lock); - __update_cpu_load_nohz(this_rq, curr_jiffies, load, active); + update_rq_clock(this_rq); + cpu_load_update_nohz(this_rq, curr_jiffies, load); raw_spin_unlock(&this_rq->lock); } -#endif /* CONFIG_NO_HZ */ +#else /* !CONFIG_NO_HZ_COMMON */ +static inline void cpu_load_update_nohz(struct rq *this_rq, + unsigned long curr_jiffies, + unsigned long load) { } +#endif /* CONFIG_NO_HZ_COMMON */ + +static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load) +{ +#ifdef CONFIG_NO_HZ_COMMON + /* See the mess around cpu_load_update_nohz(). */ + this_rq->last_load_update_tick = READ_ONCE(jiffies); +#endif + cpu_load_update(this_rq, load, 1); +} /* * Called from scheduler_tick() */ -void update_cpu_load_active(struct rq *this_rq) +void cpu_load_update_active(struct rq *this_rq) { unsigned long load = weighted_cpuload(cpu_of(this_rq)); - /* - * See the mess around update_cpu_load_idle() / update_cpu_load_nohz(). - */ - this_rq->last_load_update_tick = jiffies; - __update_cpu_load(this_rq, load, 1, 1); + + if (tick_nohz_tick_stopped()) + cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load); + else + cpu_load_update_periodic(this_rq, load); } /* @@ -4714,46 +4851,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) return 0; } -static void record_wakee(struct task_struct *p) -{ - /* - * Rough decay (wiping) for cost saving, don't worry - * about the boundary, really active task won't care - * about the loss. 
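The new entry/exit pair added above brackets the tickless window; the callers are wired up in kernel/time/tick-sched.c later in this diff. A compilable call-order sketch, with stub declarations standing in for the real tick-sched plumbing:

void cpu_load_update_nohz_start(void);	/* stub declarations */
void cpu_load_update_nohz_stop(void);

static void tick_stop_path(void)
{
	/* tick_nohz_stop_sched_tick(): record the tickless baseline */
	cpu_load_update_nohz_start();
	/* ... tick stopped: CPU runs without scheduler ticks ... */
}

static void tick_restart_path(void)
{
	/* tick_nohz_restart_sched_tick(): fold missed ticks and load */
	cpu_load_update_nohz_stop();
}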
- */ - if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) { - current->wakee_flips >>= 1; - current->wakee_flip_decay_ts = jiffies; - } - - if (current->last_wakee != p) { - current->last_wakee = p; - current->wakee_flips++; - } -} - -static void task_waking_fair(struct task_struct *p) -{ - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 min_vruntime; - -#ifndef CONFIG_64BIT - u64 min_vruntime_copy; - - do { - min_vruntime_copy = cfs_rq->min_vruntime_copy; - smp_rmb(); - min_vruntime = cfs_rq->min_vruntime; - } while (min_vruntime != min_vruntime_copy); -#else - min_vruntime = cfs_rq->min_vruntime; -#endif - - se->vruntime -= min_vruntime; - record_wakee(p); -} - #ifdef CONFIG_FAIR_GROUP_SCHED /* * effective_load() calculates the load change as seen from the root_task_group @@ -4869,17 +4966,39 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif +static void record_wakee(struct task_struct *p) +{ + /* + * Only decay a single time; tasks that have less then 1 wakeup per + * jiffy will not have built up many flips. + */ + if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) { + current->wakee_flips >>= 1; + current->wakee_flip_decay_ts = jiffies; + } + + if (current->last_wakee != p) { + current->last_wakee = p; + current->wakee_flips++; + } +} + /* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. + * * A waker of many should wake a different task than the one last awakened - * at a frequency roughly N times higher than one of its wakees. In order - * to determine whether we should let the load spread vs consolodating to - * shared cache, we look for a minimum 'flip' frequency of llc_size in one - * partner, and a factor of lls_size higher frequency in the other. With - * both conditions met, we can be relatively sure that the relationship is - * non-monogamous, with partner count exceeding socket size. Waker/wakee - * being client/server, worker/dispatcher, interrupt source or whatever is - * irrelevant, spread criteria is apparent partner count exceeds socket size. + * at a frequency roughly N times higher than one of its wakees. + * + * In order to determine whether we should let the load spread vs consolidating + * to shared cache, we look for a minimum 'flip' frequency of llc_size in one + * partner, and a factor of lls_size higher frequency in the other. + * + * With both conditions met, we can be relatively sure that the relationship is + * non-monogamous, with partner count exceeding socket size. + * + * Waker/wakee being client/server, worker/dispatcher, interrupt source or + * whatever is irrelevant, spread criteria is apparent partner count exceeds + * socket size. */ static int wake_wide(struct task_struct *p) { @@ -5184,8 +5303,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int want_affine = 0; int sync = wake_flags & WF_SYNC; - if (sd_flag & SD_BALANCE_WAKE) + if (sd_flag & SD_BALANCE_WAKE) { + record_wakee(p); want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + } rcu_read_lock(); for_each_domain(cpu, tmp) { @@ -5265,6 +5386,32 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f static void migrate_task_rq_fair(struct task_struct *p) { /* + * As blocked tasks retain absolute vruntime the migration needs to + * deal with this by subtracting the old and adding the new + * min_vruntime -- the latter is done by enqueue_entity() when placing + * the task on the new runqueue. 
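The wake_wide() body falls outside the visible hunk, so as a reading aid only: the comment above plausibly reduces to the following test, where llc_size is the number of CPUs sharing the last-level cache. This is a sketch, not the committed implementation:

static int wake_wide_sketch(unsigned int waker_flips,
			    unsigned int wakee_flips,
			    unsigned int llc_size)
{
	unsigned int master = waker_flips, slave = wakee_flips;

	if (master < slave) {	/* order the pair, largest first */
		unsigned int tmp = master;

		master = slave;
		slave = tmp;
	}
	/* spread only when both flip rates clear the llc_size bars */
	return slave >= llc_size && master >= slave * llc_size;
}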
+ */ + if (p->state == TASK_WAKING) { + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 min_vruntime; + +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; + + do { + min_vruntime_copy = cfs_rq->min_vruntime_copy; + smp_rmb(); + min_vruntime = cfs_rq->min_vruntime; + } while (min_vruntime != min_vruntime_copy); +#else + min_vruntime = cfs_rq->min_vruntime; +#endif + + se->vruntime -= min_vruntime; + } + + /* * We are supposed to update the task to "current" time, then its up to date * and ready to go to new CPU/cfs_rq. But we have difficulty in getting * what current time is, so simply throw away the out-of-date time. This @@ -5447,7 +5594,7 @@ preempt: } static struct task_struct * -pick_next_task_fair(struct rq *rq, struct task_struct *prev) +pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; @@ -5560,9 +5707,9 @@ idle: * further scheduler activity on it and we're being very careful to * re-start the picking loop. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); new_tasks = idle_balance(rq); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, cookie); /* * Because idle_balance() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we @@ -5661,7 +5808,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * W_i,0 = \Sum_j w_i,j (2) * * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight - * is derived from the nice value as per prio_to_weight[]. + * is derived from the nice value as per sched_prio_to_weight[]. * * The weight average is an exponential decay average of the instantaneous * weight: @@ -6163,7 +6310,7 @@ static void update_blocked_averages(int cpu) if (throttled_hierarchy(cfs_rq)) continue; - if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) + if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) update_tg_load_avg(cfs_rq, 0); } raw_spin_unlock_irqrestore(&rq->lock, flags); @@ -6224,7 +6371,7 @@ static inline void update_blocked_averages(int cpu) raw_spin_lock_irqsave(&rq->lock, flags); update_rq_clock(rq); - update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); + update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -6633,6 +6780,9 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (!(env->sd->flags & SD_ASYM_PACKING)) return true; + /* No ASYM_PACKING if target cpu is already busy */ + if (env->idle == CPU_NOT_IDLE) + return true; /* * ASYM_PACKING needs to move all the work to the lowest * numbered CPUs in the group, therefore mark all groups @@ -6642,7 +6792,8 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (!sds->busiest) return true; - if (group_first_cpu(sds->busiest) > group_first_cpu(sg)) + /* Prefer to move from highest possible cpu's work */ + if (group_first_cpu(sds->busiest) < group_first_cpu(sg)) return true; } @@ -6788,6 +6939,9 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) if (!(env->sd->flags & SD_ASYM_PACKING)) return 0; + if (env->idle == CPU_NOT_IDLE) + return 0; + if (!sds->busiest) return 0; @@ -6896,9 +7050,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s } /* - * In the presence of smp nice balancing, certain scenarios can have - * max load less than avg load(as we skip the groups at or below - * its 
cpu_capacity, while calculating max_load..) + * Avg load of busiest sg can be less and avg load of local sg can + * be greater than avg load across all sgs of sd because avg load + * factors in sg capacity and sgs with smaller group_type are + * skipped when updating the busiest sg: */ if (busiest->avg_load <= sds->avg_load || local->avg_load >= sds->avg_load) { @@ -6911,11 +7066,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ if (busiest->group_type == group_overloaded && local->group_type == group_overloaded) { - load_above_capacity = busiest->sum_nr_running * - SCHED_LOAD_SCALE; - if (load_above_capacity > busiest->group_capacity) + load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; + if (load_above_capacity > busiest->group_capacity) { load_above_capacity -= busiest->group_capacity; - else + load_above_capacity *= NICE_0_LOAD; + load_above_capacity /= busiest->group_capacity; + } else load_above_capacity = ~0UL; } @@ -6923,9 +7079,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * We're trying to get all the cpus to the average_load, so we don't * want to push ourselves above the average load, nor do we wish to * reduce the max loaded cpu below the average load. At the same time, - * we also don't want to reduce the group load below the group capacity - * (so that we can implement power-savings policies etc). Thus we look - * for the minimum possible imbalance. + * we also don't want to reduce the group load below the group + * capacity. Thus we look for the minimum possible imbalance. */ max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity); @@ -6949,10 +7104,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /** * find_busiest_group - Returns the busiest group within the sched_domain - * if there is an imbalance. If there isn't an imbalance, and - * the user has opted for power-savings, it returns a group whose - * CPUs can be put to idle by rebalancing those tasks elsewhere, if - * such a group exists. + * if there is an imbalance. * * Also calculates the amount of weighted load which should be moved * to restore balance. @@ -6960,9 +7112,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * @env: The load balancing environment. * * Return: - The busiest group if imbalance exists. - * - If no imbalance and user has opted for power-savings balance, - * return the least loaded group whose CPUs can be - * put to idle by rebalancing its tasks onto our group. */ static struct sched_group *find_busiest_group(struct lb_env *env) { @@ -6980,8 +7129,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) busiest = &sds.busiest_stat; /* ASYM feature bypasses nice load balance check */ - if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && - check_asym_packing(env, &sds)) + if (check_asym_packing(env, &sds)) return sds.busiest; /* There is no busy sibling group to pull tasks from */ @@ -7406,10 +7554,7 @@ more_balance: &busiest->active_balance_work); } - /* - * We've kicked active balancing, reset the failure - * counter. - */ + /* We've kicked active balancing, force task migration. */ sd->nr_balance_failed = sd->cache_nice_tries+1; } } else @@ -7644,10 +7789,13 @@ static int active_load_balance_cpu_stop(void *data) schedstat_inc(sd, alb_count); p = detach_one_task(&env); - if (p) + if (p) { schedstat_inc(sd, alb_pushed); - else + /* Active balancing done, reset the failure counter. 
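Worked numbers for the reworked load_above_capacity computation above: the running count is first expressed in capacity units, the group's capacity is subtracted, and the excess is converted to load units by the new multiply/divide pair. NICE_0_LOAD here uses the 64-bit value of 1 << 20 from the kernel/sched/sched.h hunk later in this diff:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024UL
#define NICE_0_LOAD		(1UL << 20)	/* 64-bit build */

int main(void)
{
	unsigned long sum_nr_running = 3, group_capacity = 2048;
	unsigned long load_above_capacity;

	load_above_capacity = sum_nr_running * SCHED_CAPACITY_SCALE;
	if (load_above_capacity > group_capacity) {
		load_above_capacity -= group_capacity;	/* 1024 cap units */
		load_above_capacity *= NICE_0_LOAD;
		load_above_capacity /= group_capacity;	/* -> load units */
	} else {
		load_above_capacity = ~0UL;
	}
	/* prints 524288, i.e. NICE_0_LOAD / 2 */
	printf("load_above_capacity = %lu\n", load_above_capacity);
	return 0;
}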
*/ + sd->nr_balance_failed = 0; + } else { schedstat_inc(sd, alb_failed); + } } rcu_read_unlock(); out_unlock: @@ -7718,7 +7866,7 @@ static void nohz_balancer_kick(void) return; } -static inline void nohz_balance_exit_idle(int cpu) +void nohz_balance_exit_idle(unsigned int cpu) { if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { /* @@ -7791,18 +7939,6 @@ void nohz_balance_enter_idle(int cpu) atomic_inc(&nohz.nr_cpus); set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } - -static int sched_ilb_notifier(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DYING: - nohz_balance_exit_idle(smp_processor_id()); - return NOTIFY_OK; - default: - return NOTIFY_DONE; - } -} #endif static DEFINE_SPINLOCK(balancing); @@ -7964,7 +8100,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) if (time_after_eq(jiffies, rq->next_balance)) { raw_spin_lock_irq(&rq->lock); update_rq_clock(rq); - update_cpu_load_idle(rq); + cpu_load_update_idle(rq); raw_spin_unlock_irq(&rq->lock); rebalance_domains(rq, CPU_IDLE); } @@ -8389,6 +8525,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); init_entity_runnable_average(se); + post_init_entity_util_avg(se); } return 1; @@ -8545,7 +8682,6 @@ const struct sched_class fair_sched_class = { .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, - .task_waking = task_waking_fair, .task_dead = task_dead_fair, .set_cpus_allowed = set_cpus_allowed_common, #endif @@ -8607,7 +8743,6 @@ __init void init_sched_fair_class(void) #ifdef CONFIG_NO_HZ_COMMON nohz.next_balance = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); - cpu_notifier(sched_ilb_notifier, 0); #endif #endif /* SMP */ diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 47ce94931f1b..2ce5458bbe1d 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -24,7 +24,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl } static struct task_struct * -pick_next_task_idle(struct rq *rq, struct task_struct *prev) +pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { put_prev_task(rq, prev); diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index ef7159012cf3..b0b93fd33af9 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -99,10 +99,13 @@ long calc_load_fold_active(struct rq *this_rq) static unsigned long calc_load(unsigned long load, unsigned long exp, unsigned long active) { - load *= exp; - load += active * (FIXED_1 - exp); - load += 1UL << (FSHIFT - 1); - return load >> FSHIFT; + unsigned long newload; + + newload = load * exp + active * (FIXED_1 - exp); + if (active >= load) + newload += FIXED_1-1; + + return newload / FIXED_1; } #ifdef CONFIG_NO_HZ_COMMON diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ec4f538d4396..d5690b722691 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -334,7 +334,7 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total++; - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) rt_rq->rt_nr_migratory++; update_rt_migration(rt_rq); @@ -351,7 +351,7 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total--; - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) 
rt_rq->rt_nr_migratory--; update_rt_migration(rt_rq); @@ -953,14 +953,14 @@ static void update_curr_rt(struct rq *rq) if (curr->sched_class != &rt_sched_class) return; - /* Kick cpufreq (see the comment in linux/cpufreq.h). */ - if (cpu_of(rq) == smp_processor_id()) - cpufreq_trigger_update(rq_clock(rq)); - delta_exec = rq_clock_task(rq) - curr->se.exec_start; if (unlikely((s64)delta_exec <= 0)) return; + /* Kick cpufreq (see the comment in linux/cpufreq.h). */ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_trigger_update(rq_clock(rq)); + schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -1324,7 +1324,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) enqueue_rt_entity(rt_se, flags); - if (!task_current(rq, p) && p->nr_cpus_allowed > 1) + if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1) enqueue_pushable_task(rq, p); } @@ -1413,7 +1413,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) * will have to sort it out. */ if (curr && unlikely(rt_task(curr)) && - (curr->nr_cpus_allowed < 2 || + (tsk_nr_cpus_allowed(curr) < 2 || curr->prio <= p->prio)) { int target = find_lowest_rq(p); @@ -1437,7 +1437,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * Current can't be migrated, useless to reschedule, * let's hope p can move out. */ - if (rq->curr->nr_cpus_allowed == 1 || + if (tsk_nr_cpus_allowed(rq->curr) == 1 || !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) return; @@ -1445,7 +1445,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * p is migratable, so let's not schedule it and * see if it is pushed or pulled somewhere else. */ - if (p->nr_cpus_allowed != 1 + if (tsk_nr_cpus_allowed(p) != 1 && cpupri_find(&rq->rd->cpupri, p, NULL)) return; @@ -1524,7 +1524,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) } static struct task_struct * -pick_next_task_rt(struct rq *rq, struct task_struct *prev) +pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { struct task_struct *p; struct rt_rq *rt_rq = &rq->rt; @@ -1536,9 +1536,9 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) * disabled avoiding further scheduler activity on it and we're * being very careful to re-start the picking loop. 
*/ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); pull_rt_task(rq); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, cookie); /* * pull_rt_task() can drop (and re-acquire) rq->lock; this * means a dl or stop task can slip in, in which case we need @@ -1579,7 +1579,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) * The previous task needs to be made eligible for pushing * if it is still active */ - if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) + if (on_rt_rq(&p->rt) && tsk_nr_cpus_allowed(p) > 1) enqueue_pushable_task(rq, p); } @@ -1629,7 +1629,7 @@ static int find_lowest_rq(struct task_struct *task) if (unlikely(!lowest_mask)) return -1; - if (task->nr_cpus_allowed == 1) + if (tsk_nr_cpus_allowed(task) == 1) return -1; /* No other targets possible */ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) @@ -1762,7 +1762,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); + BUG_ON(tsk_nr_cpus_allowed(p) <= 1); BUG_ON(!task_on_rq_queued(p)); BUG_ON(!rt_task(p)); @@ -2122,9 +2122,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - p->nr_cpus_allowed > 1 && + tsk_nr_cpus_allowed(p) > 1 && (dl_task(rq->curr) || rt_task(rq->curr)) && - (rq->curr->nr_cpus_allowed < 2 || + (tsk_nr_cpus_allowed(rq->curr) < 2 || rq->curr->prio <= p->prio)) push_rt_tasks(rq); } @@ -2197,7 +2197,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) */ if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) + if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded) queue_push_tasks(rq); #else if (p->prio < rq->curr->prio) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ec2e8d23527e..e51145e76807 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -31,9 +31,9 @@ extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq); #ifdef CONFIG_SMP -extern void update_cpu_load_active(struct rq *this_rq); +extern void cpu_load_update_active(struct rq *this_rq); #else -static inline void update_cpu_load_active(struct rq *this_rq) { } +static inline void cpu_load_update_active(struct rq *this_rq) { } #endif /* @@ -49,25 +49,32 @@ static inline void update_cpu_load_active(struct rq *this_rq) { } * and does not change the user-interface for setting shares/weights. * * We increase resolution only if we have enough bits to allow this increased - * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution - * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the - * increased costs. + * resolution (i.e. 64bit). The costs for increasing resolution when 32bit are + * pretty high and the returns do not justify the increased costs. + * + * Really only required when CONFIG_FAIR_GROUP_SCHED is also set, but to + * increase coverage and consistency always enable it on 64bit platforms. 
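The calibration invariant spelled out just below -- scale_load() applied to the nice-0 weight must equal NICE_0_LOAD -- can be checked directly for the 64-bit case (sched_prio_to_weight[20], the nice-0 entry, is 1024):

#include <stdio.h>

#define SCHED_FIXEDPOINT_SHIFT	10
#define NICE_0_LOAD_SHIFT	(SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
#define scale_load(w)		((w) << SCHED_FIXEDPOINT_SHIFT)
#define NICE_0_LOAD		(1L << NICE_0_LOAD_SHIFT)

int main(void)
{
	long nice_0_weight = 1024;	/* sched_prio_to_weight[20] */

	printf("scale_load(%ld) = %ld, NICE_0_LOAD = %ld -> %s\n",
	       nice_0_weight, scale_load(nice_0_weight), NICE_0_LOAD,
	       scale_load(nice_0_weight) == NICE_0_LOAD ? "calibrated"
							: "broken");
	return 0;
}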
*/ -#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */ -# define SCHED_LOAD_RESOLUTION 10 -# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION) -# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION) +#ifdef CONFIG_64BIT +# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) +# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT) +# define scale_load_down(w) ((w) >> SCHED_FIXEDPOINT_SHIFT) #else -# define SCHED_LOAD_RESOLUTION 0 +# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT) # define scale_load(w) (w) # define scale_load_down(w) (w) #endif -#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION) -#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) - -#define NICE_0_LOAD SCHED_LOAD_SCALE -#define NICE_0_SHIFT SCHED_LOAD_SHIFT +/* + * Task weight (visible to users) and its load (invisible to users) have + * independent resolution, but they should be well calibrated. We use + * scale_load() and scale_load_down(w) to convert between them. The + * following must be true: + * + * scale_load(sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]) == NICE_0_LOAD + * + */ +#define NICE_0_LOAD (1L << NICE_0_LOAD_SHIFT) /* * Single value that decides SCHED_DEADLINE internal math precision. @@ -585,11 +592,13 @@ struct rq { #endif #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; - unsigned long last_load_update_tick; #ifdef CONFIG_NO_HZ_COMMON +#ifdef CONFIG_SMP + unsigned long last_load_update_tick; +#endif /* CONFIG_SMP */ u64 nohz_stamp; unsigned long nohz_flags; -#endif +#endif /* CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL unsigned long last_sched_tick; #endif @@ -854,7 +863,7 @@ DECLARE_PER_CPU(struct sched_domain *, sd_asym); struct sched_group_capacity { atomic_t ref; /* - * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity + * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity * for a single CPU. */ unsigned int capacity; @@ -1159,7 +1168,7 @@ extern const u32 sched_prio_to_wmult[40]; * * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) - * ENQUEUE_WAKING - sched_class::task_waking was called + * ENQUEUE_MIGRATED - the task was migrated during wakeup * */ @@ -1174,9 +1183,9 @@ extern const u32 sched_prio_to_wmult[40]; #define ENQUEUE_HEAD 0x08 #define ENQUEUE_REPLENISH 0x10 #ifdef CONFIG_SMP -#define ENQUEUE_WAKING 0x20 +#define ENQUEUE_MIGRATED 0x20 #else -#define ENQUEUE_WAKING 0x00 +#define ENQUEUE_MIGRATED 0x00 #endif #define RETRY_TASK ((void *)-1UL) @@ -1200,14 +1209,14 @@ struct sched_class { * tasks. 
*/ struct task_struct * (*pick_next_task) (struct rq *rq, - struct task_struct *prev); + struct task_struct *prev, + struct pin_cookie cookie); void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); void (*migrate_task_rq)(struct task_struct *p); - void (*task_waking) (struct task_struct *task); void (*task_woken) (struct rq *this_rq, struct task_struct *task); void (*set_cpus_allowed)(struct task_struct *p, @@ -1313,6 +1322,7 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); +extern void post_init_entity_util_avg(struct sched_entity *se); #ifdef CONFIG_NO_HZ_FULL extern bool sched_can_stop_tick(struct rq *rq); @@ -1448,86 +1458,32 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } static inline void sched_avg_update(struct rq *rq) { } #endif -/* - * __task_rq_lock - lock the rq @p resides on. - */ -static inline struct rq *__task_rq_lock(struct task_struct *p) - __acquires(rq->lock) -{ - struct rq *rq; - - lockdep_assert_held(&p->pi_lock); - - for (;;) { - rq = task_rq(p); - raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { - lockdep_pin_lock(&rq->lock); - return rq; - } - raw_spin_unlock(&rq->lock); - - while (unlikely(task_on_rq_migrating(p))) - cpu_relax(); - } -} +struct rq_flags { + unsigned long flags; + struct pin_cookie cookie; +}; -/* - * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. - */ -static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(rq->lock); +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(p->pi_lock) - __acquires(rq->lock) -{ - struct rq *rq; - - for (;;) { - raw_spin_lock_irqsave(&p->pi_lock, *flags); - rq = task_rq(p); - raw_spin_lock(&rq->lock); - /* - * move_queued_task() task_rq_lock() - * - * ACQUIRE (rq->lock) - * [S] ->on_rq = MIGRATING [L] rq = task_rq() - * WMB (__set_task_cpu()) ACQUIRE (rq->lock); - * [S] ->cpu = new_cpu [L] task_rq() - * [L] ->on_rq - * RELEASE (rq->lock) - * - * If we observe the old cpu in task_rq_lock, the acquire of - * the old rq->lock will fully serialize against the stores. - * - * If we observe the new cpu in task_rq_lock, the acquire will - * pair with the WMB to ensure we must then also see migrating. 
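After the conversion, the locking pattern at every task_rq_lock() call site looks like the dl_task_timer() hunk earlier in this diff: the lockdep pin cookie travels inside struct rq_flags together with the saved IRQ flags, and is handed back on unlock. In kernel context the shape is:

static void example_task_rq_lock_use(struct task_struct *p)
{
	struct rq_flags rf;
	struct rq *rq;

	rq = task_rq_lock(p, &rf);
	/*
	 * rq->lock held and pinned here; rf.cookie proves the pin and
	 * is what lockdep_unpin_lock()/lockdep_repin_lock() need around
	 * any callee that may drop the lock.
	 */
	task_rq_unlock(rq, p, &rf);
}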
- */ - if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { - lockdep_pin_lock(&rq->lock); - return rq; - } - raw_spin_unlock(&rq->lock); - raw_spin_unlock_irqrestore(&p->pi_lock, *flags); - - while (unlikely(task_on_rq_migrating(p))) - cpu_relax(); - } -} + __acquires(rq->lock); -static inline void __task_rq_unlock(struct rq *rq) +static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, rf->cookie); raw_spin_unlock(&rq->lock); } static inline void -task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) +task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) __releases(rq->lock) __releases(p->pi_lock) { - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, rf->cookie); raw_spin_unlock(&rq->lock); - raw_spin_unlock_irqrestore(&p->pi_lock, *flags); + raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); } #ifdef CONFIG_SMP @@ -1743,6 +1699,10 @@ enum rq_nohz_flag_bits { }; #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) + +extern void nohz_balance_exit_idle(unsigned int cpu); +#else +static inline void nohz_balance_exit_idle(unsigned int cpu) { } #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index cbc67da10954..604297a08b3a 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -24,7 +24,7 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) } static struct task_struct * -pick_next_task_stop(struct rq *rq, struct task_struct *prev) +pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { struct task_struct *stop = rq->stop; diff --git a/kernel/signal.c b/kernel/signal.c index aa9bf00749c1..ab122a2cee41 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3099,12 +3099,14 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s oss.ss_sp = (void __user *) current->sas_ss_sp; oss.ss_size = current->sas_ss_size; - oss.ss_flags = sas_ss_flags(sp); + oss.ss_flags = sas_ss_flags(sp) | + (current->sas_ss_flags & SS_FLAG_BITS); if (uss) { void __user *ss_sp; size_t ss_size; - int ss_flags; + unsigned ss_flags; + int ss_mode; error = -EFAULT; if (!access_ok(VERIFY_READ, uss, sizeof(*uss))) @@ -3119,18 +3121,13 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s if (on_sig_stack(sp)) goto out; + ss_mode = ss_flags & ~SS_FLAG_BITS; error = -EINVAL; - /* - * Note - this code used to test ss_flags incorrectly: - * old code may have been written using ss_flags==0 - * to mean ss_flags==SS_ONSTACK (as this was the only - * way that worked) - this fix preserves that older - * mechanism. 
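The ss_mode split above is what lets flag bits such as SS_AUTODISARM ride alongside the SS_ONSTACK/SS_DISABLE mode values. A userspace sketch of requesting auto-disarm; the fallback constant is an assumption matching the uapi headers that accompany this series, and kernels without the change fail the call with EINVAL:

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

#ifndef SS_AUTODISARM
#define SS_AUTODISARM	(1U << 31)	/* assumed uapi value */
#endif

int main(void)
{
	static char stack_buf[1 << 14];
	stack_t ss = {
		.ss_sp = stack_buf,
		.ss_size = sizeof(stack_buf),
		.ss_flags = (int)SS_AUTODISARM,	/* mode bits stay 0 */
	};

	if (sigaltstack(&ss, NULL) == -1) {
		perror("sigaltstack");	/* EINVAL without this change */
		return EXIT_FAILURE;
	}
	puts("alternate stack installed with auto-disarm");
	return 0;
}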
- */ - if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0) + if (ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK && + ss_mode != 0) goto out; - if (ss_flags == SS_DISABLE) { + if (ss_mode == SS_DISABLE) { ss_size = 0; ss_sp = NULL; } else { @@ -3141,6 +3138,7 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s current->sas_ss_sp = (unsigned long) ss_sp; current->sas_ss_size = ss_size; + current->sas_ss_flags = ss_flags; } error = 0; @@ -3171,9 +3169,14 @@ int restore_altstack(const stack_t __user *uss) int __save_altstack(stack_t __user *uss, unsigned long sp) { struct task_struct *t = current; - return __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) | - __put_user(sas_ss_flags(sp), &uss->ss_flags) | + int err = __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) | + __put_user(t->sas_ss_flags, &uss->ss_flags) | __put_user(t->sas_ss_size, &uss->ss_size); + if (err) + return err; + if (t->sas_ss_flags & SS_AUTODISARM) + sas_ss_reset(t); + return 0; } #ifdef CONFIG_COMPAT diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 58e3310c9b21..536ada80f6dd 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -262,7 +262,7 @@ static void tick_nohz_dep_set_all(atomic_t *dep, { int prev; - prev = atomic_fetch_or(dep, BIT(bit)); + prev = atomic_fetch_or(BIT(bit), dep); if (!prev) tick_nohz_full_kick_all(); } @@ -292,7 +292,7 @@ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) ts = per_cpu_ptr(&tick_cpu_sched, cpu); - prev = atomic_fetch_or(&ts->tick_dep_mask, BIT(bit)); + prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask); if (!prev) { preempt_disable(); /* Perf needs local kick that is NMI safe */ @@ -776,6 +776,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, if (!ts->tick_stopped) { nohz_balance_enter_idle(cpu); calc_load_enter_idle(); + cpu_load_update_nohz_start(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; @@ -802,11 +803,11 @@ out: return tick; } -static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now, int active) +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ tick_do_update_jiffies64(now); - update_cpu_load_nohz(active); + cpu_load_update_nohz_stop(); calc_load_exit_idle(); touch_softlockup_watchdog_sched(); @@ -833,7 +834,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) if (can_stop_full_tick(ts)) tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); else if (ts->tick_stopped) - tick_nohz_restart_sched_tick(ts, ktime_get(), 1); + tick_nohz_restart_sched_tick(ts, ktime_get()); #endif } @@ -1024,7 +1025,7 @@ void tick_nohz_idle_exit(void) tick_nohz_stop_idle(ts, now); if (ts->tick_stopped) { - tick_nohz_restart_sched_tick(ts, now, 0); + tick_nohz_restart_sched_tick(ts, now); tick_nohz_account_idle_ticks(ts); } diff --git a/kernel/torture.c b/kernel/torture.c index 44aa462d033f..fa0bdeee17ac 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -451,6 +451,7 @@ static int torture_shutdown(void *arg) torture_shutdown_hook(); else VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping."); + ftrace_dump(DUMP_ALL); kernel_power_off(); /* Shut down the system. 
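The tick-sched hunks above reorder the atomic_fetch_or() arguments to the kernel's value-first convention (mask, then object), the opposite of C11's object-first form. The semantics the callers rely on -- the old value is returned, so only the first setter kicks -- can be demonstrated with the C11 equivalent:

#include <stdatomic.h>
#include <stdio.h>

int main(void)
{
	atomic_int dep = 0;

	/* C11 is object-first; the kernel helper is value-first. */
	int prev = atomic_fetch_or(&dep, 1 << 3);

	if (!prev)	/* first dependency bit set: callers kick here */
		puts("first setter: kick");
	printf("prev=%d now=%d\n", prev, atomic_load(&dep));
	return 0;
}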
*/ return 0; } @@ -602,8 +603,9 @@ bool torture_init_begin(char *ttype, bool v, int *runnable) { mutex_lock(&fullstop_mutex); if (torture_type != NULL) { - pr_alert("torture_init_begin: refusing %s init: %s running", + pr_alert("torture_init_begin: Refusing %s init: %s running.\n", ttype, torture_type); + pr_alert("torture_init_begin: One torture test at a time!\n"); mutex_unlock(&fullstop_mutex); return false; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3bfdff06eea7..5f5068e94003 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4554,6 +4554,17 @@ static void rebind_workers(struct worker_pool *pool) pool->attrs->cpumask) < 0); spin_lock_irq(&pool->lock); + + /* + * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED + * w/o preceding DOWN_PREPARE. Work around it. CPU hotplug is + * being reworked and this can go away in time. + */ + if (!(pool->flags & POOL_DISASSOCIATED)) { + spin_unlock_irq(&pool->lock); + return; + } + pool->flags &= ~POOL_DISASSOCIATED; for_each_pool_worker(worker, pool) { |
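Returning to the calc_load() change in kernel/sched/loadavg.c above: rounding half-up unconditionally keeps a fully idle load average pinned above zero, while rounding up only when active >= load lets it decay all the way down. A runnable check with the kernel's 1-minute fixed-point constants (FSHIFT == 11, EXP_1 == 1884):

#include <stdio.h>

#define FSHIFT	11
#define FIXED_1	(1UL << FSHIFT)
#define EXP_1	1884	/* 1/exp(5sec/1min) as fixed-point */

static unsigned long calc_load_new(unsigned long load, unsigned long exp,
				   unsigned long active)
{
	unsigned long newload = load * exp + active * (FIXED_1 - exp);

	if (active >= load)
		newload += FIXED_1 - 1;	/* round up only when rising */
	return newload / FIXED_1;
}

int main(void)
{
	unsigned long load = FIXED_1;	/* loadavg 1.00, machine goes idle */
	int ticks = 0;

	while (load && ticks < 256) {
		load = calc_load_new(load, EXP_1, 0);
		ticks++;
	}
	/* old rounding floors at 1 forever: (1*1884 + 1024) >> 11 == 1 */
	printf("decayed to %lu after %d ticks\n", load, ticks);
	return 0;
}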