diff options
author | Frederic Weisbecker <fweisbec@gmail.com> | 2013-05-02 17:37:49 +0200 |
---|---|---|
committer | Frederic Weisbecker <fweisbec@gmail.com> | 2013-05-02 17:54:19 +0200 |
commit | c032862fba51a3ca504752d3a25186b324c5ce83 (patch) | |
tree | 955dc2ba4ab3df76ecc2bb780ee84aca04967e8d /kernel/sched | |
parent | Merge branch 'rcu/nohz' of git://git.kernel.org/pub/scm/linux/kernel/git/paul... (diff) | |
parent | Merge branch 'smp-hotplug-for-linus' of git://git.kernel.org/pub/scm/linux/ke... (diff) | |
download | linux-c032862fba51a3ca504752d3a25186b324c5ce83.tar.xz linux-c032862fba51a3ca504752d3a25186b324c5ce83.zip |
Merge commit '8700c95adb03' into timers/nohz
The full dynticks tree needs the latest RCU and sched
upstream updates in order to fix some dependencies.
Merge a common upstream merge point that has these
updates.
Conflicts:
include/linux/perf_event.h
kernel/rcutree.h
kernel/rcutree_plugin.h
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/Makefile | 1 | ||||
-rw-r--r-- | kernel/sched/clock.c | 26 | ||||
-rw-r--r-- | kernel/sched/core.c | 299 | ||||
-rw-r--r-- | kernel/sched/cpuacct.c | 296 | ||||
-rw-r--r-- | kernel/sched/cpuacct.h | 17 | ||||
-rw-r--r-- | kernel/sched/cputime.c | 20 | ||||
-rw-r--r-- | kernel/sched/fair.c | 131 | ||||
-rw-r--r-- | kernel/sched/features.h | 7 | ||||
-rw-r--r-- | kernel/sched/idle_task.c | 16 | ||||
-rw-r--r-- | kernel/sched/sched.h | 61 |
10 files changed, 464 insertions, 410 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index f06d249e103b..deaf90e4a1de 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -16,3 +16,4 @@ obj-$(CONFIG_SMP) += cpupri.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o +obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c685e31492df..c3ae1446461c 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -176,10 +176,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd) u64 this_clock, remote_clock; u64 *ptr, old_val, val; +#if BITS_PER_LONG != 64 +again: + /* + * Careful here: The local and the remote clock values need to + * be read out atomic as we need to compare the values and + * then update either the local or the remote side. So the + * cmpxchg64 below only protects one readout. + * + * We must reread via sched_clock_local() in the retry case on + * 32bit as an NMI could use sched_clock_local() via the + * tracer and hit between the readout of + * the low32bit and the high 32bit portion. + */ + this_clock = sched_clock_local(my_scd); + /* + * We must enforce atomic readout on 32bit, otherwise the + * update on the remote cpu can hit inbetween the readout of + * the low32bit and the high 32bit portion. + */ + remote_clock = cmpxchg64(&scd->clock, 0, 0); +#else + /* + * On 64bit the read of [my]scd->clock is atomic versus the + * update, so we can avoid the above 32bit dance. + */ sched_clock_local(my_scd); again: this_clock = my_scd->clock; remote_clock = scd->clock; +#endif /* * Use the opportunity that we have both locks diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dd09def88567..e94842d4400c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -512,11 +512,6 @@ static inline void init_hrtick(void) * the target CPU. */ #ifdef CONFIG_SMP - -#ifndef tsk_is_polling -#define tsk_is_polling(t) 0 -#endif - void resched_task(struct task_struct *p) { int cpu; @@ -1536,8 +1531,10 @@ static void try_to_wake_up_local(struct task_struct *p) { struct rq *rq = task_rq(p); - BUG_ON(rq != this_rq()); - BUG_ON(p == current); + if (WARN_ON_ONCE(rq != this_rq()) || + WARN_ON_ONCE(p == current)) + return; + lockdep_assert_held(&rq->lock); if (!raw_spin_trylock(&p->pi_lock)) { @@ -3037,51 +3034,6 @@ void __sched schedule_preempt_disabled(void) preempt_disable(); } -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER - -static inline bool owner_running(struct mutex *lock, struct task_struct *owner) -{ - if (lock->owner != owner) - return false; - - /* - * Ensure we emit the owner->on_cpu, dereference _after_ checking - * lock->owner still matches owner, if that fails, owner might - * point to free()d memory, if it still matches, the rcu_read_lock() - * ensures the memory stays valid. - */ - barrier(); - - return owner->on_cpu; -} - -/* - * Look out! "owner" is an entirely speculative pointer - * access and not reliable. - */ -int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) -{ - if (!sched_feat(OWNER_SPIN)) - return 0; - - rcu_read_lock(); - while (owner_running(lock, owner)) { - if (need_resched()) - break; - - arch_mutex_cpu_relax(); - } - rcu_read_unlock(); - - /* - * We break out the loop above on need_resched() and when the - * owner changed, which is a sign for heavy contention. Return - * success only when lock->owner is NULL. - */ - return lock->owner == NULL; -} -#endif - #ifdef CONFIG_PREEMPT /* * this is the entry point to schedule() from in-kernel preemption @@ -4170,6 +4122,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) get_task_struct(p); rcu_read_unlock(); + if (p->flags & PF_NO_SETAFFINITY) { + retval = -EINVAL; + goto out_put_task; + } if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { retval = -ENOMEM; goto out_put_task; @@ -4817,11 +4773,6 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; } - if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { - ret = -EINVAL; - goto out; - } - do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? If so, we're done */ @@ -5043,7 +4994,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) } static int min_load_idx = 0; -static int max_load_idx = CPU_LOAD_IDX_MAX; +static int max_load_idx = CPU_LOAD_IDX_MAX-1; static void set_table_entry(struct ctl_table *entry, @@ -6292,7 +6243,7 @@ static void sched_init_numa(void) * 'level' contains the number of unique distances, excluding the * identity distance node_distance(i,i). * - * The sched_domains_nume_distance[] array includes the actual distance + * The sched_domains_numa_distance[] array includes the actual distance * numbers. */ @@ -6913,7 +6864,7 @@ struct task_group root_task_group; LIST_HEAD(task_groups); #endif -DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); +DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); void __init sched_init(void) { @@ -6950,7 +6901,7 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CPUMASK_OFFSTACK for_each_possible_cpu(i) { - per_cpu(load_balance_tmpmask, i) = (void *)ptr; + per_cpu(load_balance_mask, i) = (void *)ptr; ptr += cpumask_size(); } #endif /* CONFIG_CPUMASK_OFFSTACK */ @@ -6976,12 +6927,6 @@ void __init sched_init(void) #endif /* CONFIG_CGROUP_SCHED */ -#ifdef CONFIG_CGROUP_CPUACCT - root_cpuacct.cpustat = &kernel_cpustat; - root_cpuacct.cpuusage = alloc_percpu(u64); - /* Too early, not expected to fail */ - BUG_ON(!root_cpuacct.cpuusage); -#endif for_each_possible_cpu(i) { struct rq *rq; @@ -8083,226 +8028,6 @@ struct cgroup_subsys cpu_cgroup_subsys = { #endif /* CONFIG_CGROUP_SCHED */ -#ifdef CONFIG_CGROUP_CPUACCT - -/* - * CPU accounting code for task groups. - * - * Based on the work by Paul Menage (menage@google.com) and Balbir Singh - * (balbir@in.ibm.com). - */ - -struct cpuacct root_cpuacct; - -/* create a new cpu accounting group */ -static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) -{ - struct cpuacct *ca; - - if (!cgrp->parent) - return &root_cpuacct.css; - - ca = kzalloc(sizeof(*ca), GFP_KERNEL); - if (!ca) - goto out; - - ca->cpuusage = alloc_percpu(u64); - if (!ca->cpuusage) - goto out_free_ca; - - ca->cpustat = alloc_percpu(struct kernel_cpustat); - if (!ca->cpustat) - goto out_free_cpuusage; - - return &ca->css; - -out_free_cpuusage: - free_percpu(ca->cpuusage); -out_free_ca: - kfree(ca); -out: - return ERR_PTR(-ENOMEM); -} - -/* destroy an existing cpu accounting group */ -static void cpuacct_css_free(struct cgroup *cgrp) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - - free_percpu(ca->cpustat); - free_percpu(ca->cpuusage); - kfree(ca); -} - -static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) -{ - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - u64 data; - -#ifndef CONFIG_64BIT - /* - * Take rq->lock to make 64-bit read safe on 32-bit platforms. - */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); - data = *cpuusage; - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - data = *cpuusage; -#endif - - return data; -} - -static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) -{ - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - -#ifndef CONFIG_64BIT - /* - * Take rq->lock to make 64-bit write safe on 32-bit platforms. - */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); - *cpuusage = val; - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - *cpuusage = val; -#endif -} - -/* return total cpu usage (in nanoseconds) of a group */ -static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - u64 totalcpuusage = 0; - int i; - - for_each_present_cpu(i) - totalcpuusage += cpuacct_cpuusage_read(ca, i); - - return totalcpuusage; -} - -static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, - u64 reset) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - int err = 0; - int i; - - if (reset) { - err = -EINVAL; - goto out; - } - - for_each_present_cpu(i) - cpuacct_cpuusage_write(ca, i, 0); - -out: - return err; -} - -static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, - struct seq_file *m) -{ - struct cpuacct *ca = cgroup_ca(cgroup); - u64 percpu; - int i; - - for_each_present_cpu(i) { - percpu = cpuacct_cpuusage_read(ca, i); - seq_printf(m, "%llu ", (unsigned long long) percpu); - } - seq_printf(m, "\n"); - return 0; -} - -static const char *cpuacct_stat_desc[] = { - [CPUACCT_STAT_USER] = "user", - [CPUACCT_STAT_SYSTEM] = "system", -}; - -static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, - struct cgroup_map_cb *cb) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - int cpu; - s64 val = 0; - - for_each_online_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_USER]; - val += kcpustat->cpustat[CPUTIME_NICE]; - } - val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); - - val = 0; - for_each_online_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_SYSTEM]; - val += kcpustat->cpustat[CPUTIME_IRQ]; - val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; - } - - val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); - - return 0; -} - -static struct cftype files[] = { - { - .name = "usage", - .read_u64 = cpuusage_read, - .write_u64 = cpuusage_write, - }, - { - .name = "usage_percpu", - .read_seq_string = cpuacct_percpu_seq_read, - }, - { - .name = "stat", - .read_map = cpuacct_stats_show, - }, - { } /* terminate */ -}; - -/* - * charge this task's execution time to its accounting group. - * - * called with rq->lock held. - */ -void cpuacct_charge(struct task_struct *tsk, u64 cputime) -{ - struct cpuacct *ca; - int cpu; - - if (unlikely(!cpuacct_subsys.active)) - return; - - cpu = task_cpu(tsk); - - rcu_read_lock(); - - ca = task_ca(tsk); - - for (; ca; ca = parent_ca(ca)) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - *cpuusage += cputime; - } - - rcu_read_unlock(); -} - -struct cgroup_subsys cpuacct_subsys = { - .name = "cpuacct", - .css_alloc = cpuacct_css_alloc, - .css_free = cpuacct_css_free, - .subsys_id = cpuacct_subsys_id, - .base_cftypes = files, -}; -#endif /* CONFIG_CGROUP_CPUACCT */ - void dump_cpu_task(int cpu) { pr_info("Task dump for CPU %d:\n", cpu); diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c new file mode 100644 index 000000000000..dbb7e2cd95eb --- /dev/null +++ b/kernel/sched/cpuacct.c @@ -0,0 +1,296 @@ +#include <linux/cgroup.h> +#include <linux/slab.h> +#include <linux/percpu.h> +#include <linux/spinlock.h> +#include <linux/cpumask.h> +#include <linux/seq_file.h> +#include <linux/rcupdate.h> +#include <linux/kernel_stat.h> +#include <linux/err.h> + +#include "sched.h" + +/* + * CPU accounting code for task groups. + * + * Based on the work by Paul Menage (menage@google.com) and Balbir Singh + * (balbir@in.ibm.com). + */ + +/* Time spent by the tasks of the cpu accounting group executing in ... */ +enum cpuacct_stat_index { + CPUACCT_STAT_USER, /* ... user mode */ + CPUACCT_STAT_SYSTEM, /* ... kernel mode */ + + CPUACCT_STAT_NSTATS, +}; + +/* track cpu usage of a group of tasks and its child groups */ +struct cpuacct { + struct cgroup_subsys_state css; + /* cpuusage holds pointer to a u64-type object on every cpu */ + u64 __percpu *cpuusage; + struct kernel_cpustat __percpu *cpustat; +}; + +/* return cpu accounting group corresponding to this container */ +static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) +{ + return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), + struct cpuacct, css); +} + +/* return cpu accounting group to which this task belongs */ +static inline struct cpuacct *task_ca(struct task_struct *tsk) +{ + return container_of(task_subsys_state(tsk, cpuacct_subsys_id), + struct cpuacct, css); +} + +static inline struct cpuacct *__parent_ca(struct cpuacct *ca) +{ + return cgroup_ca(ca->css.cgroup->parent); +} + +static inline struct cpuacct *parent_ca(struct cpuacct *ca) +{ + if (!ca->css.cgroup->parent) + return NULL; + return cgroup_ca(ca->css.cgroup->parent); +} + +static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); +static struct cpuacct root_cpuacct = { + .cpustat = &kernel_cpustat, + .cpuusage = &root_cpuacct_cpuusage, +}; + +/* create a new cpu accounting group */ +static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) +{ + struct cpuacct *ca; + + if (!cgrp->parent) + return &root_cpuacct.css; + + ca = kzalloc(sizeof(*ca), GFP_KERNEL); + if (!ca) + goto out; + + ca->cpuusage = alloc_percpu(u64); + if (!ca->cpuusage) + goto out_free_ca; + + ca->cpustat = alloc_percpu(struct kernel_cpustat); + if (!ca->cpustat) + goto out_free_cpuusage; + + return &ca->css; + +out_free_cpuusage: + free_percpu(ca->cpuusage); +out_free_ca: + kfree(ca); +out: + return ERR_PTR(-ENOMEM); +} + +/* destroy an existing cpu accounting group */ +static void cpuacct_css_free(struct cgroup *cgrp) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + + free_percpu(ca->cpustat); + free_percpu(ca->cpuusage); + kfree(ca); +} + +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +{ + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 data; + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit read safe on 32-bit platforms. + */ + raw_spin_lock_irq(&cpu_rq(cpu)->lock); + data = *cpuusage; + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + data = *cpuusage; +#endif + + return data; +} + +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) +{ + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit write safe on 32-bit platforms. + */ + raw_spin_lock_irq(&cpu_rq(cpu)->lock); + *cpuusage = val; + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + *cpuusage = val; +#endif +} + +/* return total cpu usage (in nanoseconds) of a group */ +static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + u64 totalcpuusage = 0; + int i; + + for_each_present_cpu(i) + totalcpuusage += cpuacct_cpuusage_read(ca, i); + + return totalcpuusage; +} + +static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, + u64 reset) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int err = 0; + int i; + + if (reset) { + err = -EINVAL; + goto out; + } + + for_each_present_cpu(i) + cpuacct_cpuusage_write(ca, i, 0); + +out: + return err; +} + +static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, + struct seq_file *m) +{ + struct cpuacct *ca = cgroup_ca(cgroup); + u64 percpu; + int i; + + for_each_present_cpu(i) { + percpu = cpuacct_cpuusage_read(ca, i); + seq_printf(m, "%llu ", (unsigned long long) percpu); + } + seq_printf(m, "\n"); + return 0; +} + +static const char * const cpuacct_stat_desc[] = { + [CPUACCT_STAT_USER] = "user", + [CPUACCT_STAT_SYSTEM] = "system", +}; + +static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int cpu; + s64 val = 0; + + for_each_online_cpu(cpu) { + struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); + val += kcpustat->cpustat[CPUTIME_USER]; + val += kcpustat->cpustat[CPUTIME_NICE]; + } + val = cputime64_to_clock_t(val); + cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); + + val = 0; + for_each_online_cpu(cpu) { + struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); + val += kcpustat->cpustat[CPUTIME_SYSTEM]; + val += kcpustat->cpustat[CPUTIME_IRQ]; + val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; + } + + val = cputime64_to_clock_t(val); + cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); + + return 0; +} + +static struct cftype files[] = { + { + .name = "usage", + .read_u64 = cpuusage_read, + .write_u64 = cpuusage_write, + }, + { + .name = "usage_percpu", + .read_seq_string = cpuacct_percpu_seq_read, + }, + { + .name = "stat", + .read_map = cpuacct_stats_show, + }, + { } /* terminate */ +}; + +/* + * charge this task's execution time to its accounting group. + * + * called with rq->lock held. + */ +void cpuacct_charge(struct task_struct *tsk, u64 cputime) +{ + struct cpuacct *ca; + int cpu; + + cpu = task_cpu(tsk); + + rcu_read_lock(); + + ca = task_ca(tsk); + + while (true) { + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + *cpuusage += cputime; + + ca = parent_ca(ca); + if (!ca) + break; + } + + rcu_read_unlock(); +} + +/* + * Add user/system time to cpuacct. + * + * Note: it's the caller that updates the account of the root cgroup. + */ +void cpuacct_account_field(struct task_struct *p, int index, u64 val) +{ + struct kernel_cpustat *kcpustat; + struct cpuacct *ca; + + rcu_read_lock(); + ca = task_ca(p); + while (ca != &root_cpuacct) { + kcpustat = this_cpu_ptr(ca->cpustat); + kcpustat->cpustat[index] += val; + ca = __parent_ca(ca); + } + rcu_read_unlock(); +} + +struct cgroup_subsys cpuacct_subsys = { + .name = "cpuacct", + .css_alloc = cpuacct_css_alloc, + .css_free = cpuacct_css_free, + .subsys_id = cpuacct_subsys_id, + .base_cftypes = files, + .early_init = 1, +}; diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h new file mode 100644 index 000000000000..ed605624a5e7 --- /dev/null +++ b/kernel/sched/cpuacct.h @@ -0,0 +1,17 @@ +#ifdef CONFIG_CGROUP_CPUACCT + +extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); +extern void cpuacct_account_field(struct task_struct *p, int index, u64 val); + +#else + +static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) +{ +} + +static inline void +cpuacct_account_field(struct task_struct *p, int index, u64 val) +{ +} + +#endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 699d59756ece..ea32f02bf2c3 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -115,10 +115,6 @@ static int irqtime_account_si_update(void) static inline void task_group_account_field(struct task_struct *p, int index, u64 tmp) { -#ifdef CONFIG_CGROUP_CPUACCT - struct kernel_cpustat *kcpustat; - struct cpuacct *ca; -#endif /* * Since all updates are sure to touch the root cgroup, we * get ourselves ahead and touch it first. If the root cgroup @@ -127,19 +123,7 @@ static inline void task_group_account_field(struct task_struct *p, int index, */ __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; -#ifdef CONFIG_CGROUP_CPUACCT - if (unlikely(!cpuacct_subsys.active)) - return; - - rcu_read_lock(); - ca = task_ca(p); - while (ca && (ca != &root_cpuacct)) { - kcpustat = this_cpu_ptr(ca->cpustat); - kcpustat->cpustat[index] += tmp; - ca = parent_ca(ca); - } - rcu_read_unlock(); -#endif + cpuacct_account_field(p, index, tmp); } /* @@ -310,7 +294,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) t = tsk; do { - task_cputime(tsk, &utime, &stime); + task_cputime(t, &utime, &stime); times->utime += utime; times->stime += stime; times->sum_exec_runtime += task_sched_runtime(t); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5c97fca091a7..c61a614465c8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1563,6 +1563,27 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); } /* migrations, e.g. sleep=0 leave decay_count == 0 */ } + +/* + * Update the rq's load with the elapsed running time before entering + * idle. if the last scheduled task is not a CFS task, idle_enter will + * be the only way to update the runnable statistic. + */ +void idle_enter_fair(struct rq *this_rq) +{ + update_rq_runnable_avg(this_rq, 1); +} + +/* + * Update the rq's load with the elapsed idle time before a task is + * scheduled. if the newly scheduled task is not a CFS task, idle_exit will + * be the only way to update the runnable statistic. + */ +void idle_exit_fair(struct rq *this_rq) +{ + update_rq_runnable_avg(this_rq, 0); +} + #else static inline void update_entity_load_avg(struct sched_entity *se, int update_cfs_rq) {} @@ -3875,12 +3896,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) int tsk_cache_hot = 0; /* * We do not migrate tasks that are: - * 1) running (obviously), or + * 1) throttled_lb_pair, or * 2) cannot be migrated to this CPU due to cpus_allowed, or - * 3) are cache-hot on their current CPU. + * 3) running (obviously), or + * 4) are cache-hot on their current CPU. */ + if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) + return 0; + if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { - int new_dst_cpu; + int cpu; schedstat_inc(p, se.statistics.nr_failed_migrations_affine); @@ -3895,12 +3920,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) return 0; - new_dst_cpu = cpumask_first_and(env->dst_grpmask, - tsk_cpus_allowed(p)); - if (new_dst_cpu < nr_cpu_ids) { - env->flags |= LBF_SOME_PINNED; - env->new_dst_cpu = new_dst_cpu; + /* Prevent to re-select dst_cpu via env's cpus */ + for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { + env->flags |= LBF_SOME_PINNED; + env->new_dst_cpu = cpu; + break; + } } + return 0; } @@ -3921,20 +3949,17 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); if (!tsk_cache_hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { -#ifdef CONFIG_SCHEDSTATS + if (tsk_cache_hot) { schedstat_inc(env->sd, lb_hot_gained[env->idle]); schedstat_inc(p, se.statistics.nr_forced_migrations); } -#endif + return 1; } - if (tsk_cache_hot) { - schedstat_inc(p, se.statistics.nr_failed_migrations_hot); - return 0; - } - return 1; + schedstat_inc(p, se.statistics.nr_failed_migrations_hot); + return 0; } /* @@ -3949,9 +3974,6 @@ static int move_one_task(struct lb_env *env) struct task_struct *p, *n; list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { - if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu)) - continue; - if (!can_migrate_task(p, env)) continue; @@ -4003,7 +4025,7 @@ static int move_tasks(struct lb_env *env) break; } - if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) + if (!can_migrate_task(p, env)) goto next; load = task_h_load(p); @@ -4014,9 +4036,6 @@ static int move_tasks(struct lb_env *env) if ((load / 2) > env->imbalance) goto next; - if (!can_migrate_task(p, env)) - goto next; - move_task(p, env); pulled++; env->imbalance -= load; @@ -4961,7 +4980,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, #define MAX_PINNED_INTERVAL 512 /* Working cpumask for load_balance and load_balance_newidle. */ -DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); +DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); static int need_active_balance(struct lb_env *env) { @@ -4992,11 +5011,10 @@ static int load_balance(int this_cpu, struct rq *this_rq, int *balance) { int ld_moved, cur_ld_moved, active_balance = 0; - int lb_iterations, max_lb_iterations; struct sched_group *group; struct rq *busiest; unsigned long flags; - struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); + struct cpumask *cpus = __get_cpu_var(load_balance_mask); struct lb_env env = { .sd = sd, @@ -5008,8 +5026,14 @@ static int load_balance(int this_cpu, struct rq *this_rq, .cpus = cpus, }; + /* + * For NEWLY_IDLE load_balancing, we don't need to consider + * other cpus in our group + */ + if (idle == CPU_NEWLY_IDLE) + env.dst_grpmask = NULL; + cpumask_copy(cpus, cpu_active_mask); - max_lb_iterations = cpumask_weight(env.dst_grpmask); schedstat_inc(sd, lb_count[idle]); @@ -5035,7 +5059,6 @@ redo: schedstat_add(sd, lb_imbalance[idle], env.imbalance); ld_moved = 0; - lb_iterations = 1; if (busiest->nr_running > 1) { /* * Attempt to move tasks. If find_busiest_group has found @@ -5062,17 +5085,17 @@ more_balance: double_rq_unlock(env.dst_rq, busiest); local_irq_restore(flags); - if (env.flags & LBF_NEED_BREAK) { - env.flags &= ~LBF_NEED_BREAK; - goto more_balance; - } - /* * some other cpu did the load balance for us. */ if (cur_ld_moved && env.dst_cpu != smp_processor_id()) resched_cpu(env.dst_cpu); + if (env.flags & LBF_NEED_BREAK) { + env.flags &= ~LBF_NEED_BREAK; + goto more_balance; + } + /* * Revisit (affine) tasks on src_cpu that couldn't be moved to * us and move them to an alternate dst_cpu in our sched_group @@ -5092,14 +5115,17 @@ more_balance: * moreover subsequent load balance cycles should correct the * excess load moved. */ - if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && - lb_iterations++ < max_lb_iterations) { + if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { env.dst_rq = cpu_rq(env.new_dst_cpu); env.dst_cpu = env.new_dst_cpu; env.flags &= ~LBF_SOME_PINNED; env.loop = 0; env.loop_break = sched_nr_migrate_break; + + /* Prevent to re-select dst_cpu via env's cpus */ + cpumask_clear_cpu(env.dst_cpu, env.cpus); + /* * Go back to "more_balance" rather than "redo" since we * need to continue with same src_cpu. @@ -5220,8 +5246,6 @@ void idle_balance(int this_cpu, struct rq *this_rq) if (this_rq->avg_idle < sysctl_sched_migration_cost) return; - update_rq_runnable_avg(this_rq, 1); - /* * Drop the rq->lock, but keep IRQ/preempt disabled. */ @@ -5396,13 +5420,16 @@ static inline void set_cpu_sd_state_busy(void) struct sched_domain *sd; int cpu = smp_processor_id(); - if (!test_bit(NOHZ_IDLE, nohz_flags(cpu))) - return; - clear_bit(NOHZ_IDLE, nohz_flags(cpu)); - rcu_read_lock(); - for_each_domain(cpu, sd) + sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); + + if (!sd || !sd->nohz_idle) + goto unlock; + sd->nohz_idle = 0; + + for (; sd; sd = sd->parent) atomic_inc(&sd->groups->sgp->nr_busy_cpus); +unlock: rcu_read_unlock(); } @@ -5411,13 +5438,16 @@ void set_cpu_sd_state_idle(void) struct sched_domain *sd; int cpu = smp_processor_id(); - if (test_bit(NOHZ_IDLE, nohz_flags(cpu))) - return; - set_bit(NOHZ_IDLE, nohz_flags(cpu)); - rcu_read_lock(); - for_each_domain(cpu, sd) + sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); + + if (!sd || sd->nohz_idle) + goto unlock; + sd->nohz_idle = 1; + + for (; sd; sd = sd->parent) atomic_dec(&sd->groups->sgp->nr_busy_cpus); +unlock: rcu_read_unlock(); } @@ -5469,7 +5499,7 @@ void update_max_interval(void) * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. * - * Balancing parameters are set up in arch_init_sched_domains. + * Balancing parameters are set up in init_sched_domains. */ static void rebalance_domains(int cpu, enum cpu_idle_type idle) { @@ -5507,10 +5537,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) if (time_after_eq(jiffies, sd->last_balance + interval)) { if (load_balance(cpu, rq, sd, idle, &balance)) { /* - * We've pulled tasks over so either we're no - * longer idle. + * The LBF_SOME_PINNED logic could have changed + * env->dst_cpu, so we can't know our idle + * state even if we migrated tasks. Update it. */ - idle = CPU_NOT_IDLE; + idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; } sd->last_balance = jiffies; } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 1ad1d2b5395f..99399f8e4799 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -46,13 +46,6 @@ SCHED_FEAT(DOUBLE_TICK, false) SCHED_FEAT(LB_BIAS, true) /* - * Spin-wait on mutex acquisition when the mutex owner is running on - * another cpu -- assumes that when the owner is running, it will soon - * release the lock. Decreases scheduling overhead. - */ -SCHED_FEAT(OWNER_SPIN, true) - -/* * Decrement CPU power based on time not spent running tasks */ SCHED_FEAT(NONTASK_POWER, true) diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index b6baf370cae9..b8ce77328341 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -13,6 +13,16 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } + +static void pre_schedule_idle(struct rq *rq, struct task_struct *prev) +{ + idle_exit_fair(rq); +} + +static void post_schedule_idle(struct rq *rq) +{ + idle_enter_fair(rq); +} #endif /* CONFIG_SMP */ /* * Idle tasks are unconditionally rescheduled: @@ -25,6 +35,10 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl static struct task_struct *pick_next_task_idle(struct rq *rq) { schedstat_inc(rq, sched_goidle); +#ifdef CONFIG_SMP + /* Trigger the post schedule to do an idle_enter for CFS */ + rq->post_schedule = 1; +#endif return rq->idle; } @@ -86,6 +100,8 @@ const struct sched_class idle_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_idle, + .pre_schedule = pre_schedule_idle, + .post_schedule = post_schedule_idle, #endif .set_curr_task = set_curr_task_idle, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index eb363aa5d83c..24dc29897749 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -8,6 +8,7 @@ #include <linux/tick.h> #include "cpupri.h" +#include "cpuacct.h" extern __read_mostly int scheduler_running; @@ -951,14 +952,6 @@ static const u32 prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; -/* Time spent by the tasks of the cpu accounting group executing in ... */ -enum cpuacct_stat_index { - CPUACCT_STAT_USER, /* ... user mode */ - CPUACCT_STAT_SYSTEM, /* ... kernel mode */ - - CPUACCT_STAT_NSTATS, -}; - #define ENQUEUE_WAKEUP 1 #define ENQUEUE_HEAD 2 #ifdef CONFIG_SMP @@ -1032,6 +1025,18 @@ extern void update_group_power(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq, int cpu); extern void idle_balance(int this_cpu, struct rq *this_rq); +/* + * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg + * becomes useful in lb + */ +#if defined(CONFIG_FAIR_GROUP_SCHED) +extern void idle_enter_fair(struct rq *this_rq); +extern void idle_exit_fair(struct rq *this_rq); +#else +static inline void idle_enter_fair(struct rq *this_rq) {} +static inline void idle_exit_fair(struct rq *this_rq) {} +#endif + #else /* CONFIG_SMP */ static inline void idle_balance(int cpu, struct rq *rq) @@ -1055,45 +1060,6 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime extern void update_idle_cpu_load(struct rq *this_rq); -#ifdef CONFIG_CGROUP_CPUACCT -#include <linux/cgroup.h> -/* track cpu usage of a group of tasks and its child groups */ -struct cpuacct { - struct cgroup_subsys_state css; - /* cpuusage holds pointer to a u64-type object on every cpu */ - u64 __percpu *cpuusage; - struct kernel_cpustat __percpu *cpustat; -}; - -extern struct cgroup_subsys cpuacct_subsys; -extern struct cpuacct root_cpuacct; - -/* return cpu accounting group corresponding to this container */ -static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) -{ - return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), - struct cpuacct, css); -} - -/* return cpu accounting group to which this task belongs */ -static inline struct cpuacct *task_ca(struct task_struct *tsk) -{ - return container_of(task_subsys_state(tsk, cpuacct_subsys_id), - struct cpuacct, css); -} - -static inline struct cpuacct *parent_ca(struct cpuacct *ca) -{ - if (!ca || !ca->css.cgroup->parent) - return NULL; - return cgroup_ca(ca->css.cgroup->parent); -} - -extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); -#else -static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} -#endif - #ifdef CONFIG_PARAVIRT static inline u64 steal_ticks(u64 steal) { @@ -1348,7 +1314,6 @@ extern void account_cfs_bandwidth_used(int enabled, int was_enabled); enum rq_nohz_flag_bits { NOHZ_TICK_STOPPED, NOHZ_BALANCE_KICK, - NOHZ_IDLE, }; #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) |