summaryrefslogtreecommitdiffstats
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/core.c133
-rw-r--r--kernel/sched/cpupri.c10
-rw-r--r--kernel/sched/fair.c153
-rw-r--r--kernel/sched/rt.c13
-rw-r--r--kernel/sched/sched.h31
-rw-r--r--kernel/sched/stop_task.c22
6 files changed, 271 insertions, 91 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 468bdd44c1ba..fbf1fd098dc6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1096,7 +1096,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
* a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
*
* sched_move_task() holds both and thus holding either pins the cgroup,
- * see set_task_rq().
+ * see task_group().
*
* Furthermore, all task_rq users should acquire both locks, see
* task_rq_lock().
@@ -1910,12 +1910,12 @@ static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
+ trace_sched_switch(prev, next);
sched_info_switch(prev, next);
perf_event_task_sched_out(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
prepare_lock_switch(rq, next);
prepare_arch_switch(next);
- trace_sched_switch(prev, next);
}
/**
@@ -3142,6 +3142,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
#endif
+static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
+{
+ u64 temp = (__force u64) rtime;
+
+ temp *= (__force u64) utime;
+
+ if (sizeof(cputime_t) == 4)
+ temp = div_u64(temp, (__force u32) total);
+ else
+ temp = div64_u64(temp, (__force u64) total);
+
+ return (__force cputime_t) temp;
+}
+
void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
cputime_t rtime, utime = p->utime, total = utime + p->stime;
@@ -3151,13 +3165,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
*/
rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
- if (total) {
- u64 temp = (__force u64) rtime;
-
- temp *= (__force u64) utime;
- do_div(temp, (__force u32) total);
- utime = (__force cputime_t) temp;
- } else
+ if (total)
+ utime = scale_utime(utime, rtime, total);
+ else
utime = rtime;
/*
@@ -3184,13 +3194,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
total = cputime.utime + cputime.stime;
rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
- if (total) {
- u64 temp = (__force u64) rtime;
-
- temp *= (__force u64) cputime.utime;
- do_div(temp, (__force u32) total);
- utime = (__force cputime_t) temp;
- } else
+ if (total)
+ utime = scale_utime(cputime.utime, rtime, total);
+ else
utime = rtime;
sig->prev_utime = max(sig->prev_utime, utime);
@@ -4340,9 +4346,7 @@ recheck:
*/
if (unlikely(policy == p->policy && (!rt_policy(policy) ||
param->sched_priority == p->rt_priority))) {
-
- __task_rq_unlock(rq);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_rq_unlock(rq, p, &flags);
return 0;
}
@@ -6024,6 +6028,11 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
* SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
* allows us to avoid some pointer chasing select_idle_sibling().
*
+ * Iterate domains and sched_groups downward, assigning CPUs to be
+ * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing
+ * due to random perturbation self canceling, ie sw buddies pull
+ * their counterpart to their CPU's hw counterpart.
+ *
* Also keep a unique ID per domain (we use the first cpu number in
* the cpumask of the domain), this allows us to quickly tell if
* two cpus are in the same cache domain, see cpus_share_cache().
@@ -6037,8 +6046,40 @@ static void update_top_cache_domain(int cpu)
int id = cpu;
sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
- if (sd)
+ if (sd) {
+ struct sched_domain *tmp = sd;
+ struct sched_group *sg, *prev;
+ bool right;
+
+ /*
+ * Traverse to first CPU in group, and count hops
+ * to cpu from there, switching direction on each
+ * hop, never ever pointing the last CPU rightward.
+ */
+ do {
+ id = cpumask_first(sched_domain_span(tmp));
+ prev = sg = tmp->groups;
+ right = 1;
+
+ while (cpumask_first(sched_group_cpus(sg)) != id)
+ sg = sg->next;
+
+ while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
+ prev = sg;
+ sg = sg->next;
+ right = !right;
+ }
+
+ /* A CPU went down, never point back to domain start. */
+ if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
+ right = false;
+
+ sg = right ? sg->next : prev;
+ tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
+ } while ((tmp = tmp->child));
+
id = cpumask_first(sched_domain_span(sd));
+ }
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_id, cpu) = id;
@@ -7097,34 +7138,66 @@ match2:
mutex_unlock(&sched_domains_mutex);
}
+static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
+
/*
* Update cpusets according to cpu_active mask. If cpusets are
* disabled, cpuset_update_active_cpus() becomes a simple wrapper
* around partition_sched_domains().
+ *
+ * If we come here as part of a suspend/resume, don't touch cpusets because we
+ * want to restore it back to its original state upon resume anyway.
*/
static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
- switch (action & ~CPU_TASKS_FROZEN) {
+ switch (action) {
+ case CPU_ONLINE_FROZEN:
+ case CPU_DOWN_FAILED_FROZEN:
+
+ /*
+ * num_cpus_frozen tracks how many CPUs are involved in suspend
+ * resume sequence. As long as this is not the last online
+ * operation in the resume sequence, just build a single sched
+ * domain, ignoring cpusets.
+ */
+ num_cpus_frozen--;
+ if (likely(num_cpus_frozen)) {
+ partition_sched_domains(1, NULL, NULL);
+ break;
+ }
+
+ /*
+ * This is the last CPU online operation. So fall through and
+ * restore the original sched domains by considering the
+ * cpuset configurations.
+ */
+
case CPU_ONLINE:
case CPU_DOWN_FAILED:
- cpuset_update_active_cpus();
- return NOTIFY_OK;
+ cpuset_update_active_cpus(true);
+ break;
default:
return NOTIFY_DONE;
}
+ return NOTIFY_OK;
}
static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
- switch (action & ~CPU_TASKS_FROZEN) {
+ switch (action) {
case CPU_DOWN_PREPARE:
- cpuset_update_active_cpus();
- return NOTIFY_OK;
+ cpuset_update_active_cpus(false);
+ break;
+ case CPU_DOWN_PREPARE_FROZEN:
+ num_cpus_frozen++;
+ partition_sched_domains(1, NULL, NULL);
+ break;
default:
return NOTIFY_DONE;
}
+ return NOTIFY_OK;
}
void __init sched_init_smp(void)
@@ -7179,6 +7252,7 @@ int in_sched_functions(unsigned long addr)
#ifdef CONFIG_CGROUP_SCHED
struct task_group root_task_group;
+LIST_HEAD(task_groups);
#endif
DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
@@ -7589,6 +7663,7 @@ void sched_destroy_group(struct task_group *tg)
*/
void sched_move_task(struct task_struct *tsk)
{
+ struct task_group *tg;
int on_rq, running;
unsigned long flags;
struct rq *rq;
@@ -7603,6 +7678,12 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
tsk->sched_class->put_prev_task(rq, tsk);
+ tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
+ lockdep_is_held(&tsk->sighand->siglock)),
+ struct task_group, css);
+ tg = autogroup_task_group(tsk, tg);
+ tsk->sched_task_group = tg;
+
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_move_group)
tsk->sched_class->task_move_group(tsk, on_rq);
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index d72586fdf660..23aa789c53ee 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -65,8 +65,8 @@ static int convert_prio(int prio)
int cpupri_find(struct cpupri *cp, struct task_struct *p,
struct cpumask *lowest_mask)
{
- int idx = 0;
- int task_pri = convert_prio(p->prio);
+ int idx = 0;
+ int task_pri = convert_prio(p->prio);
if (task_pri >= MAX_RT_PRIO)
return 0;
@@ -137,9 +137,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
*/
void cpupri_set(struct cpupri *cp, int cpu, int newpri)
{
- int *currpri = &cp->cpu_to_pri[cpu];
- int oldpri = *currpri;
- int do_mb = 0;
+ int *currpri = &cp->cpu_to_pri[cpu];
+ int oldpri = *currpri;
+ int do_mb = 0;
newpri = convert_prio(newpri);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c099cc6eebe3..c219bf8d704c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2637,8 +2637,6 @@ static int select_idle_sibling(struct task_struct *p, int target)
int cpu = smp_processor_id();
int prev_cpu = task_cpu(p);
struct sched_domain *sd;
- struct sched_group *sg;
- int i;
/*
* If the task is going to be woken-up on this cpu and if it is
@@ -2655,29 +2653,17 @@ static int select_idle_sibling(struct task_struct *p, int target)
return prev_cpu;
/*
- * Otherwise, iterate the domains and find an elegible idle cpu.
+ * Otherwise, check assigned siblings to find an elegible idle cpu.
*/
sd = rcu_dereference(per_cpu(sd_llc, target));
- for_each_lower_domain(sd) {
- sg = sd->groups;
- do {
- if (!cpumask_intersects(sched_group_cpus(sg),
- tsk_cpus_allowed(p)))
- goto next;
-
- for_each_cpu(i, sched_group_cpus(sg)) {
- if (!idle_cpu(i))
- goto next;
- }
- target = cpumask_first_and(sched_group_cpus(sg),
- tsk_cpus_allowed(p));
- goto done;
-next:
- sg = sg->next;
- } while (sg != sd->groups);
+ for_each_lower_domain(sd) {
+ if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
+ continue;
+ if (idle_cpu(sd->idle_buddy))
+ return sd->idle_buddy;
}
-done:
+
return target;
}
@@ -3068,18 +3054,24 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
#define LBF_ALL_PINNED 0x01
#define LBF_NEED_BREAK 0x02
+#define LBF_SOME_PINNED 0x04
struct lb_env {
struct sched_domain *sd;
- int src_cpu;
struct rq *src_rq;
+ int src_cpu;
int dst_cpu;
struct rq *dst_rq;
+ struct cpumask *dst_grpmask;
+ int new_dst_cpu;
enum cpu_idle_type idle;
long imbalance;
+ /* The set of CPUs under consideration for load-balancing */
+ struct cpumask *cpus;
+
unsigned int flags;
unsigned int loop;
@@ -3145,9 +3137,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
* 3) are cache-hot on their current CPU.
*/
if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+ int new_dst_cpu;
+
schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+
+ /*
+ * Remember if this task can be migrated to any other cpu in
+ * our sched_group. We may want to revisit it if we couldn't
+ * meet load balance goals by pulling other tasks on src_cpu.
+ *
+ * Also avoid computing new_dst_cpu if we have already computed
+ * one in current iteration.
+ */
+ if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
+ return 0;
+
+ new_dst_cpu = cpumask_first_and(env->dst_grpmask,
+ tsk_cpus_allowed(p));
+ if (new_dst_cpu < nr_cpu_ids) {
+ env->flags |= LBF_SOME_PINNED;
+ env->new_dst_cpu = new_dst_cpu;
+ }
return 0;
}
+
+ /* Record that we found atleast one task that could run on dst_cpu */
env->flags &= ~LBF_ALL_PINNED;
if (task_running(env->src_rq, p)) {
@@ -3373,6 +3387,14 @@ static int tg_load_down(struct task_group *tg, void *data)
static void update_h_load(long cpu)
{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long now = jiffies;
+
+ if (rq->h_load_throttle == now)
+ return;
+
+ rq->h_load_throttle = now;
+
rcu_read_lock();
walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
rcu_read_unlock();
@@ -3642,8 +3664,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
*/
static inline void update_sg_lb_stats(struct lb_env *env,
struct sched_group *group, int load_idx,
- int local_group, const struct cpumask *cpus,
- int *balance, struct sg_lb_stats *sgs)
+ int local_group, int *balance, struct sg_lb_stats *sgs)
{
unsigned long nr_running, max_nr_running, min_nr_running;
unsigned long load, max_cpu_load, min_cpu_load;
@@ -3660,7 +3681,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
max_nr_running = 0;
min_nr_running = ~0UL;
- for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+ for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
struct rq *rq = cpu_rq(i);
nr_running = rq->nr_running;
@@ -3789,8 +3810,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
* @sds: variable to hold the statistics for this sched_domain.
*/
static inline void update_sd_lb_stats(struct lb_env *env,
- const struct cpumask *cpus,
- int *balance, struct sd_lb_stats *sds)
+ int *balance, struct sd_lb_stats *sds)
{
struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
@@ -3807,8 +3827,7 @@ static inline void update_sd_lb_stats(struct lb_env *env,
local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
memset(&sgs, 0, sizeof(sgs));
- update_sg_lb_stats(env, sg, load_idx, local_group,
- cpus, balance, &sgs);
+ update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
if (local_group && !(*balance))
return;
@@ -4044,7 +4063,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* to restore balance.
*
* @env: The load balancing environment.
- * @cpus: The set of CPUs under consideration for load-balancing.
* @balance: Pointer to a variable indicating if this_cpu
* is the appropriate cpu to perform load balancing at this_level.
*
@@ -4054,7 +4072,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* put to idle by rebalancing its tasks onto our group.
*/
static struct sched_group *
-find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance)
+find_busiest_group(struct lb_env *env, int *balance)
{
struct sd_lb_stats sds;
@@ -4064,7 +4082,7 @@ find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance)
* Compute the various statistics relavent for load balancing at
* this level.
*/
- update_sd_lb_stats(env, cpus, balance, &sds);
+ update_sd_lb_stats(env, balance, &sds);
/*
* this_cpu is not the appropriate cpu to perform load balancing at
@@ -4144,8 +4162,7 @@ ret:
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
static struct rq *find_busiest_queue(struct lb_env *env,
- struct sched_group *group,
- const struct cpumask *cpus)
+ struct sched_group *group)
{
struct rq *busiest = NULL, *rq;
unsigned long max_load = 0;
@@ -4160,7 +4177,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
if (!capacity)
capacity = fix_small_capacity(env->sd, group);
- if (!cpumask_test_cpu(i, cpus))
+ if (!cpumask_test_cpu(i, env->cpus))
continue;
rq = cpu_rq(i);
@@ -4227,7 +4244,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *balance)
{
- int ld_moved, active_balance = 0;
+ int ld_moved, cur_ld_moved, active_balance = 0;
+ int lb_iterations, max_lb_iterations;
struct sched_group *group;
struct rq *busiest;
unsigned long flags;
@@ -4237,16 +4255,19 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.sd = sd,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
+ .dst_grpmask = sched_group_cpus(sd->groups),
.idle = idle,
.loop_break = sched_nr_migrate_break,
+ .cpus = cpus,
};
cpumask_copy(cpus, cpu_active_mask);
+ max_lb_iterations = cpumask_weight(env.dst_grpmask);
schedstat_inc(sd, lb_count[idle]);
redo:
- group = find_busiest_group(&env, cpus, balance);
+ group = find_busiest_group(&env, balance);
if (*balance == 0)
goto out_balanced;
@@ -4256,7 +4277,7 @@ redo:
goto out_balanced;
}
- busiest = find_busiest_queue(&env, group, cpus);
+ busiest = find_busiest_queue(&env, group);
if (!busiest) {
schedstat_inc(sd, lb_nobusyq[idle]);
goto out_balanced;
@@ -4267,6 +4288,7 @@ redo:
schedstat_add(sd, lb_imbalance[idle], env.imbalance);
ld_moved = 0;
+ lb_iterations = 1;
if (busiest->nr_running > 1) {
/*
* Attempt to move tasks. If find_busiest_group has found
@@ -4279,12 +4301,17 @@ redo:
env.src_rq = busiest;
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
+ update_h_load(env.src_cpu);
more_balance:
local_irq_save(flags);
double_rq_lock(this_rq, busiest);
- if (!env.loop)
- update_h_load(env.src_cpu);
- ld_moved += move_tasks(&env);
+
+ /*
+ * cur_ld_moved - load moved in current iteration
+ * ld_moved - cumulative load moved across iterations
+ */
+ cur_ld_moved = move_tasks(&env);
+ ld_moved += cur_ld_moved;
double_rq_unlock(this_rq, busiest);
local_irq_restore(flags);
@@ -4296,14 +4323,52 @@ more_balance:
/*
* some other cpu did the load balance for us.
*/
- if (ld_moved && this_cpu != smp_processor_id())
- resched_cpu(this_cpu);
+ if (cur_ld_moved && env.dst_cpu != smp_processor_id())
+ resched_cpu(env.dst_cpu);
+
+ /*
+ * Revisit (affine) tasks on src_cpu that couldn't be moved to
+ * us and move them to an alternate dst_cpu in our sched_group
+ * where they can run. The upper limit on how many times we
+ * iterate on same src_cpu is dependent on number of cpus in our
+ * sched_group.
+ *
+ * This changes load balance semantics a bit on who can move
+ * load to a given_cpu. In addition to the given_cpu itself
+ * (or a ilb_cpu acting on its behalf where given_cpu is
+ * nohz-idle), we now have balance_cpu in a position to move
+ * load to given_cpu. In rare situations, this may cause
+ * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
+ * _independently_ and at _same_ time to move some load to
+ * given_cpu) causing exceess load to be moved to given_cpu.
+ * This however should not happen so much in practice and
+ * moreover subsequent load balance cycles should correct the
+ * excess load moved.
+ */
+ if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
+ lb_iterations++ < max_lb_iterations) {
+
+ this_rq = cpu_rq(env.new_dst_cpu);
+ env.dst_rq = this_rq;
+ env.dst_cpu = env.new_dst_cpu;
+ env.flags &= ~LBF_SOME_PINNED;
+ env.loop = 0;
+ env.loop_break = sched_nr_migrate_break;
+ /*
+ * Go back to "more_balance" rather than "redo" since we
+ * need to continue with same src_cpu.
+ */
+ goto more_balance;
+ }
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(env.flags & LBF_ALL_PINNED)) {
cpumask_clear_cpu(cpu_of(busiest), cpus);
- if (!cpumask_empty(cpus))
+ if (!cpumask_empty(cpus)) {
+ env.loop = 0;
+ env.loop_break = sched_nr_migrate_break;
goto redo;
+ }
goto out_balanced;
}
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 573e1ca01102..944cb68420e9 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -788,6 +788,19 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
const struct cpumask *span;
span = sched_rt_period_mask();
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * FIXME: isolated CPUs should really leave the root task group,
+ * whether they are isolcpus or were isolated via cpusets, lest
+ * the timer run on a CPU which does not service all runqueues,
+ * potentially leaving other CPUs indefinitely throttled. If
+ * isolation is really required, the user will turn the throttle
+ * off to kill the perturbations it causes anyway. Meanwhile,
+ * this maintains functionality for boot and/or troubleshooting.
+ */
+ if (rt_b == &root_task_group.rt_bandwidth)
+ span = cpu_online_mask;
+#endif
for_each_cpu(i, span) {
int enqueue = 0;
struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 55844f24435a..f6714d009e77 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -80,7 +80,7 @@ extern struct mutex sched_domains_mutex;
struct cfs_rq;
struct rt_rq;
-static LIST_HEAD(task_groups);
+extern struct list_head task_groups;
struct cfs_bandwidth {
#ifdef CONFIG_CFS_BANDWIDTH
@@ -374,7 +374,11 @@ struct rq {
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
-#endif
+#ifdef CONFIG_SMP
+ unsigned long h_load_throttle;
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
#ifdef CONFIG_RT_GROUP_SCHED
struct list_head leaf_rt_rq_list;
#endif
@@ -538,22 +542,19 @@ extern int group_balance_cpu(struct sched_group *sg);
/*
* Return the group to which this tasks belongs.
*
- * We use task_subsys_state_check() and extend the RCU verification with
- * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
- * task it moves into the cgroup. Therefore by holding either of those locks,
- * we pin the task to the current cgroup.
+ * We cannot use task_subsys_state() and friends because the cgroup
+ * subsystem changes that value before the cgroup_subsys::attach() method
+ * is called, therefore we cannot pin it and might observe the wrong value.
+ *
+ * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
+ * core changes this before calling sched_move_task().
+ *
+ * Instead we use a 'copy' which is updated from sched_move_task() while
+ * holding both task_struct::pi_lock and rq::lock.
*/
static inline struct task_group *task_group(struct task_struct *p)
{
- struct task_group *tg;
- struct cgroup_subsys_state *css;
-
- css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
- lockdep_is_held(&p->pi_lock) ||
- lockdep_is_held(&task_rq(p)->lock));
- tg = container_of(css, struct task_group, css);
-
- return autogroup_task_group(p, tg);
+ return p->sched_task_group;
}
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 7b386e86fd23..da5eb5bed84a 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -27,8 +27,10 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
{
struct task_struct *stop = rq->stop;
- if (stop && stop->on_rq)
+ if (stop && stop->on_rq) {
+ stop->se.exec_start = rq->clock_task;
return stop;
+ }
return NULL;
}
@@ -52,6 +54,21 @@ static void yield_task_stop(struct rq *rq)
static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
{
+ struct task_struct *curr = rq->curr;
+ u64 delta_exec;
+
+ delta_exec = rq->clock_task - curr->se.exec_start;
+ if (unlikely((s64)delta_exec < 0))
+ delta_exec = 0;
+
+ schedstat_set(curr->se.statistics.exec_max,
+ max(curr->se.statistics.exec_max, delta_exec));
+
+ curr->se.sum_exec_runtime += delta_exec;
+ account_group_exec_runtime(curr, delta_exec);
+
+ curr->se.exec_start = rq->clock_task;
+ cpuacct_charge(curr, delta_exec);
}
static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
@@ -60,6 +77,9 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
static void set_curr_task_stop(struct rq *rq)
{
+ struct task_struct *stop = rq->stop;
+
+ stop->se.exec_start = rq->clock_task;
}
static void switched_to_stop(struct rq *rq, struct task_struct *p)