diff options
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/clock.c | 4 | ||||
-rw-r--r-- | kernel/sched/core.c | 66 | ||||
-rw-r--r-- | kernel/sched/cpudeadline.c | 6 | ||||
-rw-r--r-- | kernel/sched/cputime.c | 16 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 30 | ||||
-rw-r--r-- | kernel/sched/fair.c | 39 | ||||
-rw-r--r-- | kernel/sched/idle_task.c | 1 | ||||
-rw-r--r-- | kernel/sched/rt.c | 25 | ||||
-rw-r--r-- | kernel/sched/sched.h | 23 |
9 files changed, 125 insertions, 85 deletions
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 43c2bcc35761..b30a2924ef14 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -301,14 +301,14 @@ u64 sched_clock_cpu(int cpu) if (unlikely(!sched_clock_running)) return 0ull; - preempt_disable(); + preempt_disable_notrace(); scd = cpu_sdc(cpu); if (cpu != smp_processor_id()) clock = sched_clock_remote(scd); else clock = sched_clock_local(scd); - preempt_enable(); + preempt_enable_notrace(); return clock; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cde573d3f12e..9cae286824bb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -432,7 +432,7 @@ void hrtick_start(struct rq *rq, u64 delay) if (rq == this_rq()) { __hrtick_restart(rq); } else if (!rq->hrtick_csd_pending) { - __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); + smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); rq->hrtick_csd_pending = 1; } } @@ -555,12 +555,15 @@ void resched_cpu(int cpu) * selecting an idle cpu will add more delays to the timers than intended * (as that cpu's timer base may not be uptodate wrt jiffies etc). */ -int get_nohz_timer_target(void) +int get_nohz_timer_target(int pinned) { int cpu = smp_processor_id(); int i; struct sched_domain *sd; + if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu)) + return cpu; + rcu_read_lock(); for_each_domain(cpu, sd) { for_each_cpu(i, sched_domain_span(sd)) { @@ -823,19 +826,13 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING if (static_key_false((¶virt_steal_rq_enabled))) { - u64 st; - steal = paravirt_steal_clock(cpu_of(rq)); steal -= rq->prev_steal_time_rq; if (unlikely(steal > delta)) steal = delta; - st = steal_ticks(steal); - steal = st * TICK_NSEC; - rq->prev_steal_time_rq += steal; - delta -= steal; } #endif @@ -1954,7 +1951,7 @@ static int dl_overflow(struct task_struct *p, int policy, { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); - u64 period = attr->sched_period; + u64 period = attr->sched_period ?: attr->sched_deadline; u64 runtime = attr->sched_runtime; u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; int cpus, err = -1; @@ -2151,8 +2148,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) if (mm) mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { - task_numa_free(prev); - if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); @@ -3197,6 +3192,7 @@ static void __setscheduler_params(struct task_struct *p, * getparam()/getattr() don't report silly values for !rt tasks. */ p->rt_priority = attr->sched_priority; + p->normal_prio = normal_prio(p); set_load_weight(p); } @@ -3206,6 +3202,12 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, { __setscheduler_params(p, attr); + /* + * If we get here, there was no pi waiters boosting the + * task. It is safe to use the normal prio. + */ + p->prio = normal_prio(p); + if (dl_prio(p->prio)) p->sched_class = &dl_sched_class; else if (rt_prio(p->prio)) @@ -3264,7 +3266,8 @@ static int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user) { - int newprio = MAX_RT_PRIO - 1 - attr->sched_priority; + int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : + MAX_RT_PRIO - 1 - attr->sched_priority; int retval, oldprio, oldpolicy = -1, on_rq, running; int policy = attr->sched_policy; unsigned long flags; @@ -3328,6 +3331,15 @@ recheck: return -EPERM; } + /* + * Can't set/change SCHED_DEADLINE policy at all for now + * (safest behavior); in the future we would like to allow + * unprivileged DL tasks to increase their relative deadline + * or reduce their runtime (both ways reducing utilization) + */ + if (dl_policy(policy)) + return -EPERM; + /* * Treat SCHED_IDLE as nice 20. Only allow a switch to * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. @@ -3673,13 +3685,14 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) * @pid: the pid in question. * @uattr: structure containing the extended parameters. */ -SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr) +SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, flags) { struct sched_attr attr; struct task_struct *p; int retval; - if (!uattr || pid < 0) + if (!uattr || pid < 0 || flags) return -EINVAL; if (sched_copy_attr(uattr, &attr)) @@ -3798,7 +3811,7 @@ static int sched_read_attr(struct sched_attr __user *uattr, attr->size = usize; } - ret = copy_to_user(uattr, attr, usize); + ret = copy_to_user(uattr, attr, attr->size); if (ret) return -EFAULT; @@ -3816,8 +3829,8 @@ err_size: * @uattr: structure containing the extended parameters. * @size: sizeof(attr) for fwd/bwd comp. */ -SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, - unsigned int, size) +SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, size, unsigned int, flags) { struct sched_attr attr = { .size = sizeof(struct sched_attr), @@ -3826,7 +3839,7 @@ SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, int retval; if (!uattr || pid < 0 || size > PAGE_SIZE || - size < SCHED_ATTR_SIZE_VER0) + size < SCHED_ATTR_SIZE_VER0 || flags) return -EINVAL; rcu_read_lock(); @@ -4705,8 +4718,10 @@ void idle_task_exit(void) BUG_ON(cpu_online(smp_processor_id())); - if (mm != &init_mm) + if (mm != &init_mm) { switch_mm(mm, &init_mm, current); + finish_arch_post_lock_switch(); + } mmdrop(mm); } @@ -7461,6 +7476,7 @@ static int sched_dl_global_constraints(void) u64 period = global_rt_period(); u64 new_bw = to_ratio(period, runtime); int cpu, ret = 0; + unsigned long flags; /* * Here we want to check the bandwidth not being set to some @@ -7474,10 +7490,10 @@ static int sched_dl_global_constraints(void) for_each_possible_cpu(cpu) { struct dl_bw *dl_b = dl_bw_of(cpu); - raw_spin_lock(&dl_b->lock); + raw_spin_lock_irqsave(&dl_b->lock, flags); if (new_bw < dl_b->total_bw) ret = -EBUSY; - raw_spin_unlock(&dl_b->lock); + raw_spin_unlock_irqrestore(&dl_b->lock, flags); if (ret) break; @@ -7490,6 +7506,7 @@ static void sched_dl_do_global(void) { u64 new_bw = -1; int cpu; + unsigned long flags; def_dl_bandwidth.dl_period = global_rt_period(); def_dl_bandwidth.dl_runtime = global_rt_runtime(); @@ -7503,9 +7520,9 @@ static void sched_dl_do_global(void) for_each_possible_cpu(cpu) { struct dl_bw *dl_b = dl_bw_of(cpu); - raw_spin_lock(&dl_b->lock); + raw_spin_lock_irqsave(&dl_b->lock, flags); dl_b->bw = new_bw; - raw_spin_unlock(&dl_b->lock); + raw_spin_unlock_irqrestore(&dl_b->lock, flags); } } @@ -7514,7 +7531,8 @@ static int sched_rt_global_validate(void) if (sysctl_sched_rt_period <= 0) return -EINVAL; - if (sysctl_sched_rt_runtime > sysctl_sched_rt_period) + if ((sysctl_sched_rt_runtime != RUNTIME_INF) && + (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) return -EINVAL; return 0; diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 045fc74e3f09..5b9bb42b2d47 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -70,7 +70,7 @@ static void cpudl_heapify(struct cpudl *cp, int idx) static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) { - WARN_ON(idx > num_present_cpus() || idx == IDX_INVALID); + WARN_ON(idx == IDX_INVALID || !cpu_present(idx)); if (dl_time_before(new_dl, cp->elements[idx].dl)) { cp->elements[idx].dl = new_dl; @@ -117,7 +117,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, } out: - WARN_ON(best_cpu > num_present_cpus() && best_cpu != -1); + WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); return best_cpu; } @@ -137,7 +137,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) int old_idx, new_cpu; unsigned long flags; - WARN_ON(cpu > num_present_cpus()); + WARN_ON(!cpu_present(cpu)); raw_spin_lock_irqsave(&cp->lock, flags); old_idx = cp->cpu_to_idx[cpu]; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 58624a65f124..a95097cb4591 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -258,16 +258,22 @@ static __always_inline bool steal_account_process_tick(void) { #ifdef CONFIG_PARAVIRT if (static_key_false(¶virt_steal_enabled)) { - u64 steal, st = 0; + u64 steal; + cputime_t steal_ct; steal = paravirt_steal_clock(smp_processor_id()); steal -= this_rq()->prev_steal_time; - st = steal_ticks(steal); - this_rq()->prev_steal_time += st * TICK_NSEC; + /* + * cputime_t may be less precise than nsecs (eg: if it's + * based on jiffies). Lets cast the result to cputime + * granularity and account the rest on the next rounds. + */ + steal_ct = nsecs_to_cputime(steal); + this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct); - account_steal_time(st); - return st; + account_steal_time(steal_ct); + return steal_ct; } #endif return false; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 3185b775dbf7..27ef40925525 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -121,7 +121,7 @@ static inline void dl_clear_overload(struct rq *rq) static void update_dl_migration(struct dl_rq *dl_rq) { - if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_total > 1) { + if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) { if (!dl_rq->overloaded) { dl_set_overload(rq_of_dl_rq(dl_rq)); dl_rq->overloaded = 1; @@ -135,9 +135,7 @@ static void update_dl_migration(struct dl_rq *dl_rq) static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - dl_rq = &rq_of_dl_rq(dl_rq)->dl; - dl_rq->dl_nr_total++; if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory++; @@ -147,9 +145,7 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - dl_rq = &rq_of_dl_rq(dl_rq)->dl; - dl_rq->dl_nr_total--; if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory--; @@ -589,6 +585,8 @@ int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) return 1; } +extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); + /* * Update the current task's runtime statistics (provided it is still * a -deadline task and has not been removed from the dl_rq). @@ -611,8 +609,8 @@ static void update_curr_dl(struct rq *rq) * approach need further study. */ delta_exec = rq_clock_task(rq) - curr->se.exec_start; - if (unlikely((s64)delta_exec < 0)) - delta_exec = 0; + if (unlikely((s64)delta_exec <= 0)) + return; schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -652,11 +650,13 @@ static void update_curr_dl(struct rq *rq) struct rt_rq *rt_rq = &rq->rt; raw_spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_time += delta_exec; /* * We'll let actual RT tasks worry about the overflow here, we - * have our own CBS to keep us inline -- see above. + * have our own CBS to keep us inline; only account when RT + * bandwidth is relevant. */ + if (sched_rt_bandwidth_account(rt_rq)) + rt_rq->rt_time += delta_exec; raw_spin_unlock(&rt_rq->rt_runtime_lock); } } @@ -740,6 +740,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); dl_rq->dl_nr_running++; + inc_nr_running(rq_of_dl_rq(dl_rq)); inc_dl_deadline(dl_rq, deadline); inc_dl_migration(dl_se, dl_rq); @@ -753,6 +754,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; + dec_nr_running(rq_of_dl_rq(dl_rq)); dec_dl_deadline(dl_rq, dl_se->deadline); dec_dl_migration(dl_se, dl_rq); @@ -859,8 +861,6 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); - - inc_nr_running(rq); } static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) @@ -873,8 +873,6 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { update_curr_dl(rq); __dequeue_task_dl(rq, p, flags); - - dec_nr_running(rq); } /* @@ -1025,6 +1023,12 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) if (need_pull_dl_task(rq, prev)) pull_dl_task(rq); + /* + * When prev is DL, we may throttle it in put_prev_task(). + * So, we update time before we check for dl_nr_running. + */ + if (prev->sched_class == &dl_sched_class) + update_curr_dl(rq); if (unlikely(!dl_rq->dl_nr_running)) return NULL; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 16042b58a32f..7e9bd0b1fa9e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1914,6 +1914,8 @@ void task_numa_work(struct callback_head *work) start = end; if (pages <= 0) goto out; + + cond_resched(); } while (end != vma->vm_end); } @@ -4785,17 +4787,16 @@ simple: return p; idle: + new_tasks = idle_balance(rq); /* * Because idle_balance() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we * must re-start the pick_next_entity() loop. */ - new_tasks = idle_balance(rq); - - if (rq->nr_running != rq->cfs.h_nr_running) + if (new_tasks < 0) return RETRY_TASK; - if (new_tasks) + if (new_tasks > 0) goto again; return NULL; @@ -5036,7 +5037,7 @@ static void move_task(struct task_struct *p, struct lb_env *env) * Is this task likely cache-hot: */ static int -task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) +task_hot(struct task_struct *p, u64 now) { s64 delta; @@ -5197,7 +5198,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) task is cache cold, or * 3) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); + tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq)); if (!tsk_cache_hot) tsk_cache_hot = migrate_degrades_locality(p, env); @@ -6060,12 +6061,10 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) pwr_now /= SCHED_POWER_SCALE; /* Amount of load we'd subtract */ - tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / - busiest->group_power; - if (busiest->avg_load > tmp) { + if (busiest->avg_load > scaled_busy_load_per_task) { pwr_move += busiest->group_power * min(busiest->load_per_task, - busiest->avg_load - tmp); + busiest->avg_load - scaled_busy_load_per_task); } /* Amount of load we'd add */ @@ -6709,7 +6708,7 @@ static int idle_balance(struct rq *this_rq) * While browsing the domains, we released the rq lock. * A task could have be enqueued in the meantime */ - if (this_rq->nr_running && !pulled_task) { + if (this_rq->cfs.h_nr_running && !pulled_task) { pulled_task = 1; goto out; } @@ -6726,8 +6725,16 @@ static int idle_balance(struct rq *this_rq) this_rq->max_idle_balance_cost = curr_cost; out: - if (pulled_task) + /* Is there a task of a high priority class? */ + if (this_rq->nr_running != this_rq->cfs.h_nr_running && + (this_rq->dl.dl_nr_running || + (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt)))) + pulled_task = -1; + + if (pulled_task) { + idle_exit_fair(this_rq); this_rq->idle_stamp = 0; + } return pulled_task; } @@ -7316,15 +7323,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) struct cfs_rq *cfs_rq = cfs_rq_of(se); /* - * Ensure the task's vruntime is normalized, so that when its + * Ensure the task's vruntime is normalized, so that when it's * switched back to the fair class the enqueue_entity(.flags=0) will * do the right thing. * - * If it was on_rq, then the dequeue_entity(.flags=0) will already - * have normalized the vruntime, if it was !on_rq, then only when + * If it's on_rq, then the dequeue_entity(.flags=0) will already + * have normalized the vruntime, if it's !on_rq, then only when * the task is sleeping will it still have non-normalized vruntime. */ - if (!se->on_rq && p->state != TASK_RUNNING) { + if (!p->on_rq && p->state != TASK_RUNNING) { /* * Fix up our vruntime so that the current sleep doesn't * cause 'unlimited' sleep bonus. diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 1f3725882838..879f2b75266a 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -29,7 +29,6 @@ pick_next_task_idle(struct rq *rq, struct task_struct *prev) put_prev_task(rq, prev); schedstat_inc(rq, sched_goidle); - idle_enter_fair(rq); return rq->idle; } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 398b3f990823..d8cdf1618551 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -470,11 +470,6 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) dequeue_rt_entity(rt_se); } -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ - return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; -} - static int rt_se_boosted(struct sched_rt_entity *rt_se) { struct rt_rq *rt_rq = group_rt_rq(rt_se); @@ -545,11 +540,6 @@ static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) { } -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ - return rt_rq->rt_throttled; -} - static inline const struct cpumask *sched_rt_period_mask(void) { return cpu_online_mask; @@ -568,6 +558,14 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) #endif /* CONFIG_RT_GROUP_SCHED */ +bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) +{ + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + + return (hrtimer_active(&rt_b->rt_period_timer) || + rt_rq->rt_time < rt_b->rt_runtime); +} + #ifdef CONFIG_SMP /* * We ran out of runtime, see if we can borrow some from our neighbours. @@ -1371,6 +1369,13 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) return RETRY_TASK; } + /* + * We may dequeue prev's rt_rq in put_prev_task(). + * So, we update time before rt_nr_running check. + */ + if (prev->sched_class == &rt_sched_class) + update_curr_rt(rq); + if (!rt_rq->rt_nr_running) return NULL; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1929deb3f29d..c9007f28d3a2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -423,6 +423,18 @@ struct rt_rq { #endif }; +#ifdef CONFIG_RT_GROUP_SCHED +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; +} +#else +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq->rt_throttled; +} +#endif + /* Deadline class' related fields in a runqueue */ struct dl_rq { /* runqueue is an rbtree, ordered by deadline */ @@ -444,7 +456,6 @@ struct dl_rq { } earliest_dl; unsigned long dl_nr_migratory; - unsigned long dl_nr_total; int overloaded; /* @@ -1205,16 +1216,6 @@ extern void update_idle_cpu_load(struct rq *this_rq); extern void init_task_runnable_average(struct task_struct *p); -#ifdef CONFIG_PARAVIRT -static inline u64 steal_ticks(u64 steal) -{ - if (unlikely(steal > NSEC_PER_SEC)) - return div_u64(steal, TICK_NSEC); - - return __iter_div_u64_rem(steal, TICK_NSEC, &steal); -} -#endif - static inline void inc_nr_running(struct rq *rq) { rq->nr_running++; |