diff options
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/core.c | 130 | ||||
-rw-r--r-- | kernel/sched/cpufreq.c | 2 | ||||
-rw-r--r-- | kernel/sched/cpufreq_schedutil.c | 72 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 3 | ||||
-rw-r--r-- | kernel/sched/debug.c | 2 | ||||
-rw-r--r-- | kernel/sched/fair.c | 144 | ||||
-rw-r--r-- | kernel/sched/isolation.c | 18 | ||||
-rw-r--r-- | kernel/sched/rt.c | 5 | ||||
-rw-r--r-- | kernel/sched/sched.h | 18 | ||||
-rw-r--r-- | kernel/sched/topology.c | 31 |
10 files changed, 230 insertions, 195 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ead464a0f2e5..102dfcf0a29a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -792,10 +792,14 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) rq->nr_uninterruptible--; enqueue_task(rq, p, flags); + + p->on_rq = TASK_ON_RQ_QUEUED; } void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { + p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING; + if (task_contributes_to_load(p)) rq->nr_uninterruptible++; @@ -920,7 +924,7 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) } /* - * Per-CPU kthreads are allowed to run on !actie && online CPUs, see + * Per-CPU kthreads are allowed to run on !active && online CPUs, see * __set_cpus_allowed_ptr() and select_fallback_rq(). */ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) @@ -1151,7 +1155,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, p, &rf); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); - tlb_migrate_finish(p->mm); return 0; } else if (task_on_rq_queued(p)) { /* @@ -1237,11 +1240,9 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) rq_pin_lock(src_rq, &srf); rq_pin_lock(dst_rq, &drf); - p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src_rq, p, 0); set_task_cpu(p, cpu); activate_task(dst_rq, p, 0); - p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(dst_rq, p, 0); rq_unpin_lock(dst_rq, &drf); @@ -1681,16 +1682,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) __schedstat_inc(p->se.statistics.nr_wakeups_sync); } -static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) -{ - activate_task(rq, p, en_flags); - p->on_rq = TASK_ON_RQ_QUEUED; - - /* If a worker is waking up, notify the workqueue: */ - if (p->flags & PF_WQ_WORKER) - wq_worker_waking_up(p, cpu_of(rq)); -} - /* * Mark the task runnable and perform wakeup-preemption. */ @@ -1742,7 +1733,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, en_flags |= ENQUEUE_MIGRATED; #endif - ttwu_activate(rq, p, en_flags); + activate_task(rq, p, en_flags); ttwu_do_wakeup(rq, p, wake_flags, rf); } @@ -2107,56 +2098,6 @@ out: } /** - * try_to_wake_up_local - try to wake up a local task with rq lock held - * @p: the thread to be awakened - * @rf: request-queue flags for pinning - * - * Put @p on the run-queue if it's not already there. The caller must - * ensure that this_rq() is locked, @p is bound to this_rq() and not - * the current task. - */ -static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) -{ - struct rq *rq = task_rq(p); - - if (WARN_ON_ONCE(rq != this_rq()) || - WARN_ON_ONCE(p == current)) - return; - - lockdep_assert_held(&rq->lock); - - if (!raw_spin_trylock(&p->pi_lock)) { - /* - * This is OK, because current is on_cpu, which avoids it being - * picked for load-balance and preemption/IRQs are still - * disabled avoiding further scheduler activity on it and we've - * not yet picked a replacement task. - */ - rq_unlock(rq, rf); - raw_spin_lock(&p->pi_lock); - rq_relock(rq, rf); - } - - if (!(p->state & TASK_NORMAL)) - goto out; - - trace_sched_waking(p); - - if (!task_on_rq_queued(p)) { - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&rq->nr_iowait); - } - ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); - } - - ttwu_do_wakeup(rq, p, 0, rf); - ttwu_stat(p, smp_processor_id(), 0); -out: - raw_spin_unlock(&p->pi_lock); -} - -/** * wake_up_process - Wake up a specific process * @p: The process to be woken up. * @@ -2467,7 +2408,6 @@ void wake_up_new_task(struct task_struct *p) post_init_entity_util_avg(p); activate_task(rq, p, ENQUEUE_NOCLOCK); - p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP @@ -3466,25 +3406,11 @@ static void __sched notrace __schedule(bool preempt) prev->state = TASK_RUNNING; } else { deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); - prev->on_rq = 0; if (prev->in_iowait) { atomic_inc(&rq->nr_iowait); delayacct_blkio_start(); } - - /* - * If a worker went to sleep, notify and ask workqueue - * whether it wants to wake up a task to maintain - * concurrency. - */ - if (prev->flags & PF_WQ_WORKER) { - struct task_struct *to_wakeup; - - to_wakeup = wq_worker_sleeping(prev); - if (to_wakeup) - try_to_wake_up_local(to_wakeup, &rf); - } } switch_count = &prev->nvcsw; } @@ -3544,6 +3470,20 @@ static inline void sched_submit_work(struct task_struct *tsk) { if (!tsk->state || tsk_is_pi_blocked(tsk)) return; + + /* + * If a worker went to sleep, notify and ask workqueue whether + * it wants to wake up a task to maintain concurrency. + * As this function is called inside the schedule() context, + * we disable preemption to avoid it calling schedule() again + * in the possible wakeup of a kworker. + */ + if (tsk->flags & PF_WQ_WORKER) { + preempt_disable(); + wq_worker_sleeping(tsk); + preempt_enable_no_resched(); + } + /* * If we are going to sleep and we have plugged IO queued, * make sure to submit it to avoid deadlocks. @@ -3552,6 +3492,12 @@ static inline void sched_submit_work(struct task_struct *tsk) blk_schedule_flush_plug(tsk); } +static void sched_update_worker(struct task_struct *tsk) +{ + if (tsk->flags & PF_WQ_WORKER) + wq_worker_running(tsk); +} + asmlinkage __visible void __sched schedule(void) { struct task_struct *tsk = current; @@ -3562,6 +3508,7 @@ asmlinkage __visible void __sched schedule(void) __schedule(false); sched_preempt_enable_no_resched(); } while (need_resched()); + sched_update_worker(tsk); } EXPORT_SYMBOL(schedule); @@ -5918,7 +5865,7 @@ void __init sched_init_smp(void) static int __init migration_init(void) { - sched_rq_cpu_starting(smp_processor_id()); + sched_cpu_starting(smp_processor_id()); return 0; } early_initcall(migration_init); @@ -6559,6 +6506,8 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) static int cpu_shares_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 shareval) { + if (shareval > scale_load_down(ULONG_MAX)) + shareval = MAX_SHARES; return sched_group_set_shares(css_tg(css), scale_load(shareval)); } @@ -6574,7 +6523,7 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, static DEFINE_MUTEX(cfs_constraints_mutex); const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ -const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ +static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); @@ -6654,20 +6603,22 @@ out_unlock: return ret; } -int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) +static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) { u64 quota, period; period = ktime_to_ns(tg->cfs_bandwidth.period); if (cfs_quota_us < 0) quota = RUNTIME_INF; - else + else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC) quota = (u64)cfs_quota_us * NSEC_PER_USEC; + else + return -EINVAL; return tg_set_cfs_bandwidth(tg, period, quota); } -long tg_get_cfs_quota(struct task_group *tg) +static long tg_get_cfs_quota(struct task_group *tg) { u64 quota_us; @@ -6680,17 +6631,20 @@ long tg_get_cfs_quota(struct task_group *tg) return quota_us; } -int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) +static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) { u64 quota, period; + if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC) + return -EINVAL; + period = (u64)cfs_period_us * NSEC_PER_USEC; quota = tg->cfs_bandwidth.quota; return tg_set_cfs_bandwidth(tg, period, quota); } -long tg_get_cfs_period(struct task_group *tg) +static long tg_get_cfs_period(struct task_group *tg) { u64 cfs_period_us; @@ -6998,7 +6952,7 @@ static int __maybe_unused cpu_period_quota_parse(char *buf, { char tok[21]; /* U64_MAX */ - if (!sscanf(buf, "%s %llu", tok, periodp)) + if (sscanf(buf, "%20s %llu", tok, periodp) < 1) return -EINVAL; *periodp *= NSEC_PER_USEC; diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 835671f0f917..b5dcd1d83c7f 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -7,7 +7,7 @@ */ #include "sched.h" -DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); +DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); /** * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer. diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 2efe629425be..962cf343f798 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -13,6 +13,8 @@ #include <linux/sched/cpufreq.h> #include <trace/events/power.h> +#define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8) + struct sugov_tunables { struct gov_attr_set attr_set; unsigned int rate_limit_us; @@ -48,7 +50,6 @@ struct sugov_cpu { bool iowait_boost_pending; unsigned int iowait_boost; - unsigned int iowait_boost_max; u64 last_update; unsigned long bw_dl; @@ -291,8 +292,8 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) * * The IO wait boost of a task is disabled after a tick since the last update * of a CPU. If a new IO wait boost is requested after more then a tick, then - * we enable the boost starting from the minimum frequency, which improves - * energy efficiency by ignoring sporadic wakeups from IO. + * we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy + * efficiency by ignoring sporadic wakeups from IO. */ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, bool set_iowait_boost) @@ -303,8 +304,7 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, if (delta_ns <= TICK_NSEC) return false; - sg_cpu->iowait_boost = set_iowait_boost - ? sg_cpu->sg_policy->policy->min : 0; + sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0; sg_cpu->iowait_boost_pending = set_iowait_boost; return true; @@ -318,8 +318,9 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, * * Each time a task wakes up after an IO operation, the CPU utilization can be * boosted to a certain utilization which doubles at each "frequent and - * successive" wakeup from IO, ranging from the utilization of the minimum - * OPP to the utilization of the maximum OPP. + * successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization + * of the maximum OPP. + * * To keep doubling, an IO boost has to be requested at least once per tick, * otherwise we restart from the utilization of the minimum OPP. */ @@ -344,14 +345,13 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, /* Double the boost at each request */ if (sg_cpu->iowait_boost) { - sg_cpu->iowait_boost <<= 1; - if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max) - sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; + sg_cpu->iowait_boost = + min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE); return; } /* First wakeup after IO: start with minimum boost */ - sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min; + sg_cpu->iowait_boost = IOWAIT_BOOST_MIN; } /** @@ -373,47 +373,38 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, * This mechanism is designed to boost high frequently IO waiting tasks, while * being more conservative on tasks which does sporadic IO operations. */ -static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, - unsigned long *util, unsigned long *max) +static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, + unsigned long util, unsigned long max) { - unsigned int boost_util, boost_max; + unsigned long boost; /* No boost currently required */ if (!sg_cpu->iowait_boost) - return; + return util; /* Reset boost if the CPU appears to have been idle enough */ if (sugov_iowait_reset(sg_cpu, time, false)) - return; + return util; - /* - * An IO waiting task has just woken up: - * allow to further double the boost value - */ - if (sg_cpu->iowait_boost_pending) { - sg_cpu->iowait_boost_pending = false; - } else { + if (!sg_cpu->iowait_boost_pending) { /* - * Otherwise: reduce the boost value and disable it when we - * reach the minimum. + * No boost pending; reduce the boost value. */ sg_cpu->iowait_boost >>= 1; - if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) { + if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) { sg_cpu->iowait_boost = 0; - return; + return util; } } + sg_cpu->iowait_boost_pending = false; + /* - * Apply the current boost value: a CPU is boosted only if its current - * utilization is smaller then the current IO boost level. + * @util is already in capacity scale; convert iowait_boost + * into the same scale so we can compare. */ - boost_util = sg_cpu->iowait_boost; - boost_max = sg_cpu->iowait_boost_max; - if (*util * boost_max < *max * boost_util) { - *util = boost_util; - *max = boost_max; - } + boost = (sg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT; + return max(boost, util); } #ifdef CONFIG_NO_HZ_COMMON @@ -460,7 +451,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, util = sugov_get_util(sg_cpu); max = sg_cpu->max; - sugov_iowait_apply(sg_cpu, time, &util, &max); + util = sugov_iowait_apply(sg_cpu, time, util, max); next_f = get_next_freq(sg_policy, util, max); /* * Do not reduce the frequency if the CPU has not been idle @@ -500,7 +491,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) j_util = sugov_get_util(j_sg_cpu); j_max = j_sg_cpu->max; - sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max); + j_util = sugov_iowait_apply(j_sg_cpu, time, j_util, j_max); if (j_util * max > j_max * util) { util = j_util; @@ -609,13 +600,14 @@ rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us); -static struct attribute *sugov_attributes[] = { +static struct attribute *sugov_attrs[] = { &rate_limit_us.attr, NULL }; +ATTRIBUTE_GROUPS(sugov); static struct kobj_type sugov_tunables_ktype = { - .default_attrs = sugov_attributes, + .default_groups = sugov_groups, .sysfs_ops = &governor_sysfs_ops, }; @@ -782,6 +774,7 @@ out: return 0; fail: + kobject_put(&tunables->attr_set.kobj); policy->governor_data = NULL; sugov_tunables_free(tunables); @@ -837,7 +830,6 @@ static int sugov_start(struct cpufreq_policy *policy) memset(sg_cpu, 0, sizeof(*sg_cpu)); sg_cpu->cpu = cpu; sg_cpu->sg_policy = sg_policy; - sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; } for_each_cpu(cpu, policy->cpus) { diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6a73e41a2016..43901fa3f269 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -252,7 +252,6 @@ static void task_non_contending(struct task_struct *p) if (dl_entity_is_special(dl_se)) return; - WARN_ON(hrtimer_active(&dl_se->inactive_timer)); WARN_ON(dl_se->dl_non_contending); zerolag_time = dl_se->deadline - @@ -269,7 +268,7 @@ static void task_non_contending(struct task_struct *p) * If the "0-lag time" already passed, decrease the active * utilization now, instead of starting a timer */ - if (zerolag_time < 0) { + if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) { if (dl_task(p)) sub_running_bw(dl_se, dl_rq); if (!dl_task(p) || p->state == TASK_DEAD) { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 8039d62ae36e..678bfb9bd87f 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -702,7 +702,7 @@ do { \ static const char *sched_tunable_scaling_names[] = { "none", - "logaritmic", + "logarithmic", "linear" }; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ea74d43924b2..f35930f5e528 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2007,6 +2007,10 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) if (p->last_task_numa_placement) { delta = runtime - p->last_sum_exec_runtime; *period = now - p->last_task_numa_placement; + + /* Avoid time going backwards, prevent potential divide error: */ + if (unlikely((s64)*period < 0)) + *period = 0; } else { delta = p->se.avg.load_sum; *period = LOAD_AVG_MAX; @@ -2593,7 +2597,7 @@ out: /* * Drive the periodic memory faults.. */ -void task_tick_numa(struct rq *rq, struct task_struct *curr) +static void task_tick_numa(struct rq *rq, struct task_struct *curr) { struct callback_head *work = &curr->numa_work; u64 period, now; @@ -3567,7 +3571,7 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) * Synchronize entity load avg of dequeued entity without locking * the previous rq. */ -void sync_entity_load_avg(struct sched_entity *se) +static void sync_entity_load_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 last_update_time; @@ -3580,7 +3584,7 @@ void sync_entity_load_avg(struct sched_entity *se) * Task first catches up with cfs_rq, and then subtract * itself from the cfs_rq (task must be off the queue now). */ -void remove_entity_load_avg(struct sched_entity *se) +static void remove_entity_load_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); unsigned long flags; @@ -4885,6 +4889,8 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) return HRTIMER_NORESTART; } +extern const u64 max_cfs_quota_period; + static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = @@ -4892,6 +4898,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) unsigned long flags; int overrun; int idle = 0; + int count = 0; raw_spin_lock_irqsave(&cfs_b->lock, flags); for (;;) { @@ -4899,6 +4906,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) if (!overrun) break; + if (++count > 3) { + u64 new, old = ktime_to_ns(cfs_b->period); + + new = (old * 147) / 128; /* ~115% */ + new = min(new, max_cfs_quota_period); + + cfs_b->period = ns_to_ktime(new); + + /* since max is 1s, this is limited to 1e9^2, which fits in u64 */ + cfs_b->quota *= new; + cfs_b->quota = div64_u64(cfs_b->quota, old); + + pr_warn_ratelimited( + "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n", + smp_processor_id(), + div_u64(new, NSEC_PER_USEC), + div_u64(cfs_b->quota, NSEC_PER_USEC)); + + /* reset count so we don't come right back in here */ + count = 0; + } + idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); } if (idle) @@ -5116,7 +5145,6 @@ static inline void hrtick_update(struct rq *rq) #ifdef CONFIG_SMP static inline unsigned long cpu_util(int cpu); -static unsigned long capacity_of(int cpu); static inline bool cpu_overutilized(int cpu) { @@ -7492,7 +7520,6 @@ static void detach_task(struct task_struct *p, struct lb_env *env) { lockdep_assert_held(&env->src_rq->lock); - p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, env->dst_cpu); } @@ -7628,7 +7655,6 @@ static void attach_task(struct rq *rq, struct task_struct *p) BUG_ON(task_rq(p) != rq); activate_task(rq, p, ENQUEUE_NOCLOCK); - p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); } @@ -7784,10 +7810,10 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) if (cfs_rq->last_h_load_update == now) return; - cfs_rq->h_load_next = NULL; + WRITE_ONCE(cfs_rq->h_load_next, NULL); for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - cfs_rq->h_load_next = se; + WRITE_ONCE(cfs_rq->h_load_next, se); if (cfs_rq->last_h_load_update == now) break; } @@ -7797,7 +7823,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) cfs_rq->last_h_load_update = now; } - while ((se = cfs_rq->h_load_next) != NULL) { + while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) { load = cfs_rq->h_load; load = div64_ul(load * se->avg.load_avg, cfs_rq_load_avg(cfs_rq) + 1); @@ -8060,6 +8086,18 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) } /* + * Check whether a rq has a misfit task and if it looks like we can actually + * help that task: we can migrate the task to a CPU of higher capacity, or + * the task's current CPU is heavily pressured. + */ +static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) +{ + return rq->misfit_task_load && + (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity || + check_cpu_capacity(rq, sd)); +} + +/* * Group imbalance indicates (and tries to solve) the problem where balancing * groups is inadequate due to ->cpus_allowed constraints. * @@ -9510,22 +9548,26 @@ static inline int on_null_domain(struct rq *rq) * - When one of the busy CPUs notice that there may be an idle rebalancing * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. + * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set + * anywhere yet. */ static inline int find_new_ilb(void) { - int ilb = cpumask_first(nohz.idle_cpus_mask); + int ilb; - if (ilb < nr_cpu_ids && idle_cpu(ilb)) - return ilb; + for_each_cpu_and(ilb, nohz.idle_cpus_mask, + housekeeping_cpumask(HK_FLAG_MISC)) { + if (idle_cpu(ilb)) + return ilb; + } return nr_cpu_ids; } /* - * Kick a CPU to do the nohz balancing, if it is time for it. We pick the - * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle - * CPU (if there is one). + * Kick a CPU to do the nohz balancing, if it is time for it. We pick any + * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one). */ static void kick_ilb(unsigned int flags) { @@ -9586,35 +9628,21 @@ static void nohz_balancer_kick(struct rq *rq) if (time_before(now, nohz.next_balance)) goto out; - if (rq->nr_running >= 2 || rq->misfit_task_load) { + if (rq->nr_running >= 2) { flags = NOHZ_KICK_MASK; goto out; } rcu_read_lock(); - sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); - if (sds) { - /* - * If there is an imbalance between LLC domains (IOW we could - * increase the overall cache use), we need some less-loaded LLC - * domain to pull some load. Likewise, we may need to spread - * load within the current LLC domain (e.g. packed SMT cores but - * other CPUs are idle). We can't really know from here how busy - * the others are - so just get a nohz balance going if it looks - * like this LLC domain has tasks we could move. - */ - nr_busy = atomic_read(&sds->nr_busy_cpus); - if (nr_busy > 1) { - flags = NOHZ_KICK_MASK; - goto unlock; - } - - } sd = rcu_dereference(rq->sd); if (sd) { - if ((rq->cfs.h_nr_running >= 1) && - check_cpu_capacity(rq, sd)) { + /* + * If there's a CFS task and the current CPU has reduced + * capacity; kick the ILB to see if there's a better CPU to run + * on. + */ + if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) { flags = NOHZ_KICK_MASK; goto unlock; } @@ -9622,6 +9650,11 @@ static void nohz_balancer_kick(struct rq *rq) sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); if (sd) { + /* + * When ASYM_PACKING; see if there's a more preferred CPU + * currently idle; in which case, kick the ILB to move tasks + * around. + */ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { if (sched_asym_prefer(i, cpu)) { flags = NOHZ_KICK_MASK; @@ -9629,6 +9662,45 @@ static void nohz_balancer_kick(struct rq *rq) } } } + + sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu)); + if (sd) { + /* + * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU + * to run the misfit task on. + */ + if (check_misfit_status(rq, sd)) { + flags = NOHZ_KICK_MASK; + goto unlock; + } + + /* + * For asymmetric systems, we do not want to nicely balance + * cache use, instead we want to embrace asymmetry and only + * ensure tasks have enough CPU capacity. + * + * Skip the LLC logic because it's not relevant in that case. + */ + goto unlock; + } + + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + if (sds) { + /* + * If there is an imbalance between LLC domains (IOW we could + * increase the overall cache use), we need some less-loaded LLC + * domain to pull some load. Likewise, we may need to spread + * load within the current LLC domain (e.g. packed SMT cores but + * other CPUs are idle). We can't really know from here how busy + * the others are - so just get a nohz balance going if it looks + * like this LLC domain has tasks we could move. + */ + nr_busy = atomic_read(&sds->nr_busy_cpus); + if (nr_busy > 1) { + flags = NOHZ_KICK_MASK; + goto unlock; + } + } unlock: rcu_read_unlock(); out: diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index b02d148e7672..687302051a27 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -65,6 +65,7 @@ void __init housekeeping_init(void) static int __init housekeeping_setup(char *str, enum hk_flags flags) { cpumask_var_t non_housekeeping_mask; + cpumask_var_t tmp; int err; alloc_bootmem_cpumask_var(&non_housekeeping_mask); @@ -75,16 +76,23 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags) return 0; } + alloc_bootmem_cpumask_var(&tmp); if (!housekeeping_flags) { alloc_bootmem_cpumask_var(&housekeeping_mask); cpumask_andnot(housekeeping_mask, cpu_possible_mask, non_housekeeping_mask); - if (cpumask_empty(housekeeping_mask)) + + cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask); + if (cpumask_empty(tmp)) { + pr_warn("Housekeeping: must include one present CPU, " + "using boot CPU:%d\n", smp_processor_id()); __cpumask_set_cpu(smp_processor_id(), housekeeping_mask); + __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); + } } else { - cpumask_var_t tmp; - - alloc_bootmem_cpumask_var(&tmp); + cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask); + if (cpumask_empty(tmp)) + __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask); if (!cpumask_equal(tmp, housekeeping_mask)) { pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); @@ -92,8 +100,8 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags) free_bootmem_cpumask_var(non_housekeeping_mask); return 0; } - free_bootmem_cpumask_var(tmp); } + free_bootmem_cpumask_var(tmp); if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) { if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 90fa23d36565..1e6b909dca36 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2555,6 +2555,8 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; if (rt_runtime_us < 0) rt_runtime = RUNTIME_INF; + else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC) + return -EINVAL; return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } @@ -2575,6 +2577,9 @@ int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us) { u64 rt_runtime, rt_period; + if (rt_period_us > U64_MAX / NSEC_PER_USEC) + return -EINVAL; + rt_period = rt_period_us * NSEC_PER_USEC; rt_runtime = tg->rt_bandwidth.rt_runtime; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index efa686eeff26..b52ed1ada0be 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -780,7 +780,7 @@ struct root_domain { * NULL-terminated list of performance domains intersecting with the * CPUs of the rd. Protected by RCU. */ - struct perf_domain *pd; + struct perf_domain __rcu *pd; }; extern struct root_domain def_root_domain; @@ -869,8 +869,8 @@ struct rq { atomic_t nr_iowait; #ifdef CONFIG_SMP - struct root_domain *rd; - struct sched_domain *sd; + struct root_domain *rd; + struct sched_domain __rcu *sd; unsigned long cpu_capacity; unsigned long cpu_capacity_orig; @@ -1324,13 +1324,13 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) return sd; } -DECLARE_PER_CPU(struct sched_domain *, sd_llc); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); -DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); -DECLARE_PER_CPU(struct sched_domain *, sd_numa); -DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing); -DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); +DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); extern struct static_key_false sched_asym_cpucapacity; struct sched_group_capacity { @@ -2185,7 +2185,7 @@ static inline u64 irq_time_read(int cpu) #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ #ifdef CONFIG_CPU_FREQ -DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); +DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); /** * cpufreq_update_util - Take a note about CPU utilization changes. diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index ab7f371a3a17..f53f89df837d 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -615,13 +615,13 @@ static void destroy_sched_domains(struct sched_domain *sd) * the cpumask of the domain), this allows us to quickly tell if * two CPUs are in the same cache domain, see cpus_share_cache(). */ -DEFINE_PER_CPU(struct sched_domain *, sd_llc); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); -DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); -DEFINE_PER_CPU(struct sched_domain *, sd_numa); -DEFINE_PER_CPU(struct sched_domain *, sd_asym_packing); -DEFINE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); +DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); static void update_top_cache_domain(int cpu) @@ -1059,6 +1059,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); struct sched_domain *child = sd->child; struct sched_group *sg; + bool already_visited; if (child) cpu = cpumask_first(sched_domain_span(child)); @@ -1066,9 +1067,14 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) sg = *per_cpu_ptr(sdd->sg, cpu); sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); - /* For claim_allocations: */ - atomic_inc(&sg->ref); - atomic_inc(&sg->sgc->ref); + /* Increase refcounts for claim_allocations: */ + already_visited = atomic_inc_return(&sg->ref) > 1; + /* sgc visits should follow a similar trend as sg */ + WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1)); + + /* If we have already visited that group, it's already initialized. */ + if (already_visited) + return sg; if (child) { cpumask_copy(sched_group_span(sg), sched_domain_span(child)); @@ -1087,8 +1093,8 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) /* * build_sched_groups will build a circular linked list of the groups - * covered by the given span, and will set each group's ->cpumask correctly, - * and ->cpu_capacity to 0. + * covered by the given span, will set each group's ->cpumask correctly, + * and will initialize their ->sgc. * * Assumes the sched_domain tree is fully constructed */ @@ -2075,9 +2081,8 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) } /* - * Set up scheduler domains and groups. Callers must hold the hotplug lock. - * For now this just excludes isolated CPUs, but could be used to - * exclude other special cases in the future. + * Set up scheduler domains and groups. For now this just excludes isolated + * CPUs, but could be used to exclude other special cases in the future. */ int sched_init_domains(const struct cpumask *cpu_map) { |