diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-05-14 02:18:51 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-05-14 02:18:51 +0200 |
commit | 6e5a0c30b616bfff6926ecca5d88e3d06e6bf79a (patch) | |
tree | c8b459ab41f9265828116d04faa23e5224be6e5e /kernel | |
parent | Merge tag 'perf-core-2024-05-13' of git://git.kernel.org/pub/scm/linux/kernel... (diff) | |
parent | sched/pelt: Remove shift of thermal clock (diff) | |
download | linux-6e5a0c30b616bfff6926ecca5d88e3d06e6bf79a.tar.xz linux-6e5a0c30b616bfff6926ecca5d88e3d06e6bf79a.zip |
Merge tag 'sched-core-2024-05-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
- Add cpufreq pressure feedback for the scheduler
- Rework misfit load-balancing wrt affinity restrictions
- Clean up and simplify the code around ::overutilized and
  ::overload access.
- Simplify sched_balance_newidle()
- Bump SCHEDSTAT_VERSION to 16 due to a cleanup of CPU_MAX_IDLE_TYPES
  handling that changed the output.
- Rework & clean up <asm/vtime.h> interactions wrt arch_vtime_task_switch()
- Reorganize, clean up and unify most of the higher level
  scheduler balancing function names around the sched_balance_*()
  prefix
- Simplify the balancing flag code (sched_balance_running)
- Miscellaneous cleanups & fixes
* tag 'sched-core-2024-05-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (50 commits)
sched/pelt: Remove shift of thermal clock
sched/cpufreq: Rename arch_update_thermal_pressure() => arch_update_hw_pressure()
thermal/cpufreq: Remove arch_update_thermal_pressure()
sched/cpufreq: Take cpufreq feedback into account
cpufreq: Add a cpufreq pressure feedback for the scheduler
sched/fair: Fix update of rd->sg_overutilized
sched/vtime: Do not include <asm/vtime.h> header
s390/irq,nmi: Include <asm/vtime.h> header directly
s390/vtime: Remove unused __ARCH_HAS_VTIME_TASK_SWITCH leftover
sched/vtime: Get rid of generic vtime_task_switch() implementation
sched/vtime: Remove confusing arch_vtime_task_switch() declaration
sched/balancing: Simplify the sg_status bitmask and use separate ->overloaded and ->overutilized flags
sched/fair: Rename set_rd_overutilized_status() to set_rd_overutilized()
sched/fair: Rename SG_OVERLOAD to SG_OVERLOADED
sched/fair: Rename {set|get}_rd_overload() to {set|get}_rd_overloaded()
sched/fair: Rename root_domain::overload to ::overloaded
sched/fair: Use helper functions to access root_domain::overload
sched/fair: Check root_domain::overload value before update
sched/fair: Combine EAS check with root_domain::overutilized access
sched/fair: Simplify the continue_balancing logic in sched_balance_newidle()
...
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/sched/core.c | 14 | ||||
-rw-r--r-- | kernel/sched/cputime.c | 13 | ||||
-rw-r--r-- | kernel/sched/fair.c | 503 | ||||
-rw-r--r-- | kernel/sched/loadavg.c | 2 | ||||
-rw-r--r-- | kernel/sched/pelt.c | 22 | ||||
-rw-r--r-- | kernel/sched/pelt.h | 16 | ||||
-rw-r--r-- | kernel/sched/sched.h | 71 | ||||
-rw-r--r-- | kernel/sched/stats.c | 5 | ||||
-rw-r--r-- | kernel/sched/topology.c | 56 | ||||
-rw-r--r-- | kernel/time/timer.c | 2 | ||||
-rw-r--r-- | kernel/workqueue.c | 2 |
11 files changed, 381 insertions, 325 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7019a40457a6..1a914388144a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -108,7 +108,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); -EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_hw_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); @@ -5662,13 +5662,13 @@ static inline u64 cpu_resched_latency(struct rq *rq) { return 0; } * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. */ -void scheduler_tick(void) +void sched_tick(void) { int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; struct rq_flags rf; - unsigned long thermal_pressure; + unsigned long hw_pressure; u64 resched_latency; if (housekeeping_cpu(cpu, HK_TYPE_TICK)) @@ -5679,8 +5679,8 @@ void scheduler_tick(void) rq_lock(rq, &rf); update_rq_clock(rq); - thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); - update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); + hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); + update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure); curr->sched_class->task_tick(rq, curr, 0); if (sched_feat(LATENCY_WARN)) resched_latency = cpu_resched_latency(rq); @@ -5700,7 +5700,7 @@ void scheduler_tick(void) #ifdef CONFIG_SMP rq->idle_balance = idle_cpu(cpu); - trigger_load_balance(rq); + sched_balance_trigger(rq); #endif } @@ -6585,7 +6585,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * paths. For example, see arch/x86/entry_64.S. * * To drive preemption between tasks, the scheduler sets the flag in timer - * interrupt handler scheduler_tick(). + * interrupt handler sched_tick(). * * 3. 
Wakeups don't really cause entry into schedule(). They add a * task to the run-queue and that's it. diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index af7952f12e6c..aa48b2ec879d 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -424,19 +424,6 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_ */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -# ifndef __ARCH_HAS_VTIME_TASK_SWITCH -void vtime_task_switch(struct task_struct *prev) -{ - if (is_idle_task(prev)) - vtime_account_idle(prev); - else - vtime_account_kernel(prev); - - vtime_flush(prev); - arch_vtime_task_switch(prev); -} -# endif - void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { unsigned int pc = irq_count() - offset; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c62805dbd608..146ecf9cc3af 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -78,15 +78,9 @@ static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; -int sched_thermal_decay_shift; static int __init setup_sched_thermal_decay_shift(char *str) { - int _shift = 0; - - if (kstrtoint(str, 0, &_shift)) - pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n"); - - sched_thermal_decay_shift = clamp(_shift, 0, 10); + pr_warn("Ignoring the deprecated sched_thermal_decay_shift= option\n"); return 1; } __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift); @@ -388,8 +382,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) /* * With cfs_rq being unthrottled/throttled during an enqueue, - * it can happen the tmp_alone_branch points the a leaf that - * we finally want to del. In this case, tmp_alone_branch moves + * it can happen the tmp_alone_branch points to the leaf that + * we finally want to delete. 
In this case, tmp_alone_branch moves * to the prev element but it will point to rq->leaf_cfs_rq_list * at the end of the enqueue. */ @@ -406,7 +400,7 @@ static inline void assert_list_leaf_cfs_rq(struct rq *rq) SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list); } -/* Iterate thr' all leaf cfs_rq's on a runqueue */ +/* Iterate through all leaf cfs_rq's on a runqueue */ #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \ leaf_cfs_rq_list) @@ -595,13 +589,13 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) * * [[ NOTE: this is only equal to the ideal scheduler under the condition * that join/leave operations happen at lag_i = 0, otherwise the - * virtual time has non-continguous motion equivalent to: + * virtual time has non-contiguous motion equivalent to: * * V +-= lag_i / W * * Also see the comment in place_entity() that deals with this. ]] * - * However, since v_i is u64, and the multiplcation could easily overflow + * However, since v_i is u64, and the multiplication could easily overflow * transform it into a relative form that uses smaller quantities: * * Substitute: v_i == (v_i - v0) + v0 @@ -671,7 +665,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) } if (load) { - /* sign flips effective floor / ceil */ + /* sign flips effective floor / ceiling */ if (avg < 0) avg -= (load - 1); avg = div_s64(avg, load); @@ -727,7 +721,7 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) * * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i) * - * Note: using 'avg_vruntime() > se->vruntime' is inacurate due + * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due * to the loss in precision caused by the division. 
*/ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) @@ -1030,7 +1024,7 @@ void init_entity_runnable_average(struct sched_entity *se) if (entity_is_task(se)) sa->load_avg = scale_load_down(se->load.weight); - /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ + /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */ } /* @@ -1622,7 +1616,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, max_dist = READ_ONCE(sched_max_numa_distance); /* * This code is called for each node, introducing N^2 complexity, - * which should be ok given the number of nodes rarely exceeds 8. + * which should be OK given the number of nodes rarely exceeds 8. */ for_each_online_node(node) { unsigned long faults; @@ -3296,7 +3290,7 @@ retry_pids: /* * Shared library pages mapped by multiple processes are not * migrated as it is expected they are cache replicated. Avoid - * hinting faults in read-only file-backed mappings or the vdso + * hinting faults in read-only file-backed mappings or the vDSO * as migrating the pages will be of marginal benefit. */ if (!vma->vm_mm || @@ -3307,7 +3301,7 @@ retry_pids: /* * Skip inaccessible VMAs to avoid any confusion between - * PROT_NONE and NUMA hinting ptes + * PROT_NONE and NUMA hinting PTEs */ if (!vma_is_accessible(vma)) { trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE); @@ -3339,7 +3333,7 @@ retry_pids: } /* - * Scanning the VMA's of short lived tasks add more overhead. So + * Scanning the VMAs of short lived tasks add more overhead. So * delay the scan for new VMAs. */ if (mm->numa_scan_seq && time_before(jiffies, @@ -3383,7 +3377,7 @@ retry_pids: /* * Try to scan sysctl_numa_balancing_size worth of * hpages that have at least one present PTE that - * is not already pte-numa. If the VMA contains + * is not already PTE-numa. 
If the VMA contains * areas that are unused or already full of prot_numa * PTEs, scan up to virtpages, to skip through those * areas faster. @@ -3690,7 +3684,7 @@ static void reweight_eevdf(struct sched_entity *se, u64 avruntime, /* * VRUNTIME - * ======== + * -------- * * COROLLARY #1: The virtual runtime of the entity needs to be * adjusted if re-weight at !0-lag point. @@ -3773,7 +3767,7 @@ static void reweight_eevdf(struct sched_entity *se, u64 avruntime, /* * DEADLINE - * ======== + * -------- * * When the weight changes, the virtual time slope changes and * we should adjust the relative virtual deadline accordingly. @@ -4745,7 +4739,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s /* * Track task load average for carrying it to new CPU after migrated, and - * track group sched_entity load average for task_h_load calc in migration + * track group sched_entity load average for task_h_load calculation in migration */ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) __update_load_avg_se(now, cfs_rq, se); @@ -4828,7 +4822,7 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) return cfs_rq->avg.load_avg; } -static int newidle_balance(struct rq *this_rq, struct rq_flags *rf); +static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf); static inline unsigned long task_util(struct task_struct *p) { @@ -4971,13 +4965,22 @@ done: trace_sched_util_est_se_tp(&p->se); } +static inline unsigned long get_actual_cpu_capacity(int cpu) +{ + unsigned long capacity = arch_scale_cpu_capacity(cpu); + + capacity -= max(hw_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu)); + + return capacity; +} + static inline int util_fits_cpu(unsigned long util, unsigned long uclamp_min, unsigned long uclamp_max, int cpu) { - unsigned long capacity_orig, capacity_orig_thermal; unsigned long capacity = capacity_of(cpu); + unsigned long capacity_orig; bool fits, uclamp_max_fits; /* @@ -4999,7 +5002,7 @@ static inline 
int util_fits_cpu(unsigned long util, * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it * should fit a little cpu even if there's some pressure. * - * Only exception is for thermal pressure since it has a direct impact + * Only exception is for HW or cpufreq pressure since it has a direct impact * on available OPP of the system. * * We honour it for uclamp_min only as a drop in performance level @@ -5009,7 +5012,6 @@ static inline int util_fits_cpu(unsigned long util, * goal is to cap the task. So it's okay if it's getting less. */ capacity_orig = arch_scale_cpu_capacity(cpu); - capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); /* * We want to force a task to fit a cpu as implied by uclamp_max. @@ -5026,14 +5028,14 @@ static inline int util_fits_cpu(unsigned long util, * | | | | | | | * | | | | | | | * +---------------------------------------- - * cpu0 cpu1 cpu2 + * CPU0 CPU1 CPU2 * * In the above example if a task is capped to a specific performance * point, y, then when: * - * * util = 80% of x then it does not fit on cpu0 and should migrate - * to cpu1 - * * util = 80% of y then it is forced to fit on cpu1 to honour + * * util = 80% of x then it does not fit on CPU0 and should migrate + * to CPU1 + * * util = 80% of y then it is forced to fit on CPU1 to honour * uclamp_max request. * * which is what we're enforcing here. A task always fits if @@ -5064,7 +5066,7 @@ static inline int util_fits_cpu(unsigned long util, * | | | | | | | * | | | | | | | (region c, boosted, util < uclamp_min) * +---------------------------------------- - * cpu0 cpu1 cpu2 + * CPU0 CPU1 CPU2 * * a) If util > uclamp_max, then we're capped, we don't care about * actual fitness value here. We only care if uclamp_max fits @@ -5084,7 +5086,8 @@ static inline int util_fits_cpu(unsigned long util, * handle the case uclamp_min > uclamp_max. 
*/ uclamp_min = min(uclamp_min, uclamp_max); - if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal)) + if (fits && (util < uclamp_min) && + (uclamp_min > get_actual_cpu_capacity(cpu))) return -1; return fits; @@ -5104,15 +5107,19 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu) static inline void update_misfit_status(struct task_struct *p, struct rq *rq) { + int cpu = cpu_of(rq); + if (!sched_asym_cpucap_active()) return; - if (!p || p->nr_cpus_allowed == 1) { - rq->misfit_task_load = 0; - return; - } + /* + * Affinity allows us to go somewhere higher? Or are we on biggest + * available CPU already? Or do we fit into this CPU ? + */ + if (!p || (p->nr_cpus_allowed == 1) || + (arch_scale_cpu_capacity(cpu) == p->max_allowed_capacity) || + task_fits_cpu(p, cpu)) { - if (task_fits_cpu(p, cpu_of(rq))) { rq->misfit_task_load = 0; return; } @@ -5148,7 +5155,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static inline int newidle_balance(struct rq *rq, struct rq_flags *rf) +static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf) { return 0; } @@ -5254,7 +5261,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) se->vruntime = vruntime - lag; /* - * When joining the competition; the exisiting tasks will be, + * When joining the competition; the existing tasks will be, * on average, halfway through their slice, as such start tasks * off with half a slice to ease into the competition. */ @@ -5403,7 +5410,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Now advance min_vruntime if @se was the entity holding it back, * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be * put back on, and if we advance min_vruntime, we'll be placed back - * further than we started -- ie. we'll be penalized. + * further than we started -- i.e. 
we'll be penalized. */ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) update_min_vruntime(cfs_rq); @@ -5439,7 +5446,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) /* * Track our maximum slice length, if the CPU's load is at - * least twice that of our own weight (i.e. dont track it + * least twice that of our own weight (i.e. don't track it * when there are only lesser-weight tasks around): */ if (schedstat_enabled() && @@ -6675,22 +6682,47 @@ static inline void hrtick_update(struct rq *rq) #ifdef CONFIG_SMP static inline bool cpu_overutilized(int cpu) { - unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); - unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); + unsigned long rq_util_min, rq_util_max; + + if (!sched_energy_enabled()) + return false; + + rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); + rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); /* Return true only if the utilization doesn't fit CPU's capacity */ return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); } -static inline void update_overutilized_status(struct rq *rq) +/* + * overutilized value make sense only if EAS is enabled + */ +static inline bool is_rd_overutilized(struct root_domain *rd) +{ + return !sched_energy_enabled() || READ_ONCE(rd->overutilized); +} + +static inline void set_rd_overutilized(struct root_domain *rd, bool flag) { - if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) { - WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED); - trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED); - } + if (!sched_energy_enabled()) + return; + + WRITE_ONCE(rd->overutilized, flag); + trace_sched_overutilized_tp(rd, flag); +} + +static inline void check_update_overutilized_status(struct rq *rq) +{ + /* + * overutilized field is used for load balancing decisions only + * if energy aware scheduler is being used + */ + + if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu)) + 
set_rd_overutilized(rq->rd, 1); } #else -static inline void update_overutilized_status(struct rq *rq) { } +static inline void check_update_overutilized_status(struct rq *rq) { } #endif /* Runqueue only has SCHED_IDLE tasks enqueued */ @@ -6791,7 +6823,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * and the following generally works well enough in practice. */ if (!task_new) - update_overutilized_status(rq); + check_update_overutilized_status(rq); enqueue_throttle: assert_list_leaf_cfs_rq(rq); @@ -6878,7 +6910,7 @@ dequeue_throttle: #ifdef CONFIG_SMP -/* Working cpumask for: load_balance, load_balance_newidle. */ +/* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */ static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask); @@ -7110,13 +7142,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, } static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); +sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); /* - * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. + * sched_balance_find_dst_group_cpu - find the idlest CPU among the CPUs in the group. */ static int -find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; unsigned int min_exit_latency = UINT_MAX; @@ -7172,7 +7204,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this return shallowest_idle_cpu != -1 ? 
shallowest_idle_cpu : least_loaded_cpu; } -static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, +static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct task_struct *p, int cpu, int prev_cpu, int sd_flag) { int new_cpu = cpu; @@ -7197,13 +7229,13 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p continue; } - group = find_idlest_group(sd, p, cpu); + group = sched_balance_find_dst_group(sd, p, cpu); if (!group) { sd = sd->child; continue; } - new_cpu = find_idlest_group_cpu(group, p, cpu); + new_cpu = sched_balance_find_dst_group_cpu(group, p, cpu); if (new_cpu == cpu) { /* Now try balancing at a lower domain level of 'cpu': */ sd = sd->child; @@ -7471,7 +7503,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) * Look for the CPU with best capacity. */ else if (fits < 0) - cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu)); + cpu_cap = get_actual_cpu_capacity(cpu); /* * First, select CPU which fits better (-1 being better than 0). @@ -7515,7 +7547,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) /* * On asymmetric system, update task utilization because we will check - * that the task fits with cpu's capacity. + * that the task fits with CPU's capacity. */ if (sched_asym_cpucap_active()) { sync_entity_load_avg(&p->se); @@ -7948,7 +7980,7 @@ compute_energy(struct energy_env *eenv, struct perf_domain *pd, * NOTE: Forkees are not accepted in the energy-aware wake-up path because * they don't have any useful utilization data yet and it's not possible to * forecast their impact on energy consumption. Consequently, they will be - * placed by find_idlest_cpu() on the least loaded CPU, which might turn out + * placed by sched_balance_find_dst_cpu() on the least loaded CPU, which might turn out * to be energy-inefficient in some use-cases. 
The alternative would be to * bias new tasks towards specific types of CPUs first, or to try to infer * their util_avg from the parent task, but those heuristics could hurt @@ -7964,15 +7996,15 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) struct root_domain *rd = this_rq()->rd; int cpu, best_energy_cpu, target = -1; int prev_fits = -1, best_fits = -1; - unsigned long best_thermal_cap = 0; - unsigned long prev_thermal_cap = 0; + unsigned long best_actual_cap = 0; + unsigned long prev_actual_cap = 0; struct sched_domain *sd; struct perf_domain *pd; struct energy_env eenv; rcu_read_lock(); pd = rcu_dereference(rd->pd); - if (!pd || READ_ONCE(rd->overutilized)) + if (!pd) goto unlock; /* @@ -7995,7 +8027,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) for (; pd; pd = pd->next) { unsigned long util_min = p_util_min, util_max = p_util_max; - unsigned long cpu_cap, cpu_thermal_cap, util; + unsigned long cpu_cap, cpu_actual_cap, util; long prev_spare_cap = -1, max_spare_cap = -1; unsigned long rq_util_min, rq_util_max; unsigned long cur_delta, base_energy; @@ -8007,18 +8039,17 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (cpumask_empty(cpus)) continue; - /* Account thermal pressure for the energy estimation */ + /* Account external pressure for the energy estimation */ cpu = cpumask_first(cpus); - cpu_thermal_cap = arch_scale_cpu_capacity(cpu); - cpu_thermal_cap -= arch_scale_thermal_pressure(cpu); + cpu_actual_cap = get_actual_cpu_capacity(cpu); - eenv.cpu_cap = cpu_thermal_cap; + eenv.cpu_cap = cpu_actual_cap; eenv.pd_cap = 0; for_each_cpu(cpu, cpus) { struct rq *rq = cpu_rq(cpu); - eenv.pd_cap += cpu_thermal_cap; + eenv.pd_cap += cpu_actual_cap; if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) continue; @@ -8039,7 +8070,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) { /* * Open code 
uclamp_rq_util_with() except for - * the clamp() part. Ie: apply max aggregation + * the clamp() part. I.e.: apply max aggregation * only. util_fits_cpu() logic requires to * operate on non clamped util but must use the * max-aggregated uclamp_{min, max}. @@ -8089,7 +8120,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (prev_delta < base_energy) goto unlock; prev_delta -= base_energy; - prev_thermal_cap = cpu_thermal_cap; + prev_actual_cap = cpu_actual_cap; best_delta = min(best_delta, prev_delta); } @@ -8104,7 +8135,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) * but best energy cpu has better capacity. */ if ((max_fits < 0) && - (cpu_thermal_cap <= best_thermal_cap)) + (cpu_actual_cap <= best_actual_cap)) continue; cur_delta = compute_energy(&eenv, pd, cpus, p, @@ -8125,14 +8156,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) best_delta = cur_delta; best_energy_cpu = max_spare_cap_cpu; best_fits = max_fits; - best_thermal_cap = cpu_thermal_cap; + best_actual_cap = cpu_actual_cap; } } rcu_read_unlock(); if ((best_fits > prev_fits) || ((best_fits > 0) && (best_delta < prev_delta)) || - ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap))) + ((best_fits < 0) && (best_actual_cap > prev_actual_cap))) target = best_energy_cpu; return target; @@ -8175,7 +8206,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) cpumask_test_cpu(cpu, p->cpus_ptr)) return cpu; - if (sched_energy_enabled()) { + if (!is_rd_overutilized(this_rq()->rd)) { new_cpu = find_energy_efficient_cpu(p, prev_cpu); if (new_cpu >= 0) return new_cpu; @@ -8213,7 +8244,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) if (unlikely(sd)) { /* Slow path */ - new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); + new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag); } else if (wake_flags & WF_TTWU) { /* XXX always ? 
*/ /* Fast path */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); @@ -8259,14 +8290,46 @@ static void task_dead_fair(struct task_struct *p) remove_entity_load_avg(&p->se); } +/* + * Set the max capacity the task is allowed to run at for misfit detection. + */ +static void set_task_max_allowed_capacity(struct task_struct *p) +{ + struct asym_cap_data *entry; + + if (!sched_asym_cpucap_active()) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(entry, &asym_cap_list, link) { + cpumask_t *cpumask; + + cpumask = cpu_capacity_span(entry); + if (!cpumask_intersects(p->cpus_ptr, cpumask)) + continue; + + p->max_allowed_capacity = entry->capacity; + break; + } + rcu_read_unlock(); +} + +static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx) +{ + set_cpus_allowed_common(p, ctx); + set_task_max_allowed_capacity(p); +} + static int balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { if (rq->nr_running) return 1; - return newidle_balance(rq, rf) != 0; + return sched_balance_newidle(rq, rf) != 0; } +#else +static inline void set_task_max_allowed_capacity(struct task_struct *p) {} #endif /* CONFIG_SMP */ static void set_next_buddy(struct sched_entity *se) @@ -8517,10 +8580,10 @@ idle: if (!rf) return NULL; - new_tasks = newidle_balance(rq, rf); + new_tasks = sched_balance_newidle(rq, rf); /* - * Because newidle_balance() releases (and re-acquires) rq->lock, it is + * Because sched_balance_newidle() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we * must re-start the pick_next_entity() loop. */ @@ -8598,7 +8661,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) return false; - /* Tell the scheduler that we'd really like pse to run next. */ + /* Tell the scheduler that we'd really like se to run next. 
*/ set_next_buddy(se); yield_task_fair(rq); @@ -8936,7 +8999,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) return 0; - /* Disregard pcpu kthreads; they are where they need to be. */ + /* Disregard percpu kthreads; they are where they need to be. */ if (kthread_is_per_cpu(p)) return 0; @@ -9082,7 +9145,7 @@ static int detach_tasks(struct lb_env *env) * We don't want to steal all, otherwise we may be treated likewise, * which could at worst lead to a livelock crash. */ - if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) + if (env->idle && env->src_rq->nr_running <= 1) break; env->loop++; @@ -9261,7 +9324,7 @@ static inline bool others_have_blocked(struct rq *rq) if (cpu_util_dl(rq)) return true; - if (thermal_load_avg(rq)) + if (hw_load_avg(rq)) return true; if (cpu_util_irq(rq)) @@ -9291,7 +9354,7 @@ static bool __update_blocked_others(struct rq *rq, bool *done) { const struct sched_class *curr_class; u64 now = rq_clock_pelt(rq); - unsigned long thermal_pressure; + unsigned long hw_pressure; bool decayed; /* @@ -9300,11 +9363,11 @@ static bool __update_blocked_others(struct rq *rq, bool *done) */ curr_class = rq->curr->sched_class; - thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); + hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | - update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) | + update_hw_load_avg(now, rq, hw_pressure) | update_irq_load_avg(rq, 0); if (others_have_blocked(rq)) @@ -9423,7 +9486,7 @@ static unsigned long task_h_load(struct task_struct *p) } #endif -static void update_blocked_averages(int cpu) +static void sched_balance_update_blocked_averages(int cpu) { bool decayed = false, done = true; struct rq *rq = cpu_rq(cpu); @@ -9442,25 +9505,25 @@ static void update_blocked_averages(int 
cpu) rq_unlock_irqrestore(rq, &rf); } -/********** Helpers for find_busiest_group ************************/ +/********** Helpers for sched_balance_find_src_group ************************/ /* - * sg_lb_stats - stats of a sched_group required for load_balancing + * sg_lb_stats - stats of a sched_group required for load-balancing: */ struct sg_lb_stats { - unsigned long avg_load; /*Avg load across the CPUs of the group */ - unsigned long group_load; /* Total load over the CPUs of the group */ - unsigned long group_capacity; - unsigned long group_util; /* Total utilization over the CPUs of the group */ - unsigned long group_runnable; /* Total runnable time over the CPUs of the group */ - unsigned int sum_nr_running; /* Nr of tasks running in the group */ - unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ - unsigned int idle_cpus; + unsigned long avg_load; /* Avg load over the CPUs of the group */ + unsigned long group_load; /* Total load over the CPUs of the group */ + unsigned long group_capacity; /* Capacity over the CPUs of the group */ + unsigned long group_util; /* Total utilization over the CPUs of the group */ + unsigned long group_runnable; /* Total runnable time over the CPUs of the group */ + unsigned int sum_nr_running; /* Nr of all tasks running in the group */ + unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ + unsigned int idle_cpus; /* Nr of idle CPUs in the group */ unsigned int group_weight; enum group_type group_type; - unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ - unsigned int group_smt_balance; /* Task on busy SMT be moved */ - unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ + unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ + unsigned int group_smt_balance; /* Task on busy SMT be moved */ + unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ #ifdef 
CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -9468,19 +9531,18 @@ struct sg_lb_stats { }; /* - * sd_lb_stats - Structure to store the statistics of a sched_domain - * during load balancing. + * sd_lb_stats - stats of a sched_domain required for load-balancing: */ struct sd_lb_stats { - struct sched_group *busiest; /* Busiest group in this sd */ - struct sched_group *local; /* Local group in this sd */ - unsigned long total_load; /* Total load of all groups in sd */ - unsigned long total_capacity; /* Total capacity of all groups in sd */ - unsigned long avg_load; /* Average load across all groups in sd */ - unsigned int prefer_sibling; /* tasks should go to sibling first */ - - struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ - struct sg_lb_stats local_stat; /* Statistics of the local group */ + struct sched_group *busiest; /* Busiest group in this sd */ + struct sched_group *local; /* Local group in this sd */ + unsigned long total_load; /* Total load of all groups in sd */ + unsigned long total_capacity; /* Total capacity of all groups in sd */ + unsigned long avg_load; /* Average load across all groups in sd */ + unsigned int prefer_sibling; /* Tasks should go to sibling first */ + + struct sg_lb_stats busiest_stat; /* Statistics of the busiest group */ + struct sg_lb_stats local_stat; /* Statistics of the local group */ }; static inline void init_sd_lb_stats(struct sd_lb_stats *sds) @@ -9506,8 +9568,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) static unsigned long scale_rt_capacity(int cpu) { + unsigned long max = get_actual_cpu_capacity(cpu); struct rq *rq = cpu_rq(cpu); - unsigned long max = arch_scale_cpu_capacity(cpu); unsigned long used, free; unsigned long irq; @@ -9519,12 +9581,9 @@ static unsigned long scale_rt_capacity(int cpu) /* * avg_rt.util_avg and avg_dl.util_avg track binary signals * (running and not running) with weights 0 and 1024 respectively. 
- * avg_thermal.load_avg tracks thermal pressure and the weighted - * average uses the actual delta max capacity(load). */ used = cpu_util_rt(rq); used += cpu_util_dl(rq); - used += thermal_load_avg(rq); if (unlikely(used >= max)) return 1; @@ -9617,16 +9676,10 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) (arch_scale_cpu_capacity(cpu_of(rq)) * 100)); } -/* - * Check whether a rq has a misfit task and if it looks like we can actually - * help that task: we can migrate the task to a CPU of higher capacity, or - * the task's current CPU is heavily pressured. - */ -static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) +/* Check if the rq has a misfit task */ +static inline bool check_misfit_status(struct rq *rq) { - return rq->misfit_task_load && - (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity || - check_cpu_capacity(rq, sd)); + return rq->misfit_task_load; } /* @@ -9650,7 +9703,7 @@ static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) * * When this is so detected; this group becomes a candidate for busiest; see * update_sd_pick_busiest(). And calculate_imbalance() and - * find_busiest_group() avoid some of the usual balance conditions to allow it + * sched_balance_find_src_group() avoid some of the usual balance conditions to allow it * to create an effective group imbalance. 
* * This is a somewhat tricky proposition since the next run might not find the @@ -9815,7 +9868,7 @@ static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1, static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group *group) { - if (env->idle == CPU_NOT_IDLE) + if (!env->idle) return false; /* @@ -9839,7 +9892,7 @@ static inline long sibling_imbalance(struct lb_env *env, int ncores_busiest, ncores_local; long imbalance; - if (env->idle == CPU_NOT_IDLE || !busiest->sum_nr_running) + if (!env->idle || !busiest->sum_nr_running) return 0; ncores_busiest = sds->busiest->cores; @@ -9885,13 +9938,15 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) * @sds: Load-balancing data with statistics of the local group. * @group: sched_group whose statistics are to be updated. * @sgs: variable to hold the statistics for this group. - * @sg_status: Holds flag indicating the status of the sched_group + * @sg_overloaded: sched_group is overloaded + * @sg_overutilized: sched_group is overutilized */ static inline void update_sg_lb_stats(struct lb_env *env, struct sd_lb_stats *sds, struct sched_group *group, struct sg_lb_stats *sgs, - int *sg_status) + bool *sg_overloaded, + bool *sg_overutilized) { int i, nr_running, local_group; @@ -9912,10 +9967,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->sum_nr_running += nr_running; if (nr_running > 1) - *sg_status |= SG_OVERLOAD; + *sg_overloaded = 1; if (cpu_overutilized(i)) - *sg_status |= SG_OVERUTILIZED; + *sg_overutilized = 1; #ifdef CONFIG_NUMA_BALANCING sgs->nr_numa_running += rq->nr_numa_running; @@ -9937,10 +9992,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, /* Check for a misfit task on the cpu */ if (sgs->group_misfit_task_load < rq->misfit_task_load) { sgs->group_misfit_task_load = rq->misfit_task_load; - *sg_status |= SG_OVERLOAD; + *sg_overloaded = 1; } - } else if ((env->idle != CPU_NOT_IDLE) && - sched_reduced_capacity(rq, 
env->sd)) { + } else if (env->idle && sched_reduced_capacity(rq, env->sd)) { /* Check for a task running on a CPU with reduced capacity */ if (sgs->group_misfit_task_load < load) sgs->group_misfit_task_load = load; @@ -9952,7 +10006,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_weight = group->group_weight; /* Check if dst CPU is idle and preferred to this group */ - if (!local_group && env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running && + if (!local_group && env->idle && sgs->sum_h_nr_running && sched_group_asym(env, sgs, group)) sgs->group_asym_packing = 1; @@ -10090,7 +10144,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, has_spare: /* - * Select not overloaded group with lowest number of idle cpus + * Select not overloaded group with lowest number of idle CPUs * and highest number of running tasks. We could also compare * the spare capacity which is more stable but it can end up * that the group has less spare capacity but finally more idle @@ -10310,13 +10364,13 @@ static bool update_pick_idlest(struct sched_group *idlest, } /* - * find_idlest_group() finds and returns the least busy CPU group within the + * sched_balance_find_dst_group() finds and returns the least busy CPU group within the * domain. * * Assumes p is allowed on at least one CPU in sd. 
*/ static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) +sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) { struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups; struct sg_lb_stats local_sgs, tmp_sgs; @@ -10564,7 +10618,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; unsigned long sum_util = 0; - int sg_status = 0; + bool sg_overloaded = 0, sg_overutilized = 0; do { struct sg_lb_stats *sgs = &tmp_sgs; @@ -10580,7 +10634,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd update_group_capacity(env->sd, env->dst_cpu); } - update_sg_lb_stats(env, sds, sg, sgs, &sg_status); + update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized); if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; @@ -10608,19 +10662,13 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd env->fbq_type = fbq_classify_group(&sds->busiest_stat); if (!env->sd->parent) { - struct root_domain *rd = env->dst_rq->rd; - /* update overload indicator if we are at root domain */ - WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD); + set_rd_overloaded(env->dst_rq->rd, sg_overloaded); /* Update over-utilization (tipping point, U >= 0) indicator */ - WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED); - trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED); - } else if (sg_status & SG_OVERUTILIZED) { - struct root_domain *rd = env->dst_rq->rd; - - WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED); - trace_sched_overutilized_tp(rd, SG_OVERUTILIZED); + set_rd_overutilized(env->dst_rq->rd, sg_overutilized); + } else if (sg_overutilized) { + set_rd_overutilized(env->dst_rq->rd, sg_overutilized); } update_idle_cpu_scan(env, sum_util); @@ -10710,7 +10758,7 @@ static inline 
void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * waiting task in this overloaded busiest group. Let's * try to pull it. */ - if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) { + if (env->idle && env->imbalance == 0) { env->migration_type = migrate_task; env->imbalance = 1; } @@ -10729,7 +10777,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* * If there is no overload, we just want to even the number of - * idle cpus. + * idle CPUs. */ env->migration_type = migrate_task; env->imbalance = max_t(long, 0, @@ -10802,7 +10850,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s ) / SCHED_CAPACITY_SCALE; } -/******* find_busiest_group() helpers end here *********************/ +/******* sched_balance_find_src_group() helpers end here *********************/ /* * Decision matrix according to the local and busiest group type: @@ -10825,7 +10873,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ /** - * find_busiest_group - Returns the busiest group within the sched_domain + * sched_balance_find_src_group - Returns the busiest group within the sched_domain * if there is an imbalance. * @env: The load balancing environment. * @@ -10834,7 +10882,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * * Return: - The busiest group if imbalance exists. 
*/ -static struct sched_group *find_busiest_group(struct lb_env *env) +static struct sched_group *sched_balance_find_src_group(struct lb_env *env) { struct sg_lb_stats *local, *busiest; struct sd_lb_stats sds; @@ -10857,12 +10905,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (busiest->group_type == group_misfit_task) goto force_balance; - if (sched_energy_enabled()) { - struct root_domain *rd = env->dst_rq->rd; - - if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) - goto out_balanced; - } + if (!is_rd_overutilized(env->dst_rq->rd) && + rcu_dereference(env->dst_rq->rd->pd)) + goto out_balanced; /* ASYM feature bypasses nice load balance check */ if (busiest->group_type == group_asym_packing) @@ -10925,7 +10970,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto force_balance; if (busiest->group_type != group_overloaded) { - if (env->idle == CPU_NOT_IDLE) { + if (!env->idle) { /* * If the busiest group is not overloaded (and as a * result the local one too) but this CPU is already @@ -10973,9 +11018,9 @@ out_balanced: } /* - * find_busiest_queue - find the busiest runqueue among the CPUs in the group. + * sched_balance_find_src_rq - find the busiest runqueue among the CPUs in the group. */ -static struct rq *find_busiest_queue(struct lb_env *env, +static struct rq *sched_balance_find_src_rq(struct lb_env *env, struct sched_group *group) { struct rq *busiest = NULL, *rq; @@ -11133,7 +11178,7 @@ asym_active_balance(struct lb_env *env) * the lower priority @env::dst_cpu help it. Do not follow * CPU priority. 
*/ - return env->idle != CPU_NOT_IDLE && sched_use_asym_prio(env->sd, env->dst_cpu) && + return env->idle && sched_use_asym_prio(env->sd, env->dst_cpu) && (sched_asym_prefer(env->dst_cpu, env->src_cpu) || !sched_use_asym_prio(env->sd, env->src_cpu)); } @@ -11171,7 +11216,7 @@ static int need_active_balance(struct lb_env *env) * because of other sched_class or IRQs if more capacity stays * available on dst_cpu. */ - if ((env->idle != CPU_NOT_IDLE) && + if (env->idle && (env->src_rq->cfs.h_nr_running == 1)) { if ((check_cpu_capacity(env->src_rq, sd)) && (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) @@ -11256,7 +11301,7 @@ static int should_we_balance(struct lb_env *env) * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. */ -static int load_balance(int this_cpu, struct rq *this_rq, +static int sched_balance_rq(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *continue_balancing) { @@ -11288,13 +11333,13 @@ redo: goto out_balanced; } - group = find_busiest_group(&env); + group = sched_balance_find_src_group(&env); if (!group) { schedstat_inc(sd->lb_nobusyg[idle]); goto out_balanced; } - busiest = find_busiest_queue(&env, group); + busiest = sched_balance_find_src_rq(&env, group); if (!busiest) { schedstat_inc(sd->lb_nobusyq[idle]); goto out_balanced; @@ -11312,7 +11357,7 @@ redo: env.flags |= LBF_ALL_PINNED; if (busiest->nr_running > 1) { /* - * Attempt to move tasks. If find_busiest_group has found + * Attempt to move tasks. If sched_balance_find_src_group has found * an imbalance but busiest->nr_running <= 1, the group is * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. @@ -11427,8 +11472,12 @@ more_balance: * We do not want newidle balance, which can be very * frequent, pollute the failure counter causing * excessive cache_hot migrations and active balances. 
+ * + * Similarly for migration_misfit which is not related to + * load/util migration, don't pollute nr_balance_failed. */ - if (idle != CPU_NEWLY_IDLE) + if (idle != CPU_NEWLY_IDLE && + env.migration_type != migrate_misfit) sd->nr_balance_failed++; if (need_active_balance(&env)) { @@ -11507,12 +11556,17 @@ out_one_pinned: ld_moved = 0; /* - * newidle_balance() disregards balance intervals, so we could + * sched_balance_newidle() disregards balance intervals, so we could * repeatedly reach this code, which would lead to balance_interval * skyrocketing in a short amount of time. Skip the balance_interval * increase logic to avoid that. + * + * Similarly misfit migration which is not necessarily an indication of + * the system being busy and requires lb to backoff to let it settle + * down. */ - if (env.idle == CPU_NEWLY_IDLE) + if (env.idle == CPU_NEWLY_IDLE || + env.migration_type == migrate_misfit) goto out; /* tune up the balancing interval */ @@ -11645,10 +11699,23 @@ out_unlock: return 0; } -static DEFINE_SPINLOCK(balancing); +/* + * This flag serializes load-balancing passes over large domains + * (above the NODE topology level) - only one load-balancing instance + * may run at a time, to reduce overhead on very large systems with + * lots of CPUs and large NUMA distances. + * + * - Note that load-balancing passes triggered while another one + * is executing are skipped and not re-tried. + * + * - Also note that this does not serialize rebalance_domains() + * execution, as non-SD_SERIALIZE domains will still be + * load-balanced in parallel. + */ +static atomic_t sched_balance_running = ATOMIC_INIT(0); /* - * Scale the max load_balance interval with the number of CPUs in the system. + * Scale the max sched_balance_rq interval with the number of CPUs in the system. * This trades load-balance latency on larger machines for less cross talk. 
*/ void update_max_interval(void) @@ -11686,7 +11753,7 @@ static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) * * Balancing parameters are set up in init_sched_domains. */ -static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) +static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) { int continue_balancing = 1; int cpu = rq->cpu; @@ -11723,25 +11790,25 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) need_serialize = sd->flags & SD_SERIALIZE; if (need_serialize) { - if (!spin_trylock(&balancing)) + if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1)) goto out; } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { + if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) { /* * The LBF_DST_PINNED logic could have changed * env->dst_cpu, so we can't know our idle * state even if we migrated tasks. Update it. */ - idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; - busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); + idle = idle_cpu(cpu); + busy = !idle && !sched_idle_cpu(cpu); } sd->last_balance = jiffies; interval = get_sd_balance_interval(sd, busy); } if (need_serialize) - spin_unlock(&balancing); + atomic_set_release(&sched_balance_running, 0); out: if (time_after(next_balance, sd->last_balance + interval)) { next_balance = sd->last_balance + interval; @@ -11901,7 +11968,7 @@ static void nohz_balancer_kick(struct rq *rq) * currently idle; in which case, kick the ILB to move tasks * around. * - * When balancing betwen cores, all the SMT siblings of the + * When balancing between cores, all the SMT siblings of the * preferred CPU must be idle. */ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { @@ -11918,7 +11985,7 @@ static void nohz_balancer_kick(struct rq *rq) * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU * to run the misfit task on. 
*/ - if (check_misfit_status(rq, sd)) { + if (check_misfit_status(rq)) { flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } @@ -12062,7 +12129,7 @@ void nohz_balance_enter_idle(int cpu) out: /* * Each time a cpu enter idle, we assume that it has blocked load and - * enable the periodic update of the load of idle cpus + * enable the periodic update of the load of idle CPUs */ WRITE_ONCE(nohz.has_blocked, 1); } @@ -12080,13 +12147,13 @@ static bool update_nohz_stats(struct rq *rq) if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick))) return true; - update_blocked_averages(cpu); + sched_balance_update_blocked_averages(cpu); return rq->has_blocked_load; } /* - * Internal function that runs load balance for all idle cpus. The load balance + * Internal function that runs load balance for all idle CPUs. The load balance * can be a simple update of blocked load or a complete load balance with * tasks movement depending of flags. */ @@ -12162,7 +12229,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) rq_unlock_irqrestore(rq, &rf); if (flags & NOHZ_BALANCE_KICK) - rebalance_domains(rq, CPU_IDLE); + sched_balance_domains(rq, CPU_IDLE); } if (time_after(next_balance, rq->next_balance)) { @@ -12191,7 +12258,7 @@ abort: /* * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the - * rebalancing for all the cpus for whom scheduler ticks are stopped. + * rebalancing for all the CPUs for whom scheduler ticks are stopped. */ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { @@ -12222,7 +12289,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) * called from this function on (this) CPU that's not yet in the mask. 
That's * OK because the goal of nohz_run_idle_balance() is to run ILB only for * updating the blocked load of already idle CPUs without waking up one of - * those idle CPUs and outside the preempt disable / irq off phase of the local + * those idle CPUs and outside the preempt disable / IRQ off phase of the local * cpu about to enter idle, because it can take a long time. */ void nohz_run_idle_balance(int cpu) @@ -12233,7 +12300,7 @@ void nohz_run_idle_balance(int cpu) /* * Update the blocked load only if no SCHED_SOFTIRQ is about to happen - * (ie NOHZ_STATS_KICK set) and will do the same. + * (i.e. NOHZ_STATS_KICK set) and will do the same. */ if ((flags == NOHZ_NEWILB_KICK) && !need_resched()) _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK); @@ -12278,7 +12345,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } #endif /* CONFIG_NO_HZ_COMMON */ /* - * newidle_balance is called by schedule() if this_cpu is about to become + * sched_balance_newidle is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. * * Returns: @@ -12286,10 +12353,11 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } * 0 - failed, no new tasks * > 0 - success, new (fair) tasks present */ -static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) +static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; + int continue_balancing = 1; u64 t0, t1, curr_cost = 0; struct sched_domain *sd; int pulled_task = 0; @@ -12304,8 +12372,9 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) return 0; /* - * We must set idle_stamp _before_ calling idle_balance(), such that we - * measure the duration of idle_balance() as idle time. + * We must set idle_stamp _before_ calling sched_balance_rq() + * for CPU_NEWLY_IDLE, such that we measure the this duration + * as idle time. 
*/ this_rq->idle_stamp = rq_clock(this_rq); @@ -12326,7 +12395,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); - if (!READ_ONCE(this_rq->rd->overload) || + if (!get_rd_overloaded(this_rq->rd) || (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) { if (sd) @@ -12340,11 +12409,10 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) raw_spin_rq_unlock(this_rq); t0 = sched_clock_cpu(this_cpu); - update_blocked_averages(this_cpu); + sched_balance_update_blocked_averages(this_cpu); rcu_read_lock(); for_each_domain(this_cpu, sd) { - int continue_balancing = 1; u64 domain_cost; update_next_balance(sd, &next_balance); @@ -12354,7 +12422,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) if (sd->flags & SD_BALANCE_NEWIDLE) { - pulled_task = load_balance(this_cpu, this_rq, + pulled_task = sched_balance_rq(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, &continue_balancing); @@ -12370,8 +12438,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) * Stop searching for tasks to pull if there are * now runnable tasks on this rq. */ - if (pulled_task || this_rq->nr_running > 0 || - this_rq->ttwu_pending) + if (pulled_task || !continue_balancing) break; } rcu_read_unlock(); @@ -12409,19 +12476,21 @@ out: } /* - * run_rebalance_domains is triggered when needed from the scheduler tick. - * Also triggered for nohz idle balancing (with nohz_balancing_kick set). 
+ * This softirq handler is triggered via SCHED_SOFTIRQ from two places: + * + * - directly from the local scheduler_tick() for periodic load balancing + * + * - indirectly from a remote scheduler_tick() for NOHZ idle balancing + * through the SMP cross-call nohz_csd_func() */ -static __latent_entropy void run_rebalance_domains(struct softirq_action *h) +static __latent_entropy void sched_balance_softirq(struct softirq_action *h) { struct rq *this_rq = this_rq(); - enum cpu_idle_type idle = this_rq->idle_balance ? - CPU_IDLE : CPU_NOT_IDLE; - + enum cpu_idle_type idle = this_rq->idle_balance; /* - * If this CPU has a pending nohz_balance_kick, then do the + * If this CPU has a pending NOHZ_BALANCE_KICK, then do the * balancing on behalf of the other idle CPUs whose ticks are - * stopped. Do nohz_idle_balance *before* rebalance_domains to + * stopped. Do nohz_idle_balance *before* sched_balance_domains to * give the idle CPUs a chance to load balance. Else we may * load balance only within the local sched_domain hierarchy * and abort nohz_idle_balance altogether if we pull some load. @@ -12430,14 +12499,14 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) return; /* normal load balance */ - update_blocked_averages(this_rq->cpu); - rebalance_domains(this_rq, idle); + sched_balance_update_blocked_averages(this_rq->cpu); + sched_balance_domains(this_rq, idle); } /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 
*/ -void trigger_load_balance(struct rq *rq) +void sched_balance_trigger(struct rq *rq) { /* * Don't need to rebalance while attached to NULL domain or @@ -12621,7 +12690,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_numa(rq, curr); update_misfit_status(curr, rq); - update_overutilized_status(task_rq(curr)); + check_update_overutilized_status(task_rq(curr)); task_tick_core(rq, curr); } @@ -12641,6 +12710,8 @@ static void task_fork_fair(struct task_struct *p) rq_lock(rq, &rf); update_rq_clock(rq); + set_task_max_allowed_capacity(p); + cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; if (curr) @@ -12764,6 +12835,8 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) { attach_task_cfs_rq(p); + set_task_max_allowed_capacity(p); + if (task_on_rq_queued(p)) { /* * We were most likely switched from sched_rt, so @@ -13135,7 +13208,7 @@ DEFINE_SCHED_CLASS(fair) = { .rq_offline = rq_offline_fair, .task_dead = task_dead_fair, - .set_cpus_allowed = set_cpus_allowed_common, + .set_cpus_allowed = set_cpus_allowed_fair, #endif .task_tick = task_tick_fair, @@ -13215,7 +13288,7 @@ __init void init_sched_fair_class(void) #endif } - open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); + open_softirq(SCHED_SOFTIRQ, sched_balance_softirq); #ifdef CONFIG_NO_HZ_COMMON nohz.next_balance = jiffies; diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 52c8f8226b0d..ca9da66cc894 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -379,7 +379,7 @@ void calc_global_load(void) } /* - * Called from scheduler_tick() to periodically update this CPU's + * Called from sched_tick() to periodically update this CPU's * active count. 
*/ void calc_global_load_tick(struct rq *this_rq) diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 63b6cf898220..ef00382de595 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -208,8 +208,8 @@ ___update_load_sum(u64 now, struct sched_avg *sa, * se has been already dequeued but cfs_rq->curr still points to it. * This means that weight will be 0 but not running for a sched_entity * but also for a cfs_rq if the latter becomes idle. As an example, - * this happens during idle_balance() which calls - * update_blocked_averages(). + * this happens during sched_balance_newidle() which calls + * sched_balance_update_blocked_averages(). * * Also see the comment in accumulate_sum(). */ @@ -384,30 +384,30 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) return 0; } -#ifdef CONFIG_SCHED_THERMAL_PRESSURE +#ifdef CONFIG_SCHED_HW_PRESSURE /* - * thermal: + * hardware: * * load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked * * util_avg and runnable_load_avg are not supported and meaningless. * * Unlike rt/dl utilization tracking that track time spent by a cpu - * running a rt/dl task through util_avg, the average thermal pressure is - * tracked through load_avg. This is because thermal pressure signal is + * running a rt/dl task through util_avg, the average HW pressure is + * tracked through load_avg. This is because HW pressure signal is * time weighted "delta" capacity unlike util_avg which is binary. * "delta capacity" = actual capacity - - * capped capacity a cpu due to a thermal event. + * capped capacity a cpu due to a HW event. 
*/ -int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) { - if (___update_load_sum(now, &rq->avg_thermal, + if (___update_load_sum(now, &rq->avg_hw, capacity, capacity, capacity)) { - ___update_load_avg(&rq->avg_thermal, 1); - trace_pelt_thermal_tp(rq); + ___update_load_avg(&rq->avg_hw, 1); + trace_pelt_hw_tp(rq); return 1; } diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 9e1083465fbc..2150062949d4 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -7,21 +7,21 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); -#ifdef CONFIG_SCHED_THERMAL_PRESSURE -int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity); +#ifdef CONFIG_SCHED_HW_PRESSURE +int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity); -static inline u64 thermal_load_avg(struct rq *rq) +static inline u64 hw_load_avg(struct rq *rq) { - return READ_ONCE(rq->avg_thermal.load_avg); + return READ_ONCE(rq->avg_hw.load_avg); } #else static inline int -update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) { return 0; } -static inline u64 thermal_load_avg(struct rq *rq) +static inline u64 hw_load_avg(struct rq *rq) { return 0; } @@ -202,12 +202,12 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) } static inline int -update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) { return 0; } -static inline u64 thermal_load_avg(struct rq *rq) +static inline u64 hw_load_avg(struct rq *rq) { return 0; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ae50f212775e..a831af102070 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -112,6 +112,20 @@ extern int sysctl_sched_rt_runtime; extern int 
sched_rr_timeslice; /* + * Asymmetric CPU capacity bits + */ +struct asym_cap_data { + struct list_head link; + struct rcu_head rcu; + unsigned long capacity; + unsigned long cpus[]; +}; + +extern struct list_head asym_cap_list; + +#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus) + +/* * Helpers for converting nanosecond timing to jiffy resolution */ #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) @@ -701,7 +715,7 @@ struct rt_rq { } highest_prio; #endif #ifdef CONFIG_SMP - int overloaded; + bool overloaded; struct plist_head pushable_tasks; #endif /* CONFIG_SMP */ @@ -745,7 +759,7 @@ struct dl_rq { u64 next; } earliest_dl; - int overloaded; + bool overloaded; /* * Tasks on this rq that can be pushed away. They are kept in @@ -838,10 +852,6 @@ struct perf_domain { struct rcu_head rcu; }; -/* Scheduling group status flags */ -#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ -#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ - /* * We add the notion of a root-domain which will be used to define per-domain * variables. Each exclusive cpuset essentially defines an island domain by @@ -862,10 +872,10 @@ struct root_domain { * - More than one runnable task * - Running task is misfit */ - int overload; + bool overloaded; /* Indicate one or more cpus over-utilized (tipping point) */ - int overutilized; + bool overutilized; /* * The bit corresponding to a CPU gets set here if such CPU has more @@ -905,8 +915,6 @@ struct root_domain { cpumask_var_t rto_mask; struct cpupri cpupri; - unsigned long max_cpu_capacity; - /* * NULL-terminated list of performance domains intersecting with the * CPUs of the rd. Protected by RCU. 
@@ -920,6 +928,17 @@ extern void rq_attach_root(struct rq *rq, struct root_domain *rd); extern void sched_get_rd(struct root_domain *rd); extern void sched_put_rd(struct root_domain *rd); +static inline int get_rd_overloaded(struct root_domain *rd) +{ + return READ_ONCE(rd->overloaded); +} + +static inline void set_rd_overloaded(struct root_domain *rd, int status) +{ + if (get_rd_overloaded(rd) != status) + WRITE_ONCE(rd->overloaded, status); +} + #ifdef HAVE_RT_PUSH_IPI extern void rto_push_irq_work_func(struct irq_work *work); #endif @@ -1091,8 +1110,8 @@ struct rq { #ifdef CONFIG_HAVE_SCHED_AVG_IRQ struct sched_avg avg_irq; #endif -#ifdef CONFIG_SCHED_THERMAL_PRESSURE - struct sched_avg avg_thermal; +#ifdef CONFIG_SCHED_HW_PRESSURE + struct sched_avg avg_hw; #endif u64 idle_stamp; u64 avg_idle; @@ -1533,24 +1552,6 @@ static inline u64 rq_clock_task(struct rq *rq) return rq->clock_task; } -/** - * By default the decay is the default pelt decay period. - * The decay shift can change the decay period in - * multiples of 32. 
- * Decay shift Decay period(ms) - * 0 32 - * 1 64 - * 2 128 - * 3 256 - * 4 512 - */ -extern int sched_thermal_decay_shift; - -static inline u64 rq_clock_thermal(struct rq *rq) -{ - return rq_clock_task(rq) >> sched_thermal_decay_shift; -} - static inline void rq_clock_skip_update(struct rq *rq) { lockdep_assert_rq_held(rq); @@ -2399,7 +2400,7 @@ extern struct task_struct *pick_next_task_idle(struct rq *rq); extern void update_group_capacity(struct sched_domain *sd, int cpu); -extern void trigger_load_balance(struct rq *rq); +extern void sched_balance_trigger(struct rq *rq); extern void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx); @@ -2519,10 +2520,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count) } #ifdef CONFIG_SMP - if (prev_nr < 2 && rq->nr_running >= 2) { - if (!READ_ONCE(rq->rd->overload)) - WRITE_ONCE(rq->rd->overload, 1); - } + if (prev_nr < 2 && rq->nr_running >= 2) + set_rd_overloaded(rq->rd, 1); #endif sched_update_tick_dependency(rq); @@ -2906,7 +2905,7 @@ extern void cfs_bandwidth_usage_dec(void); #define NOHZ_NEWILB_KICK_BIT 2 #define NOHZ_NEXT_KICK_BIT 3 -/* Run rebalance_domains() */ +/* Run sched_balance_domains() */ #define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) /* Update blocked load */ #define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT) diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 857f837f52cb..78e48f5426ee 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -113,7 +113,7 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, * Bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ -#define SCHEDSTAT_VERSION 15 +#define SCHEDSTAT_VERSION 16 static int show_schedstat(struct seq_file *seq, void *v) { @@ -150,8 +150,7 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "domain%d %*pb", dcount++, cpumask_pr_args(sched_domain_span(sd))); - for (itype = 
CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; - itype++) { + for (itype = 0; itype < CPU_MAX_IDLE_TYPES; itype++) { seq_printf(seq, " %u %u %u %u %u %u %u %u", sd->lb_count[itype], sd->lb_balanced[itype], diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 99ea5986038c..63aecd2a7a9f 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1330,23 +1330,12 @@ next: } /* - * Asymmetric CPU capacity bits - */ -struct asym_cap_data { - struct list_head link; - unsigned long capacity; - unsigned long cpus[]; -}; - -/* * Set of available CPUs grouped by their corresponding capacities * Each list entry contains a CPU mask reflecting CPUs that share the same * capacity. * The lifespan of data is unlimited. */ -static LIST_HEAD(asym_cap_list); - -#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus) +LIST_HEAD(asym_cap_list); /* * Verify whether there is any CPU capacity asymmetry in a given sched domain. @@ -1386,21 +1375,39 @@ asym_cpu_capacity_classify(const struct cpumask *sd_span, } +static void free_asym_cap_entry(struct rcu_head *head) +{ + struct asym_cap_data *entry = container_of(head, struct asym_cap_data, rcu); + kfree(entry); +} + static inline void asym_cpu_capacity_update_data(int cpu) { unsigned long capacity = arch_scale_cpu_capacity(cpu); - struct asym_cap_data *entry = NULL; + struct asym_cap_data *insert_entry = NULL; + struct asym_cap_data *entry; + /* + * Search if capacity already exits. If not, track which the entry + * where we should insert to keep the list ordered descendingly. 
+ */ list_for_each_entry(entry, &asym_cap_list, link) { if (capacity == entry->capacity) goto done; + else if (!insert_entry && capacity > entry->capacity) + insert_entry = list_prev_entry(entry, link); } entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL); if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n")) return; entry->capacity = capacity; - list_add(&entry->link, &asym_cap_list); + + /* If NULL then the new capacity is the smallest, add last. */ + if (!insert_entry) + list_add_tail_rcu(&entry->link, &asym_cap_list); + else + list_add_rcu(&entry->link, &insert_entry->link); done: __cpumask_set_cpu(cpu, cpu_capacity_span(entry)); } @@ -1423,8 +1430,8 @@ static void asym_cpu_capacity_scan(void) list_for_each_entry_safe(entry, next, &asym_cap_list, link) { if (cpumask_empty(cpu_capacity_span(entry))) { - list_del(&entry->link); - kfree(entry); + list_del_rcu(&entry->link); + call_rcu(&entry->rcu, free_asym_cap_entry); } } @@ -1434,8 +1441,8 @@ static void asym_cpu_capacity_scan(void) */ if (list_is_singular(&asym_cap_list)) { entry = list_first_entry(&asym_cap_list, typeof(*entry), link); - list_del(&entry->link); - kfree(entry); + list_del_rcu(&entry->link); + call_rcu(&entry->rcu, free_asym_cap_entry); } } @@ -2507,16 +2514,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { - unsigned long capacity; - rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); - capacity = arch_scale_cpu_capacity(i); - /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ - if (capacity > READ_ONCE(d.rd->max_cpu_capacity)) - WRITE_ONCE(d.rd->max_cpu_capacity, capacity); - cpu_attach_domain(sd, d.rd, i); if (lowest_flag_domain(i, SD_CLUSTER)) @@ -2530,10 +2530,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att if (has_cluster) static_branch_inc_cpuslocked(&sched_cluster_active); - if (rq && sched_debug_verbose) { - 
pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", - cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); - } + if (rq && sched_debug_verbose) + pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map)); ret = 0; error: diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 3baf2fbe6848..e394d6d5b9b5 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -2488,7 +2488,7 @@ void update_process_times(int user_tick) if (in_irq()) irq_work_tick(); #endif - scheduler_tick(); + sched_tick(); if (IS_ENABLED(CONFIG_POSIX_TIMERS)) run_posix_cpu_timers(); } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d2dbe099286b..80882ae43261 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1468,7 +1468,7 @@ void wq_worker_sleeping(struct task_struct *task) * wq_worker_tick - a scheduler tick occurred while a kworker is running * @task: task currently running * - * Called from scheduler_tick(). We're in the IRQ context and the current + * Called from sched_tick(). We're in the IRQ context and the current * worker's fields which follow the 'K' locking rule can be accessed safely. */ void wq_worker_tick(struct task_struct *task) |