diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-08-01 20:49:06 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-08-01 20:49:06 +0200 |
commit | b167fdffe9e737007cbf7c691cde5fa489ca58d7 (patch) | |
tree | 05f36cf3d602cf7e69d22e2737dd4e8fb9849bfa /kernel/sched/sched.h | |
parent | Merge tag 'slab-for-5.20_or_6.0' of git://git.kernel.org/pub/scm/linux/kernel... (diff) | |
parent | rseq: Kill process when unknown flags are encountered in ABI structures (diff) | |
download | linux-b167fdffe9e737007cbf7c691cde5fa489ca58d7.tar.xz linux-b167fdffe9e737007cbf7c691cde5fa489ca58d7.zip |
Merge tag 'sched-core-2022-08-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
"Load-balancing improvements:
- Improve NUMA balancing on AMD Zen systems for affine workloads.
- Improve the handling of reduced-capacity CPUs in load-balancing.
- Energy Model improvements: fix & refine all the energy fairness
metrics (PELT), and remove the conservative threshold requiring 6%
energy savings to migrate a task. Doing this improves power
efficiency for most workloads, and also increases the reliability
of energy-efficiency scheduling.
- Optimize/tweak select_idle_cpu() to spend (much) less time
searching for an idle CPU on overloaded systems. There's reports of
several milliseconds spent there on large systems with large
workloads ...
[ Since the search logic changed, there might be behavioral side
effects. ]
- Improve NUMA imbalance behavior. On certain systems with spare
capacity, initial placement of tasks is non-deterministic, and such
an artificial placement imbalance can persist for a long time,
hurting (and sometimes helping) performance.
The fix is to make fork-time task placement consistent with runtime
NUMA balancing placement.
Note that some performance regressions were reported against this,
caused by workloads that are not memory bandwith limited, which
benefit from the artificial locality of the placement bug(s). Mel
Gorman's conclusion, with which we concur, was that consistency is
better than random workload benefits from non-deterministic bugs:
"Given there is no crystal ball and it's a tradeoff, I think
it's better to be consistent and use similar logic at both fork
time and runtime even if it doesn't have universal benefit."
- Improve core scheduling by fixing a bug in
sched_core_update_cookie() that caused unnecessary forced idling.
- Improve wakeup-balancing by allowing same-LLC wakeup of idle CPUs
for newly woken tasks.
- Fix a newidle balancing bug that introduced unnecessary wakeup
latencies.
ABI improvements/fixes:
- Do not check capabilities and do not issue capability check denial
messages when a scheduler syscall doesn't require privileges. (Such
as increasing niceness.)
- Add forced-idle accounting to cgroups too.
- Fix/improve the RSEQ ABI to not just silently accept unknown flags.
(No existing tooling is known to have learned to rely on the
previous behavior.)
- Depreciate the (unused) RSEQ_CS_FLAG_NO_RESTART_ON_* flags.
Optimizations:
- Optimize & simplify leaf_cfs_rq_list()
- Micro-optimize set_nr_{and_not,if}_polling() via try_cmpxchg().
Misc fixes & cleanups:
- Fix the RSEQ self-tests on RISC-V and Glibc 2.35 systems.
- Fix a full-NOHZ bug that can in some cases result in the tick not
being re-enabled when the last SCHED_RT task is gone from a
runqueue but there's still SCHED_OTHER tasks around.
- Various PREEMPT_RT related fixes.
- Misc cleanups & smaller fixes"
* tag 'sched-core-2022-08-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (32 commits)
rseq: Kill process when unknown flags are encountered in ABI structures
rseq: Deprecate RSEQ_CS_FLAG_NO_RESTART_ON_* flags
sched/core: Fix the bug that task won't enqueue into core tree when update cookie
nohz/full, sched/rt: Fix missed tick-reenabling bug in dequeue_task_rt()
sched/core: Always flush pending blk_plug
sched/fair: fix case with reduced capacity CPU
sched/core: Use try_cmpxchg in set_nr_{and_not,if}_polling
sched/core: add forced idle accounting for cgroups
sched/fair: Remove the energy margin in feec()
sched/fair: Remove task_util from effective utilization in feec()
sched/fair: Use the same cpumask per-PD throughout find_energy_efficient_cpu()
sched/fair: Rename select_idle_mask to select_rq_mask
sched, drivers: Remove max param from effective_cpu_util()/sched_cpu_util()
sched/fair: Decay task PELT values during wakeup migration
sched/fair: Provide u64 read for 32-bits arch helper
sched/fair: Introduce SIS_UTIL to search idle CPU based on sum of util_avg
sched: only perform capability check on privileged operation
sched: Remove unused function group_first_cpu()
sched/fair: Remove redundant word " *"
selftests/rseq: check if libc rseq support is registered
...
Diffstat (limited to 'kernel/sched/sched.h')
-rw-r--r-- | kernel/sched/sched.h | 63 |
1 files changed, 51 insertions, 12 deletions
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 47b89a0fc6e5..aad7f5ee9666 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -520,6 +520,45 @@ struct cfs_bandwidth { }; #endif /* CONFIG_CGROUP_SCHED */ +/* + * u64_u32_load/u64_u32_store + * + * Use a copy of a u64 value to protect against data race. This is only + * applicable for 32-bits architectures. + */ +#ifdef CONFIG_64BIT +# define u64_u32_load_copy(var, copy) var +# define u64_u32_store_copy(var, copy, val) (var = val) +#else +# define u64_u32_load_copy(var, copy) \ +({ \ + u64 __val, __val_copy; \ + do { \ + __val_copy = copy; \ + /* \ + * paired with u64_u32_store_copy(), ordering access \ + * to var and copy. \ + */ \ + smp_rmb(); \ + __val = var; \ + } while (__val != __val_copy); \ + __val; \ +}) +# define u64_u32_store_copy(var, copy, val) \ +do { \ + typeof(val) __val = (val); \ + var = __val; \ + /* \ + * paired with u64_u32_load_copy(), ordering access to var and \ + * copy. \ + */ \ + smp_wmb(); \ + copy = __val; \ +} while (0) +#endif +# define u64_u32_load(var) u64_u32_load_copy(var, var##_copy) +# define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val) + /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; @@ -560,7 +599,7 @@ struct cfs_rq { */ struct sched_avg avg; #ifndef CONFIG_64BIT - u64 load_last_update_time_copy; + u64 last_update_time_copy; #endif struct { raw_spinlock_t lock ____cacheline_aligned; @@ -609,6 +648,10 @@ struct cfs_rq { int runtime_enabled; s64 runtime_remaining; + u64 throttled_pelt_idle; +#ifndef CONFIG_64BIT + u64 throttled_pelt_idle_copy; +#endif u64 throttled_clock; u64 throttled_clock_pelt; u64 throttled_clock_pelt_time; @@ -981,6 +1024,12 @@ struct rq { u64 clock_task ____cacheline_aligned; u64 clock_pelt; unsigned long lost_idle_time; + u64 clock_pelt_idle; + u64 clock_idle; +#ifndef CONFIG_64BIT + u64 clock_pelt_idle_copy; + u64 clock_idle_copy; +#endif atomic_t nr_iowait; @@ -1815,15 +1864,6 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg) return to_cpumask(sg->sgc->cpumask); } -/** - * group_first_cpu - Returns the first CPU in the cpumask of a sched_group. - * @group: The group whose first CPU is to be returned. - */ -static inline unsigned int group_first_cpu(struct sched_group *group) -{ - return cpumask_first(sched_group_span(group)); -} - extern int group_balance_cpu(struct sched_group *sg); #ifdef CONFIG_SCHED_DEBUG @@ -2044,7 +2084,6 @@ static inline int task_on_rq_migrating(struct task_struct *p) #define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ #define WF_MIGRATED 0x20 /* Internal use, task got migrated */ -#define WF_ON_CPU 0x40 /* Wakee is on_cpu */ #ifdef CONFIG_SMP static_assert(WF_EXEC == SD_BALANCE_EXEC); @@ -2852,7 +2891,7 @@ enum cpu_util_type { }; unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, - unsigned long max, enum cpu_util_type type, + enum cpu_util_type type, struct task_struct *p); static inline unsigned long cpu_bw_dl(struct rq *rq) |