Merge tag 'sched-core-2023-06-27' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar: "Scheduler SMP load-balancer improvements: - Avoid unnecessary migrations within SMT domains on hybrid systems. Problem: On hybrid CPU systems, (processors with a mixture of higher-frequency SMT cores and lower-frequency non-SMT cores), under the old code lower-priority CPUs pulled tasks from the higher-priority cores if more than one SMT sibling was busy - resulting in many unnecessary task migrations. Solution: The new code improves the load balancer to recognize SMT cores with more than one busy sibling and allows lower-priority CPUs to pull tasks, which avoids superfluous migrations and lets lower-priority cores inspect all SMT siblings for the busiest queue. - Implement the 'runnable boosting' feature in the EAS balancer: consider CPU contention in frequency, EAS max util & load-balance busiest CPU selection. This improves CPU utilization for certain workloads, while leaves other key workloads unchanged. Scheduler infrastructure improvements: - Rewrite the scheduler topology setup code by consolidating it into the build_sched_topology() helper function and building it dynamically on the fly. - Resolve the local_clock() vs. noinstr complications by rewriting the code: provide separate sched_clock_noinstr() and local_clock_noinstr() functions to be used in instrumentation code, and make sure it is all instrumentation-safe. Fixes: - Fix a kthread_park() race with wait_woken() - Fix misc wait_task_inactive() bugs unearthed by the -rt merge: - Fix UP PREEMPT bug by unifying the SMP and UP implementations - Fix task_struct::saved_state handling - Fix various rq clock update bugs, unearthed by turning on the rq clock debugging code. - Fix the PSI WINDOW_MIN_US trigger limit, which was easy to trigger by creating enough cgroups, by removing the warnign and restricting window size triggers to PSI file write-permission or CAP_SYS_RESOURCE. - Propagate SMT flags in the topology when removing degenerate domain - Fix grub_reclaim() calculation bug in the deadline scheduler code - Avoid resetting the min update period when it is unnecessary, in psi_trigger_destroy(). - Don't balance a task to its current running CPU in load_balance(), which was possible on certain NUMA topologies with overlapping groups. - Fix the sched-debug printing of rq->nr_uninterruptible Cleanups: - Address various -Wmissing-prototype warnings, as a preparation to (maybe) enable this warning in the future. - Remove unused code - Mark more functions __init - Fix shadow-variable warnings" * tag 'sched-core-2023-06-27' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (50 commits) sched/core: Avoid multiple calling update_rq_clock() in __cfsb_csd_unthrottle() sched/core: Avoid double calling update_rq_clock() in __balance_push_cpu_stop() sched/core: Fixed missing rq clock update before calling set_rq_offline() sched/deadline: Update GRUB description in the documentation sched/deadline: Fix bandwidth reclaim equation in GRUB sched/wait: Fix a kthread_park race with wait_woken() sched/topology: Mark set_sched_topology() __init sched/fair: Rename variable cpu_util eff_util arm64/arch_timer: Fix MMIO byteswap sched/fair, cpufreq: Introduce 'runnable boosting' sched/fair: Refactor CPU utilization functions cpuidle: Use local_clock_noinstr() sched/clock: Provide local_clock_noinstr() x86/tsc: Provide sched_clock_noinstr() clocksource: hyper-v: Provide noinstr sched_clock() clocksource: hyper-v: Adjust hv_read_tsc_page_tsc() to avoid special casing U64_MAX x86/vdso: Fix gettimeofday masking math64: Always inline u128 version of mul_u64_u64_shr() s390/time: Provide sched_clock_noinstr() loongarch: Provide noinstr sched_clock_read() ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2023-06-27 23:03:21 +0200
committer: Linus Torvalds <torvalds@linux-foundation.org> 2023-06-27 23:03:21 +0200
commit: ed3b7923a816ded62dccef377c9ee346c7d3b1b4 (patch)
tree: 41d46fc399c231088a370e7b3e488a93342fa681 /arch/x86
parent: Merge tag 'x86_sgx_for_v6.5' of git://git.kernel.org/pub/scm/linux/kernel/git... (diff)
parent: sched/core: Avoid multiple calling update_rq_clock() in __cfsb_csd_unthrottle() (diff)
download: linux-ed3b7923a816ded62dccef377c9ee346c7d3b1b4.tar.xz
linux-ed3b7923a816ded62dccef377c9ee346c7d3b1b4.zip
8 files changed, 119 insertions, 98 deletions
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 49bb4f2bd300..88d9ef98e087 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -257,6 +257,11 @@ void hv_set_register(unsigned int reg, u64 value);
 u64 hv_get_non_nested_register(unsigned int reg);
 void hv_set_non_nested_register(unsigned int reg, u64 value);
 
+static __always_inline u64 hv_raw_get_register(unsigned int reg)
+{
+	return __rdmsr(reg);
+}
+
 #else /* CONFIG_HYPERV */
 static inline void hyperv_init(void) {}
 static inline void hyperv_setup_mmu_ops(void) {}
diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h
index 4cf6794f9d68..c81858d903dc 100644
--- a/arch/x86/include/asm/vdso/gettimeofday.h
+++ b/arch/x86/include/asm/vdso/gettimeofday.h
@@ -231,14 +231,19 @@ static u64 vread_pvclock(void)
 		ret = __pvclock_read_cycles(pvti, rdtsc_ordered());
 	} while (pvclock_read_retry(pvti, version));
 
-	return ret;
+	return ret & S64_MAX;
 }
 #endif
 
 #ifdef CONFIG_HYPERV_TIMER
 static u64 vread_hvclock(void)
 {
-	return hv_read_tsc_page(&hvclock_page);
+	u64 tsc, time;
+
+	if (hv_read_tsc_page_tsc(&hvclock_page, &tsc, &time))
+		return time & S64_MAX;
+
+	return U64_MAX;
 }
 #endif
 
@@ -246,7 +251,7 @@ static inline u64 __arch_get_hw_counter(s32 clock_mode,
 					const struct vdso_data *vd)
 {
 	if (likely(clock_mode == VDSO_CLOCKMODE_TSC))
-		return (u64)rdtsc_ordered();
+		return (u64)rdtsc_ordered() & S64_MAX;
 	/*
 	 * For any memory-mapped vclock type, we need to make sure that gcc
 	 * doesn't cleverly hoist a load before the mode check.  Otherwise we
@@ -284,6 +289,9 @@ static inline bool arch_vdso_clocksource_ok(const struct vdso_data *vd)
  * which can be invalidated asynchronously and indicate invalidation by
  * returning U64_MAX, which can be effectively tested by checking for a
  * negative value after casting it to s64.
+ *
+ * This effectively forces a S64_MAX mask on the calculations, unlike the
+ * U64_MAX mask normally used by x86 clocksources.
  */
 static inline bool arch_vdso_cycles_ok(u64 cycles)
 {
@@ -303,18 +311,29 @@ static inline bool arch_vdso_cycles_ok(u64 cycles)
  * @last. If not then use @last, which is the base time of the current
  * conversion period.
  *
- * This variant also removes the masking of the subtraction because the
- * clocksource mask of all VDSO capable clocksources on x86 is U64_MAX
- * which would result in a pointless operation. The compiler cannot
- * optimize it away as the mask comes from the vdso data and is not compile
- * time constant.
+ * This variant also uses a custom mask because while the clocksource mask of
+ * all the VDSO capable clocksources on x86 is U64_MAX, the above code uses
+ * U64_MASK as an exception value, additionally arch_vdso_cycles_ok() above
+ * declares everything with the MSB/Sign-bit set as invalid. Therefore the
+ * effective mask is S64_MAX.
  */
 static __always_inline
 u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
 {
-	if (cycles > last)
-		return (cycles - last) * mult;
-	return 0;
+	/*
+	 * Due to the MSB/Sign-bit being used as invald marker (see
+	 * arch_vdso_cycles_valid() above), the effective mask is S64_MAX.
+	 */
+	u64 delta = (cycles - last) & S64_MAX;
+
+	/*
+	 * Due to the above mentioned TSC wobbles, filter out negative motion.
+	 * Per the above masking, the effective sign bit is now bit 62.
+	 */
+	if (unlikely(delta & (1ULL << 62)))
+		return 0;
+
+	return delta * mult;
 }
 #define vdso_calc_delta vdso_calc_delta
 
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
index 670eb08b972a..ee4fe8cdb857 100644
--- a/arch/x86/kernel/itmt.c
+++ b/arch/x86/kernel/itmt.c
@@ -165,32 +165,19 @@ int arch_asym_cpu_priority(int cpu)
 
 /**
  * sched_set_itmt_core_prio() - Set CPU priority based on ITMT
- * @prio:	Priority of cpu core
- * @core_cpu:	The cpu number associated with the core
+ * @prio:	Priority of @cpu
+ * @cpu:	The CPU number
  *
  * The pstate driver will find out the max boost frequency
  * and call this function to set a priority proportional
- * to the max boost frequency. CPU with higher boost
+ * to the max boost frequency. CPUs with higher boost
  * frequency will receive higher priority.
  *
  * No need to rebuild sched domain after updating
  * the CPU priorities. The sched domains have no
  * dependency on CPU priorities.
  */
-void sched_set_itmt_core_prio(int prio, int core_cpu)
+void sched_set_itmt_core_prio(int prio, int cpu)
 {
-	int cpu, i = 1;
-
-	for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) {
-		int smt_prio;
-
-		/*
-		 * Ensure that the siblings are moved to the end
-		 * of the priority chain and only used when
-		 * all other high priority cpus are out of capacity.
-		 */
-		smt_prio = prio * smp_num_siblings / (i * i);
-		per_cpu(sched_core_priority, cpu) = smt_prio;
-		i++;
-	}
+	per_cpu(sched_core_priority, cpu) = prio;
 }
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 0f35d44c56fe..fb8f52149be9 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -71,7 +71,7 @@ static int kvm_set_wallclock(const struct timespec64 *now)
 	return -ENODEV;
 }
 
-static noinstr u64 kvm_clock_read(void)
+static u64 kvm_clock_read(void)
 {
 	u64 ret;
 
@@ -88,7 +88,7 @@ static u64 kvm_clock_get_cycles(struct clocksource *cs)
 
 static noinstr u64 kvm_sched_clock_read(void)
 {
-	return kvm_clock_read() - kvm_sched_clock_offset;
+	return pvclock_clocksource_read_nowd(this_cpu_pvti()) - kvm_sched_clock_offset;
 }
 
 static inline void kvm_sched_clock_init(bool stable)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8779a7ed3e87..ed2d51960a7d 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -602,7 +602,7 @@ static int x86_core_flags(void)
 #ifdef CONFIG_SCHED_SMT
 static int x86_smt_flags(void)
 {
-	return cpu_smt_flags() | x86_sched_itmt_flags();
+	return cpu_smt_flags();
 }
 #endif
 #ifdef CONFIG_SCHED_CLUSTER
@@ -613,50 +613,57 @@ static int x86_cluster_flags(void)
 #endif
 #endif
 
-static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-	{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-#ifdef CONFIG_SCHED_CLUSTER
-	{ cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) },
-#endif
-#ifdef CONFIG_SCHED_MC
-	{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
-#endif
-	{ NULL, },
-};
+/*
+ * Set if a package/die has multiple NUMA nodes inside.
+ * AMD Magny-Cours, Intel Cluster-on-Die, and Intel
+ * Sub-NUMA Clustering have this.
+ */
+static bool x86_has_numa_in_package;
 
-static struct sched_domain_topology_level x86_hybrid_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-	{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-#ifdef CONFIG_SCHED_MC
-	{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
-#endif
-	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
-	{ NULL, },
-};
+static struct sched_domain_topology_level x86_topology[6];
+
+static void __init build_sched_topology(void)
+{
+	int i = 0;
 
-static struct sched_domain_topology_level x86_topology[] = {
 #ifdef CONFIG_SCHED_SMT
-	{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
+	x86_topology[i++] = (struct sched_domain_topology_level){
+		cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT)
+	};
 #endif
 #ifdef CONFIG_SCHED_CLUSTER
-	{ cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) },
+	/*
+	 * For now, skip the cluster domain on Hybrid.
+	 */
+	if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) {
+		x86_topology[i++] = (struct sched_domain_topology_level){
+			cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS)
+		};
+	}
 #endif
 #ifdef CONFIG_SCHED_MC
-	{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
+	x86_topology[i++] = (struct sched_domain_topology_level){
+		cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC)
+	};
 #endif
-	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
-	{ NULL, },
-};
+	/*
+	 * When there is NUMA topology inside the package skip the DIE domain
+	 * since the NUMA domains will auto-magically create the right spanning
+	 * domains based on the SLIT.
+	 */
+	if (!x86_has_numa_in_package) {
+		x86_topology[i++] = (struct sched_domain_topology_level){
+			cpu_cpu_mask, SD_INIT_NAME(DIE)
+		};
+	}
 
-/*
- * Set if a package/die has multiple NUMA nodes inside.
- * AMD Magny-Cours, Intel Cluster-on-Die, and Intel
- * Sub-NUMA Clustering have this.
- */
-static bool x86_has_numa_in_package;
+	/*
+	 * There must be one trailing NULL entry left.
+	 */
+	BUG_ON(i >= ARRAY_SIZE(x86_topology)-1);
+
+	set_sched_topology(x86_topology);
+}
 
 void set_cpu_sibling_map(int cpu)
 {
@@ -1264,15 +1271,6 @@ void __init smp_prepare_cpus_common(void)
 		zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL);
 	}
 
-	/*
-	 * Set 'default' x86 topology, this matches default_topology() in that
-	 * it has NUMA nodes as a topology level. See also
-	 * native_smp_cpus_done().
-	 *
-	 * Must be done before set_cpus_sibling_map() is ran.
-	 */
-	set_sched_topology(x86_topology);
-
 	set_cpu_sibling_map(0);
 }
 
@@ -1393,13 +1391,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
 	pr_debug("Boot done\n");
 
 	calculate_max_logical_packages();
-
-	/* XXX for now assume numa-in-package and hybrid don't overlap */
-	if (x86_has_numa_in_package)
-		set_sched_topology(x86_numa_in_package_topology);
-	if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
-		set_sched_topology(x86_hybrid_topology);
-
+	build_sched_topology();
 	nmi_selftest();
 	impress_friends();
 	cache_aps_init();
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 1412b771651e..3425c6a943e4 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -69,12 +69,10 @@ static int __init tsc_early_khz_setup(char *buf)
 }
 early_param("tsc_early_khz", tsc_early_khz_setup);
 
-__always_inline void cyc2ns_read_begin(struct cyc2ns_data *data)
+__always_inline void __cyc2ns_read(struct cyc2ns_data *data)
 {
 	int seq, idx;
 
-	preempt_disable_notrace();
-
 	do {
 		seq = this_cpu_read(cyc2ns.seq.seqcount.sequence);
 		idx = seq & 1;
@@ -86,6 +84,12 @@ __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data)
 	} while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence)));
 }
 
+__always_inline void cyc2ns_read_begin(struct cyc2ns_data *data)
+{
+	preempt_disable_notrace();
+	__cyc2ns_read(data);
+}
+
 __always_inline void cyc2ns_read_end(void)
 {
 	preempt_enable_notrace();
@@ -115,18 +119,25 @@ __always_inline void cyc2ns_read_end(void)
  *                      -johnstul@us.ibm.com "math is hard, lets go shopping!"
  */
 
-static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc)
+static __always_inline unsigned long long __cycles_2_ns(unsigned long long cyc)
 {
 	struct cyc2ns_data data;
 	unsigned long long ns;
 
-	cyc2ns_read_begin(&data);
+	__cyc2ns_read(&data);
 
 	ns = data.cyc2ns_offset;
 	ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
 
-	cyc2ns_read_end();
+	return ns;
+}
 
+static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+	unsigned long long ns;
+	preempt_disable_notrace();
+	ns = __cycles_2_ns(cyc);
+	preempt_enable_notrace();
 	return ns;
 }
 
@@ -223,7 +234,7 @@ noinstr u64 native_sched_clock(void)
 		u64 tsc_now = rdtsc();
 
 		/* return the value in ns */
-		return cycles_2_ns(tsc_now);
+		return __cycles_2_ns(tsc_now);
 	}
 
 	/*
@@ -250,7 +261,7 @@ u64 native_sched_clock_from_tsc(u64 tsc)
 /* We need to define a real function for sched_clock, to override the
    weak default version */
 #ifdef CONFIG_PARAVIRT
-noinstr u64 sched_clock(void)
+noinstr u64 sched_clock_noinstr(void)
 {
 	return paravirt_sched_clock();
 }
@@ -260,11 +271,20 @@ bool using_native_sched_clock(void)
 	return static_call_query(pv_sched_clock) == native_sched_clock;
 }
 #else
-u64 sched_clock(void) __attribute__((alias("native_sched_clock")));
+u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock")));
 
 bool using_native_sched_clock(void) { return true; }
 #endif
 
+notrace u64 sched_clock(void)
+{
+	u64 now;
+	preempt_disable_notrace();
+	now = sched_clock_noinstr();
+	preempt_enable_notrace();
+	return now;
+}
+
 int check_tsc_unstable(void)
 {
 	return tsc_unstable;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 04b57a336b34..bc68a39efd70 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2799,14 +2799,13 @@ static u64 read_tsc(void)
 static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
 			  int *mode)
 {
-	long v;
 	u64 tsc_pg_val;
+	long v;
 
 	switch (clock->vclock_mode) {
 	case VDSO_CLOCKMODE_HVCLOCK:
-		tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
-						  tsc_timestamp);
-		if (tsc_pg_val != U64_MAX) {
+		if (hv_read_tsc_page_tsc(hv_get_tsc_page(),
+					 tsc_timestamp, &tsc_pg_val)) {
 			/* TSC page valid */
 			*mode = VDSO_CLOCKMODE_HVCLOCK;
 			v = (tsc_pg_val - clock->cycle_last) &
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index b74ac2562cfb..52fa5609b7f6 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -66,11 +66,10 @@ static noinstr u64 xen_sched_clock(void)
         struct pvclock_vcpu_time_info *src;
 	u64 ret;
 
-	preempt_disable_notrace();
 	src = &__this_cpu_read(xen_vcpu)->time;
 	ret = pvclock_clocksource_read_nowd(src);
 	ret -= xen_sched_clock_offset;
-	preempt_enable_notrace();
+
 	return ret;
 }
author	Linus Torvalds <torvalds@linux-foundation.org>	2023-06-27 23:03:21 +0200
committer	Linus Torvalds <torvalds@linux-foundation.org>	2023-06-27 23:03:21 +0200
commit	ed3b7923a816ded62dccef377c9ee346c7d3b1b4 (patch)
tree	41d46fc399c231088a370e7b3e488a93342fa681 /arch/x86
parent	Merge tag 'x86_sgx_for_v6.5' of git://git.kernel.org/pub/scm/linux/kernel/git... (diff)
parent	sched/core: Avoid multiple calling update_rq_clock() in __cfsb_csd_unthrottle() (diff)
download	linux-ed3b7923a816ded62dccef377c9ee346c7d3b1b4.tar.xz linux-ed3b7923a816ded62dccef377c9ee346c7d3b1b4.zip