Diffstat (limited to 'arch/s390/kernel/hiperdispatch.c')
-rw-r--r--  arch/s390/kernel/hiperdispatch.c  430
1 file changed, 430 insertions, 0 deletions
diff --git a/arch/s390/kernel/hiperdispatch.c b/arch/s390/kernel/hiperdispatch.c
new file mode 100644
index 000000000000..2a99a216ab62
--- /dev/null
+++ b/arch/s390/kernel/hiperdispatch.c
@@ -0,0 +1,430 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corp. 2024
+ */
+
+#define KMSG_COMPONENT "hd"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+/*
+ * Hiperdispatch:
+ * Dynamically calculates the optimum number of high capacity COREs
+ * by considering the state the system is in. When hiperdispatch decides
+ * that a capacity update is necessary, it schedules a topology update.
+ * During topology updates the CPU capacities are always re-adjusted.
+ *
+ * There are two places where CPU capacities are accessed within
+ * hiperdispatch.
+ * -> hiperdispatch's recurring work function reads CPU capacities to
+ *    determine the high capacity CPU count.
+ * -> during a topology update hiperdispatch's adjustment function
+ *    updates CPU capacities.
+ * These two can run on different CPUs in parallel, which can cause
+ * hiperdispatch to make wrong decisions. This can potentially cause
+ * some overhead by leading to extra rebuild_sched_domains() calls
+ * for correction. Access to capacities within hiperdispatch therefore
+ * has to be serialized to prevent this overhead.
+ *
+ * Hiperdispatch decision making revolves around steal time.
+ * The HD_STEAL_THRESHOLD value is taken as reference. Whenever steal
+ * time crosses the threshold value, hiperdispatch falls back to giving
+ * high capacity only to the entitled CPUs. When steal time drops below
+ * the threshold boundary, hiperdispatch utilizes all CPUs by giving all
+ * of them high capacity.
+ *
+ * The theory behind HD_STEAL_THRESHOLD is related to SMP thread
+ * performance. Comparing the throughput of:
+ * - a single CORE, with N threads, running N tasks
+ * - N separate COREs running N tasks,
+ * using individual COREs for individual tasks yields better
+ * performance. This performance difference is roughly 30% (it can
+ * change between machine generations).
+ *
+ * Hiperdispatch tries to hint the scheduler to use individual COREs
+ * for each task, as long as steal time on those COREs is less than
+ * 30%, thereby delaying the throughput loss caused by using SMP threads.
+ */
+
+#include <linux/cpumask.h>
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/kernel_stat.h>
+#include <linux/kstrtox.h>
+#include <linux/ktime.h>
+#include <linux/sysctl.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <asm/hiperdispatch.h>
+#include <asm/setup.h>
+#include <asm/smp.h>
+#include <asm/topology.h>
+
+#define CREATE_TRACE_POINTS
+#include <asm/trace/hiperdispatch.h>
+
+#define HD_DELAY_FACTOR (4) /* Initial delay is HD_DELAY_FACTOR * HD_DELAY_INTERVAL */
+#define HD_DELAY_INTERVAL (HZ / 4) /* Interval between steal time evaluations */
+#define HD_STEAL_THRESHOLD 30 /* Steal time threshold in percent */
+#define HD_STEAL_AVG_WEIGHT 16 /* Weight of the exponential moving average */
+
+static cpumask_t hd_vl_coremask; /* Mask containing all vertical low COREs */
+static cpumask_t hd_vmvl_cpumask; /* Mask containing vertical medium and low CPUs */
+static int hd_high_capacity_cores; /* Current CORE count with high capacity */
+static int hd_entitled_cores; /* Total vertical high and medium CORE count */
+static int hd_online_cores; /* Current online CORE count */
+
+static unsigned long hd_previous_steal; /* Previous iteration's CPU steal timer total */
+static unsigned long hd_high_time; /* Total time spent while all cpus have high capacity */
+static unsigned long hd_low_time; /* Total time spent while vl cpus have low capacity */
+static atomic64_t hd_adjustments; /* Total occurrence count of hiperdispatch adjustments */
+
+static unsigned int hd_steal_threshold = HD_STEAL_THRESHOLD;
+static unsigned int hd_delay_factor = HD_DELAY_FACTOR;
+static int hd_enabled;
+
+static void hd_capacity_work_fn(struct work_struct *work);
+static DECLARE_DELAYED_WORK(hd_capacity_work, hd_capacity_work_fn);
+
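+/* Return 1 if the hiperdispatch mode changed, 0 otherwise. */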
+static int hd_set_hiperdispatch_mode(int enable)
+{
+ if (!MACHINE_HAS_TOPOLOGY)
+ enable = 0;
+ if (hd_enabled == enable)
+ return 0;
+ hd_enabled = enable;
+ return 1;
+}
+
+void hd_reset_state(void)
+{
+ cpumask_clear(&hd_vl_coremask);
+ cpumask_clear(&hd_vmvl_cpumask);
+ hd_entitled_cores = 0;
+ hd_online_cores = 0;
+}
+
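+/*
+ * Account an online CORE, represented by one of its CPUs, according to
+ * its polarization: vertical high and medium COREs count as entitled,
+ * vertical medium and low CPUs are collected in hd_vmvl_cpumask, and
+ * vertical low COREs in hd_vl_coremask.
+ */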
+void hd_add_core(int cpu)
+{
+ const struct cpumask *siblings;
+ int polarization;
+
+ hd_online_cores++;
+ polarization = smp_cpu_get_polarization(cpu);
+ siblings = topology_sibling_cpumask(cpu);
+ switch (polarization) {
+ case POLARIZATION_VH:
+ hd_entitled_cores++;
+ break;
+ case POLARIZATION_VM:
+ hd_entitled_cores++;
+ cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
+ break;
+ case POLARIZATION_VL:
+ cpumask_set_cpu(cpu, &hd_vl_coremask);
+ cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
+ break;
+ }
+}
+
+/* Serialize update and read operations of debug counters. */
+static DEFINE_MUTEX(hd_counter_mutex);
+
+static void hd_update_times(void)
+{
+ static ktime_t prev;
+ ktime_t now;
+
+ /*
+ * Check if hiperdispatch is active; if not, set prev to 0.
+ * This way it is possible to differentiate the first update iteration after
+ * enabling hiperdispatch.
+ */
+ if (hd_entitled_cores == 0 || hd_enabled == 0) {
+ prev = ktime_set(0, 0);
+ return;
+ }
+ now = ktime_get();
+ if (ktime_after(prev, 0)) {
+ if (hd_high_capacity_cores == hd_online_cores)
+ hd_high_time += ktime_ms_delta(now, prev);
+ else
+ hd_low_time += ktime_ms_delta(now, prev);
+ }
+ prev = now;
+}
+
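+/*
+ * Re-adjust the capacities of the vertical low COREs: the first
+ * (hd_high_capacity_cores - hd_entitled_cores) of them are given high
+ * capacity, the remaining ones low capacity.
+ */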
+static void hd_update_capacities(void)
+{
+ int cpu, upscaling_cores;
+ unsigned long capacity;
+
+ upscaling_cores = hd_high_capacity_cores - hd_entitled_cores;
+ capacity = upscaling_cores > 0 ? CPU_CAPACITY_HIGH : CPU_CAPACITY_LOW;
+ hd_high_capacity_cores = hd_entitled_cores;
+ for_each_cpu(cpu, &hd_vl_coremask) {
+ smp_set_core_capacity(cpu, capacity);
+ if (capacity != CPU_CAPACITY_HIGH)
+ continue;
+ hd_high_capacity_cores++;
+ upscaling_cores--;
+ if (upscaling_cores == 0)
+ capacity = CPU_CAPACITY_LOW;
+ }
+}
+
+void hd_disable_hiperdispatch(void)
+{
+ cancel_delayed_work_sync(&hd_capacity_work);
+ hd_high_capacity_cores = hd_online_cores;
+ hd_previous_steal = 0;
+}
+
+int hd_enable_hiperdispatch(void)
+{
+ mutex_lock(&hd_counter_mutex);
+ hd_update_times();
+ mutex_unlock(&hd_counter_mutex);
+ if (hd_enabled == 0)
+ return 0;
+ if (hd_entitled_cores == 0)
+ return 0;
+ if (hd_online_cores <= hd_entitled_cores)
+ return 0;
+ mod_delayed_work(system_wq, &hd_capacity_work, HD_DELAY_INTERVAL * hd_delay_factor);
+ hd_update_capacities();
+ return 1;
+}
+
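+/*
+ * Exponentially weighted moving average of the steal time percentage;
+ * a new sample contributes with a weight of 1 / HD_STEAL_AVG_WEIGHT.
+ */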
+static unsigned long hd_steal_avg(unsigned long new)
+{
+ static unsigned long steal;
+
+ steal = (steal * (HD_STEAL_AVG_WEIGHT - 1) + new) / HD_STEAL_AVG_WEIGHT;
+ return steal;
+}
+
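+/*
+ * Calculate the average steal time percentage of the vertical medium
+ * and low CPUs since the previous invocation.
+ */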
+static unsigned long hd_calculate_steal_percentage(void)
+{
+ unsigned long time_delta, steal_delta, steal, percentage;
+ static ktime_t prev;
+ int cpus, cpu;
+ ktime_t now;
+
+ cpus = 0;
+ steal = 0;
+ percentage = 0;
+ for_each_cpu(cpu, &hd_vmvl_cpumask) {
+ steal += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
+ cpus++;
+ }
+ /*
+ * If there are no vertical medium or low CPUs, steal time
+ * is 0, as vertical high CPUs shouldn't experience steal time.
+ */
+ if (cpus == 0)
+ return percentage;
+ now = ktime_get();
+ time_delta = ktime_to_ns(ktime_sub(now, prev));
+ if (steal > hd_previous_steal && hd_previous_steal != 0) {
+ steal_delta = (steal - hd_previous_steal) * 100 / time_delta;
+ percentage = steal_delta / cpus;
+ }
+ hd_previous_steal = steal;
+ prev = now;
+ return percentage;
+}
+
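+/*
+ * Recurring work function: compare the averaged steal time percentage
+ * against the threshold and schedule a topology update whenever the
+ * desired high capacity CORE count changes.
+ */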
+static void hd_capacity_work_fn(struct work_struct *work)
+{
+ unsigned long steal_percentage, new_cores;
+
+ mutex_lock(&smp_cpu_state_mutex);
+ /*
+ * If the online core count is less than or equal to the entitled core
+ * count, hiperdispatch does not need to make any adjustments; schedule
+ * a topology update to disable hiperdispatch.
+ * Normally this check is handled on topology update, but during CPU
+ * hot unplug, topology and CPU mask updates are done in reverse
+ * order, causing hd_enable_hiperdispatch() to get stale data.
+ */
+ if (hd_online_cores <= hd_entitled_cores) {
+ topology_schedule_update();
+ mutex_unlock(&smp_cpu_state_mutex);
+ return;
+ }
+ steal_percentage = hd_steal_avg(hd_calculate_steal_percentage());
+ if (steal_percentage < hd_steal_threshold)
+ new_cores = hd_online_cores;
+ else
+ new_cores = hd_entitled_cores;
+ if (hd_high_capacity_cores != new_cores) {
+ trace_s390_hd_rebuild_domains(hd_high_capacity_cores, new_cores);
+ hd_high_capacity_cores = new_cores;
+ atomic64_inc(&hd_adjustments);
+ topology_schedule_update();
+ }
+ trace_s390_hd_work_fn(steal_percentage, hd_entitled_cores, hd_high_capacity_cores);
+ mutex_unlock(&smp_cpu_state_mutex);
+ schedule_delayed_work(&hd_capacity_work, HD_DELAY_INTERVAL);
+}
+
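+/* Handler for the s390.hiperdispatch sysctl; accepts only 0 or 1. */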
+static int hiperdispatch_ctl_handler(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int hiperdispatch;
+ int rc;
+ struct ctl_table ctl_entry = {
+ .procname = ctl->procname,
+ .data = &hiperdispatch,
+ .maxlen = sizeof(int),
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ };
+
+ hiperdispatch = hd_enabled;
+ rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos);
+ if (rc < 0 || !write)
+ return rc;
+ mutex_lock(&smp_cpu_state_mutex);
+ if (hd_set_hiperdispatch_mode(hiperdispatch))
+ topology_schedule_update();
+ mutex_unlock(&smp_cpu_state_mutex);
+ return 0;
+}
+
+static struct ctl_table hiperdispatch_ctl_table[] = {
+ {
+ .procname = "hiperdispatch",
+ .mode = 0644,
+ .proc_handler = hiperdispatch_ctl_handler,
+ },
+};
+
+static ssize_t hd_steal_threshold_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return sysfs_emit(buf, "%u\n", hd_steal_threshold);
+}
+
+static ssize_t hd_steal_threshold_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ unsigned int val;
+ int rc;
+
+ rc = kstrtouint(buf, 0, &val);
+ if (rc)
+ return rc;
+ if (val > 100)
+ return -ERANGE;
+ hd_steal_threshold = val;
+ return count;
+}
+
+static DEVICE_ATTR_RW(hd_steal_threshold);
+
+static ssize_t hd_delay_factor_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return sysfs_emit(buf, "%u\n", hd_delay_factor);
+}
+
+static ssize_t hd_delay_factor_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ unsigned int val;
+ int rc;
+
+ rc = kstrtouint(buf, 0, &val);
+ if (rc)
+ return rc;
+ if (!val)
+ return -ERANGE;
+ hd_delay_factor = val;
+ return count;
+}
+
+static DEVICE_ATTR_RW(hd_delay_factor);
+
+static struct attribute *hd_attrs[] = {
+ &dev_attr_hd_steal_threshold.attr,
+ &dev_attr_hd_delay_factor.attr,
+ NULL,
+};
+
+static const struct attribute_group hd_attr_group = {
+ .name = "hiperdispatch",
+ .attrs = hd_attrs,
+};
+
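+/*
+ * Debugfs counters: "greedy" time is the total time spent while all
+ * COREs had high capacity, "conservative" time is the total time spent
+ * while the vertical low COREs had low capacity.
+ */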
+static int hd_greedy_time_get(void *unused, u64 *val)
+{
+ mutex_lock(&hd_counter_mutex);
+ hd_update_times();
+ *val = hd_high_time;
+ mutex_unlock(&hd_counter_mutex);
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(hd_greedy_time_fops, hd_greedy_time_get, NULL, "%llu\n");
+
+static int hd_conservative_time_get(void *unused, u64 *val)
+{
+ mutex_lock(&hd_counter_mutex);
+ hd_update_times();
+ *val = hd_low_time;
+ mutex_unlock(&hd_counter_mutex);
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(hd_conservative_time_fops, hd_conservative_time_get, NULL, "%llu\n");
+
+static int hd_adjustment_count_get(void *unused, u64 *val)
+{
+ *val = atomic64_read(&hd_adjustments);
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(hd_adjustments_fops, hd_adjustment_count_get, NULL, "%llu\n");
+
+static void __init hd_create_debugfs_counters(void)
+{
+ struct dentry *dir;
+
+ dir = debugfs_create_dir("hiperdispatch", arch_debugfs_dir);
+ debugfs_create_file("conservative_time_ms", 0400, dir, NULL, &hd_conservative_time_fops);
+ debugfs_create_file("greedy_time_ms", 0400, dir, NULL, &hd_greedy_time_fops);
+ debugfs_create_file("adjustment_count", 0400, dir, NULL, &hd_adjustments_fops);
+}
+
+static void __init hd_create_attributes(void)
+{
+ struct device *dev;
+
+ dev = bus_get_dev_root(&cpu_subsys);
+ if (!dev)
+ return;
+ if (sysfs_create_group(&dev->kobj, &hd_attr_group))
+ pr_warn("Unable to create hiperdispatch attribute group\n");
+ put_device(dev);
+}
+
+static int __init hd_init(void)
+{
+ if (IS_ENABLED(CONFIG_HIPERDISPATCH_ON)) {
+ hd_set_hiperdispatch_mode(1);
+ topology_schedule_update();
+ }
+ if (!register_sysctl("s390", hiperdispatch_ctl_table))
+ pr_warn("Failed to register s390.hiperdispatch sysctl attribute\n");
+ hd_create_debugfs_counters();
+ hd_create_attributes();
+ return 0;
+}
+late_initcall(hd_init);