diff options
Diffstat (limited to 'arch/x86/kernel/smpboot.c')
-rw-r--r-- | arch/x86/kernel/smpboot.c | 294 |
1 files changed, 291 insertions, 3 deletions
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 69881b2d446c..fe3ab9632f3b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -147,6 +147,8 @@ static inline void smpboot_restore_warm_reset_vector(void) *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; } +static void init_freq_invariance(void); + /* * Report back to the Boot Processor during boot time or to the caller processor * during CPU online. @@ -183,6 +185,8 @@ static void smp_callin(void) */ set_cpu_sibling_map(raw_smp_processor_id()); + init_freq_invariance(); + /* * Get our bogomips. * Update loops_per_jiffy in cpu_data. Previous call to @@ -466,7 +470,7 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) */ static const struct x86_cpu_id snc_cpu[] = { - { X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_X }, + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, NULL), {} }; @@ -1337,7 +1341,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) set_sched_topology(x86_topology); set_cpu_sibling_map(0); - + init_freq_invariance(); smp_sanity_check(); switch (apic_intr_mode) { @@ -1434,7 +1438,7 @@ early_param("possible_cpus", _setup_possible_cpus); /* * cpu_possible_mask should be static, it cannot change as cpu's * are onlined, or offlined. The reason is per-cpu data-structures - * are allocated by some modules at init time, and dont expect to + * are allocated by some modules at init time, and don't expect to * do this dynamically on cpu arrival/departure. * cpu_present_mask on the other hand can change dynamically. * In case when cpu_hotplug is not compiled, then we resort to current @@ -1764,3 +1768,287 @@ void native_play_dead(void) } #endif + +/* + * APERF/MPERF frequency ratio computation. + * + * The scheduler wants to do frequency invariant accounting and needs a <1 + * ratio to account for the 'current' frequency, corresponding to + * freq_curr / freq_max. + * + * Since the frequency freq_curr on x86 is controlled by micro-controller and + * our P-state setting is little more than a request/hint, we need to observe + * the effective frequency 'BusyMHz', i.e. the average frequency over a time + * interval after discarding idle time. This is given by: + * + * BusyMHz = delta_APERF / delta_MPERF * freq_base + * + * where freq_base is the max non-turbo P-state. + * + * The freq_max term has to be set to a somewhat arbitrary value, because we + * can't know which turbo states will be available at a given point in time: + * it all depends on the thermal headroom of the entire package. We set it to + * the turbo level with 4 cores active. + * + * Benchmarks show that's a good compromise between the 1C turbo ratio + * (freq_curr/freq_max would rarely reach 1) and something close to freq_base, + * which would ignore the entire turbo range (a conspicuous part, making + * freq_curr/freq_max always maxed out). + * + * An exception to the heuristic above is the Atom uarch, where we choose the + * highest turbo level for freq_max since Atom's are generally oriented towards + * power efficiency. + * + * Setting freq_max to anything less than the 1C turbo ratio makes the ratio + * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1. + */ + +DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key); + +static DEFINE_PER_CPU(u64, arch_prev_aperf); +static DEFINE_PER_CPU(u64, arch_prev_mperf); +static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE; +static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE; + +void arch_set_max_freq_ratio(bool turbo_disabled) +{ + arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE : + arch_turbo_freq_ratio; +} + +static bool turbo_disabled(void) +{ + u64 misc_en; + int err; + + err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en); + if (err) + return false; + + return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); +} + +static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) +{ + int err; + + err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq); + if (err) + return false; + + err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */ + *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */ + + return true; +} + +#include <asm/cpu_device_id.h> +#include <asm/intel-family.h> + +#define ICPU(model) \ + {X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF, 0} + +static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = { + ICPU(INTEL_FAM6_XEON_PHI_KNL), + ICPU(INTEL_FAM6_XEON_PHI_KNM), + {} +}; + +static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = { + ICPU(INTEL_FAM6_SKYLAKE_X), + {} +}; + +static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = { + ICPU(INTEL_FAM6_ATOM_GOLDMONT), + ICPU(INTEL_FAM6_ATOM_GOLDMONT_D), + ICPU(INTEL_FAM6_ATOM_GOLDMONT_PLUS), + {} +}; + +static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, + int num_delta_fratio) +{ + int fratio, delta_fratio, found; + int err, i; + u64 msr; + + if (!x86_match_cpu(has_knl_turbo_ratio_limits)) + return false; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); + if (err) + return false; + + fratio = (msr >> 8) & 0xFF; + i = 16; + found = 0; + do { + if (found >= num_delta_fratio) { + *turbo_freq = fratio; + return true; + } + + delta_fratio = (msr >> (i + 5)) & 0x7; + + if (delta_fratio) { + found += 1; + fratio -= delta_fratio; + } + + i += 8; + } while (i < 64); + + return true; +} + +static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) +{ + u64 ratios, counts; + u32 group_size; + int err, i; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios); + if (err) + return false; + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts); + if (err) + return false; + + for (i = 0; i < 64; i += 8) { + group_size = (counts >> i) & 0xFF; + if (group_size >= size) { + *turbo_freq = (ratios >> i) & 0xFF; + return true; + } + } + + return false; +} + +static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) +{ + int err; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, turbo_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + *turbo_freq = (*turbo_freq >> 24) & 0xFF; /* 4C turbo */ + + return true; +} + +static bool intel_set_max_freq_ratio(void) +{ + u64 base_freq, turbo_freq; + + if (slv_set_max_freq_ratio(&base_freq, &turbo_freq)) + goto out; + + if (x86_match_cpu(has_glm_turbo_ratio_limits) && + skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) + goto out; + + if (knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) + goto out; + + if (x86_match_cpu(has_skx_turbo_ratio_limits) && + skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4)) + goto out; + + if (core_set_max_freq_ratio(&base_freq, &turbo_freq)) + goto out; + + return false; + +out: + arch_turbo_freq_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, + base_freq); + arch_set_max_freq_ratio(turbo_disabled()); + return true; +} + +static void init_counter_refs(void *arg) +{ + u64 aperf, mperf; + + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + + this_cpu_write(arch_prev_aperf, aperf); + this_cpu_write(arch_prev_mperf, mperf); +} + +static void init_freq_invariance(void) +{ + bool ret = false; + + if (smp_processor_id() != 0 || !boot_cpu_has(X86_FEATURE_APERFMPERF)) + return; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + ret = intel_set_max_freq_ratio(); + + if (ret) { + on_each_cpu(init_counter_refs, NULL, 1); + static_branch_enable(&arch_scale_freq_key); + } else { + pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n"); + } +} + +DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; + +void arch_scale_freq_tick(void) +{ + u64 freq_scale; + u64 aperf, mperf; + u64 acnt, mcnt; + + if (!arch_scale_freq_invariant()) + return; + + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + + acnt = aperf - this_cpu_read(arch_prev_aperf); + mcnt = mperf - this_cpu_read(arch_prev_mperf); + if (!mcnt) + return; + + this_cpu_write(arch_prev_aperf, aperf); + this_cpu_write(arch_prev_mperf, mperf); + + acnt <<= 2*SCHED_CAPACITY_SHIFT; + mcnt *= arch_max_freq_ratio; + + freq_scale = div64_u64(acnt, mcnt); + + if (freq_scale > SCHED_CAPACITY_SCALE) + freq_scale = SCHED_CAPACITY_SCALE; + + this_cpu_write(arch_freq_scale, freq_scale); +} |