diff options
41 files changed, 508 insertions, 754 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index c370f5f0eb38..5cde1ff32ff3 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2835,8 +2835,6 @@ nosync [HW,M68K] Disables sync negotiation for all devices. - notsc [BUGS=X86-32] Disable Time Stamp Counter - nowatchdog [KNL] Disable both lockup detectors, i.e. soft-lockup and NMI watchdog (hard-lockup). diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 3a2ba436721a..ad6d2a80cf05 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -92,9 +92,7 @@ APICs Timing notsc - Don't use the CPU time stamp counter to read the wall time. - This can be used to work around timing problems on multiprocessor systems - with not properly synchronized CPUs. + Deprecated, use tsc=unstable instead. nohpet Don't use the HPET timer. diff --git a/arch/arm/include/asm/mach/time.h b/arch/arm/include/asm/mach/time.h index 0f79e4dec7f9..4ac3a019a46f 100644 --- a/arch/arm/include/asm/mach/time.h +++ b/arch/arm/include/asm/mach/time.h @@ -13,7 +13,6 @@ extern void timer_tick(void); typedef void (*clock_access_fn)(struct timespec64 *); -extern int register_persistent_clock(clock_access_fn read_boot, - clock_access_fn read_persistent); +extern int register_persistent_clock(clock_access_fn read_persistent); #endif diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c index cf2701cb0de8..078b259ead4e 100644 --- a/arch/arm/kernel/time.c +++ b/arch/arm/kernel/time.c @@ -83,29 +83,18 @@ static void dummy_clock_access(struct timespec64 *ts) } static clock_access_fn __read_persistent_clock = dummy_clock_access; -static clock_access_fn __read_boot_clock = dummy_clock_access; void read_persistent_clock64(struct timespec64 *ts) { __read_persistent_clock(ts); } -void read_boot_clock64(struct timespec64 *ts) -{ - __read_boot_clock(ts); -} - -int __init register_persistent_clock(clock_access_fn read_boot, - clock_access_fn read_persistent) +int __init register_persistent_clock(clock_access_fn read_persistent) { /* Only allow the clockaccess functions to be registered once */ - if (__read_persistent_clock == dummy_clock_access && - __read_boot_clock == dummy_clock_access) { - if (read_boot) - __read_boot_clock = read_boot; + if (__read_persistent_clock == dummy_clock_access) { if (read_persistent) __read_persistent_clock = read_persistent; - return 0; } diff --git a/arch/arm/plat-omap/counter_32k.c b/arch/arm/plat-omap/counter_32k.c index 2438b96004c1..fcc5bfec8bd1 100644 --- a/arch/arm/plat-omap/counter_32k.c +++ b/arch/arm/plat-omap/counter_32k.c @@ -110,7 +110,7 @@ int __init omap_init_clocksource_32k(void __iomem *vbase) } sched_clock_register(omap_32k_read_sched_clock, 32, 32768); - register_persistent_clock(NULL, omap_read_persistent_clock64); + register_persistent_clock(omap_read_persistent_clock64); pr_info("OMAP clocksource: 32k_counter at 32768 Hz\n"); return 0; diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index cf561160ea88..e8766beee5ad 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -221,17 +221,22 @@ void read_persistent_clock64(struct timespec64 *ts) ext_to_timespec64(clk, ts); } -void read_boot_clock64(struct timespec64 *ts) +void __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, + struct timespec64 *boot_offset) { unsigned char clk[STORE_CLOCK_EXT_SIZE]; + struct timespec64 boot_time; __u64 delta; delta = initial_leap_seconds + TOD_UNIX_EPOCH; - memcpy(clk, tod_clock_base, 16); - *(__u64 *) &clk[1] -= delta; - if (*(__u64 *) &clk[1] > delta) + memcpy(clk, tod_clock_base, STORE_CLOCK_EXT_SIZE); + *(__u64 *)&clk[1] -= delta; + if (*(__u64 *)&clk[1] > delta) clk[0]--; - ext_to_timespec64(clk, ts); + ext_to_timespec64(clk, &boot_time); + + read_persistent_clock64(wall_time); + *boot_offset = timespec64_sub(*wall_time, boot_time); } static u64 read_tod_clock(struct clocksource *cs) diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index cf090e584202..7ed08a7c3398 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -76,4 +76,17 @@ #define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */ #define INTEL_FAM6_XEON_PHI_KNM 0x85 /* Knights Mill */ +/* Useful macros */ +#define INTEL_CPU_FAM_ANY(_family, _model, _driver_data) \ +{ \ + .vendor = X86_VENDOR_INTEL, \ + .family = _family, \ + .model = _model, \ + .feature = X86_FEATURE_ANY, \ + .driver_data = (kernel_ulong_t)&_driver_data \ +} + +#define INTEL_CPU_FAM6(_model, _driver_data) \ + INTEL_CPU_FAM_ANY(6, INTEL_FAM6_##_model, _driver_data) + #endif /* _ASM_X86_INTEL_FAMILY_H */ diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h index fe04491130ae..52f815a80539 100644 --- a/arch/x86/include/asm/intel-mid.h +++ b/arch/x86/include/asm/intel-mid.h @@ -80,35 +80,6 @@ enum intel_mid_cpu_type { extern enum intel_mid_cpu_type __intel_mid_cpu_chip; -/** - * struct intel_mid_ops - Interface between intel-mid & sub archs - * @arch_setup: arch_setup function to re-initialize platform - * structures (x86_init, x86_platform_init) - * - * This structure can be extended if any new interface is required - * between intel-mid & its sub arch files. - */ -struct intel_mid_ops { - void (*arch_setup)(void); -}; - -/* Helper API's for INTEL_MID_OPS_INIT */ -#define DECLARE_INTEL_MID_OPS_INIT(cpuname, cpuid) \ - [cpuid] = get_##cpuname##_ops - -/* Maximum number of CPU ops */ -#define MAX_CPU_OPS(a) (sizeof(a)/sizeof(void *)) - -/* - * For every new cpu addition, a weak get_<cpuname>_ops() function needs be - * declared in arch/x86/platform/intel_mid/intel_mid_weak_decls.h. - */ -#define INTEL_MID_OPS_INIT { \ - DECLARE_INTEL_MID_OPS_INIT(penwell, INTEL_MID_CPU_CHIP_PENWELL), \ - DECLARE_INTEL_MID_OPS_INIT(cloverview, INTEL_MID_CPU_CHIP_CLOVERVIEW), \ - DECLARE_INTEL_MID_OPS_INIT(tangier, INTEL_MID_CPU_CHIP_TANGIER) \ -}; - #ifdef CONFIG_X86_INTEL_MID static inline enum intel_mid_cpu_type intel_mid_identify_cpu(void) @@ -136,20 +107,6 @@ enum intel_mid_timer_options { extern enum intel_mid_timer_options intel_mid_timer_options; -/* - * Penwell uses spread spectrum clock, so the freq number is not exactly - * the same as reported by MSR based on SDM. - */ -#define FSB_FREQ_83SKU 83200 -#define FSB_FREQ_100SKU 99840 -#define FSB_FREQ_133SKU 133000 - -#define FSB_FREQ_167SKU 167000 -#define FSB_FREQ_200SKU 200000 -#define FSB_FREQ_267SKU 267000 -#define FSB_FREQ_333SKU 333000 -#define FSB_FREQ_400SKU 400000 - /* Bus Select SoC Fuse value */ #define BSEL_SOC_FUSE_MASK 0x7 /* FSB 133MHz */ diff --git a/arch/x86/include/asm/kvm_guest.h b/arch/x86/include/asm/kvm_guest.h deleted file mode 100644 index 46185263d9c2..000000000000 --- a/arch/x86/include/asm/kvm_guest.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_KVM_GUEST_H -#define _ASM_X86_KVM_GUEST_H - -int kvm_setup_vsyscall_timeinfo(void); - -#endif /* _ASM_X86_KVM_GUEST_H */ diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 3aea2658323a..4c723632c036 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -7,7 +7,6 @@ #include <uapi/asm/kvm_para.h> extern void kvmclock_init(void); -extern int kvm_register_clock(char *txt); #ifdef CONFIG_KVM_GUEST bool kvm_check_and_clear_guest_paused(void); diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 2ecd34e2d46c..e85ff65c43c3 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -37,5 +37,6 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len); extern void *text_poke(void *addr, const void *opcode, size_t len); extern int poke_int3_handler(struct pt_regs *regs); extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); +extern int after_bootmem; #endif /* _ASM_X86_TEXT_PATCHING_H */ diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 2701d221583a..eb5bbfeccb66 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -33,13 +33,13 @@ static inline cycles_t get_cycles(void) extern struct system_counterval_t convert_art_to_tsc(u64 art); extern struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns); -extern void tsc_early_delay_calibrate(void); +extern void tsc_early_init(void); extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); extern void mark_tsc_async_resets(char *reason); -extern unsigned long native_calibrate_cpu(void); +extern unsigned long native_calibrate_cpu_early(void); extern unsigned long native_calibrate_tsc(void); extern unsigned long long native_sched_clock_from_tsc(u64 tsc); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a481763a3776..014f214da581 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -668,6 +668,7 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode, local_irq_save(flags); memcpy(addr, opcode, len); local_irq_restore(flags); + sync_core(); /* Could also do a CLFLUSH here to speed up CPU recovery; but that causes hangs on some VIA CPUs. */ return addr; @@ -693,6 +694,12 @@ void *text_poke(void *addr, const void *opcode, size_t len) struct page *pages[2]; int i; + /* + * While boot memory allocator is runnig we cannot use struct + * pages as they are not yet initialized. + */ + BUG_ON(!after_bootmem); + if (!core_kernel_text((unsigned long)addr)) { pages[0] = vmalloc_to_page(addr); pages[1] = vmalloc_to_page(addr + PAGE_SIZE); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 38915fbfae73..b732438c1a1e 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -232,8 +232,6 @@ static void init_amd_k7(struct cpuinfo_x86 *c) } } - set_cpu_cap(c, X86_FEATURE_K7); - /* calling is from identify_secondary_cpu() ? */ if (!c->cpu_index) return; @@ -617,6 +615,14 @@ static void early_init_amd(struct cpuinfo_x86 *c) early_init_amd_mc(c); +#ifdef CONFIG_X86_32 + if (c->x86 == 6) + set_cpu_cap(c, X86_FEATURE_K7); +#endif + + if (c->x86 >= 0xf) + set_cpu_cap(c, X86_FEATURE_K8); + rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); /* @@ -863,9 +869,6 @@ static void init_amd(struct cpuinfo_x86 *c) init_amd_cacheinfo(c); - if (c->x86 >= 0xf) - set_cpu_cap(c, X86_FEATURE_K8); - if (cpu_has(c, X86_FEATURE_XMM2)) { unsigned long long val; int ret; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index df28e931d732..ba6b8bb1c036 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1019,6 +1019,24 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) } /* + * The NOPL instruction is supposed to exist on all CPUs of family >= 6; + * unfortunately, that's not true in practice because of early VIA + * chips and (more importantly) broken virtualizers that are not easy + * to detect. In the latter case it doesn't even *fail* reliably, so + * probing for it doesn't even work. Disable it completely on 32-bit + * unless we can find a reliable way to detect all the broken cases. + * Enable it explicitly on 64-bit for non-constant inputs of cpu_has(). + */ +static void detect_nopl(void) +{ +#ifdef CONFIG_X86_32 + setup_clear_cpu_cap(X86_FEATURE_NOPL); +#else + setup_force_cpu_cap(X86_FEATURE_NOPL); +#endif +} + +/* * Do minimum CPU detection early. * Fields really needed: vendor, cpuid_level, family, model, mask, * cache alignment. @@ -1092,6 +1110,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) */ if (!pgtable_l5_enabled()) setup_clear_cpu_cap(X86_FEATURE_LA57); + + detect_nopl(); } void __init early_cpu_init(void) @@ -1127,24 +1147,6 @@ void __init early_cpu_init(void) early_identify_cpu(&boot_cpu_data); } -/* - * The NOPL instruction is supposed to exist on all CPUs of family >= 6; - * unfortunately, that's not true in practice because of early VIA - * chips and (more importantly) broken virtualizers that are not easy - * to detect. In the latter case it doesn't even *fail* reliably, so - * probing for it doesn't even work. Disable it completely on 32-bit - * unless we can find a reliable way to detect all the broken cases. - * Enable it explicitly on 64-bit for non-constant inputs of cpu_has(). - */ -static void detect_nopl(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_X86_32 - clear_cpu_cap(c, X86_FEATURE_NOPL); -#else - set_cpu_cap(c, X86_FEATURE_NOPL); -#endif -} - static void detect_null_seg_behavior(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_64 @@ -1207,8 +1209,6 @@ static void generic_identify(struct cpuinfo_x86 *c) get_model_name(c); /* Default name */ - detect_nopl(c); - detect_null_seg_behavior(c); /* diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index e56c95be2808..eeea935e9bb5 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -37,15 +37,18 @@ static void bug_at(unsigned char *ip, int line) BUG(); } -static void __jump_label_transform(struct jump_entry *entry, - enum jump_label_type type, - void *(*poker)(void *, const void *, size_t), - int init) +static void __ref __jump_label_transform(struct jump_entry *entry, + enum jump_label_type type, + void *(*poker)(void *, const void *, size_t), + int init) { union jump_code_union code; const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; + if (early_boot_irqs_disabled) + poker = text_poke_early; + if (type == JUMP_LABEL_JMP) { if (init) { /* diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index a37bda38d205..09aaabb2bbf1 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -45,7 +45,6 @@ #include <asm/apic.h> #include <asm/apicdef.h> #include <asm/hypervisor.h> -#include <asm/kvm_guest.h> static int kvmapf = 1; @@ -66,15 +65,6 @@ static int __init parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); -static int kvmclock_vsyscall = 1; -static int __init parse_no_kvmclock_vsyscall(char *arg) -{ - kvmclock_vsyscall = 0; - return 0; -} - -early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); - static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); static DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64); static int has_steal_clock = 0; @@ -560,9 +550,6 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_set_eoi_write(kvm_guest_apic_eoi_write); - if (kvmclock_vsyscall) - kvm_setup_vsyscall_timeinfo(); - #ifdef CONFIG_SMP smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus; smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; @@ -628,6 +615,7 @@ const __initconst struct hypervisor_x86 x86_hyper_kvm = { .name = "KVM", .detect = kvm_detect, .type = X86_HYPER_KVM, + .init.init_platform = kvmclock_init, .init.guest_late_init = kvm_guest_init, .init.x2apic_available = kvm_para_available, }; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 3b8e7c13c614..d2edd7e6c294 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -23,30 +23,56 @@ #include <asm/apic.h> #include <linux/percpu.h> #include <linux/hardirq.h> -#include <linux/memblock.h> +#include <linux/cpuhotplug.h> #include <linux/sched.h> #include <linux/sched/clock.h> +#include <linux/mm.h> +#include <asm/hypervisor.h> #include <asm/mem_encrypt.h> #include <asm/x86_init.h> #include <asm/reboot.h> #include <asm/kvmclock.h> -static int kvmclock __ro_after_init = 1; -static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; -static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; -static u64 kvm_sched_clock_offset; +static int kvmclock __initdata = 1; +static int kvmclock_vsyscall __initdata = 1; +static int msr_kvm_system_time __ro_after_init = MSR_KVM_SYSTEM_TIME; +static int msr_kvm_wall_clock __ro_after_init = MSR_KVM_WALL_CLOCK; +static u64 kvm_sched_clock_offset __ro_after_init; -static int parse_no_kvmclock(char *arg) +static int __init parse_no_kvmclock(char *arg) { kvmclock = 0; return 0; } early_param("no-kvmclock", parse_no_kvmclock); -/* The hypervisor will put information about time periodically here */ -static struct pvclock_vsyscall_time_info *hv_clock; -static struct pvclock_wall_clock *wall_clock; +static int __init parse_no_kvmclock_vsyscall(char *arg) +{ + kvmclock_vsyscall = 0; + return 0; +} +early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); + +/* Aligned to page sizes to match whats mapped via vsyscalls to userspace */ +#define HV_CLOCK_SIZE (sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS) +#define HVC_BOOT_ARRAY_SIZE \ + (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info)) + +static struct pvclock_vsyscall_time_info + hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE); +static struct pvclock_wall_clock wall_clock; +static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); + +static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) +{ + return &this_cpu_read(hv_clock_per_cpu)->pvti; +} + +static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void) +{ + return this_cpu_read(hv_clock_per_cpu); +} /* * The wallclock is the time of day when we booted. Since then, some time may @@ -55,21 +81,10 @@ static struct pvclock_wall_clock *wall_clock; */ static void kvm_get_wallclock(struct timespec64 *now) { - struct pvclock_vcpu_time_info *vcpu_time; - int low, high; - int cpu; - - low = (int)slow_virt_to_phys(wall_clock); - high = ((u64)slow_virt_to_phys(wall_clock) >> 32); - - native_write_msr(msr_kvm_wall_clock, low, high); - - cpu = get_cpu(); - - vcpu_time = &hv_clock[cpu].pvti; - pvclock_read_wallclock(wall_clock, vcpu_time, now); - - put_cpu(); + wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock)); + preempt_disable(); + pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now); + preempt_enable(); } static int kvm_set_wallclock(const struct timespec64 *now) @@ -79,14 +94,10 @@ static int kvm_set_wallclock(const struct timespec64 *now) static u64 kvm_clock_read(void) { - struct pvclock_vcpu_time_info *src; u64 ret; - int cpu; preempt_disable_notrace(); - cpu = smp_processor_id(); - src = &hv_clock[cpu].pvti; - ret = pvclock_clocksource_read(src); + ret = pvclock_clocksource_read(this_cpu_pvti()); preempt_enable_notrace(); return ret; } @@ -112,11 +123,11 @@ static inline void kvm_sched_clock_init(bool stable) kvm_sched_clock_offset = kvm_clock_read(); pv_time_ops.sched_clock = kvm_sched_clock_read; - printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n", - kvm_sched_clock_offset); + pr_info("kvm-clock: using sched offset of %llu cycles", + kvm_sched_clock_offset); BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) > - sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time)); + sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time)); } /* @@ -130,19 +141,11 @@ static inline void kvm_sched_clock_init(bool stable) */ static unsigned long kvm_get_tsc_khz(void) { - struct pvclock_vcpu_time_info *src; - int cpu; - unsigned long tsc_khz; - - cpu = get_cpu(); - src = &hv_clock[cpu].pvti; - tsc_khz = pvclock_tsc_khz(src); - put_cpu(); setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); - return tsc_khz; + return pvclock_tsc_khz(this_cpu_pvti()); } -static void kvm_get_preset_lpj(void) +static void __init kvm_get_preset_lpj(void) { unsigned long khz; u64 lpj; @@ -156,49 +159,40 @@ static void kvm_get_preset_lpj(void) bool kvm_check_and_clear_guest_paused(void) { + struct pvclock_vsyscall_time_info *src = this_cpu_hvclock(); bool ret = false; - struct pvclock_vcpu_time_info *src; - int cpu = smp_processor_id(); - if (!hv_clock) + if (!src) return ret; - src = &hv_clock[cpu].pvti; - if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { - src->flags &= ~PVCLOCK_GUEST_STOPPED; + if ((src->pvti.flags & PVCLOCK_GUEST_STOPPED) != 0) { + src->pvti.flags &= ~PVCLOCK_GUEST_STOPPED; pvclock_touch_watchdogs(); ret = true; } - return ret; } struct clocksource kvm_clock = { - .name = "kvm-clock", - .read = kvm_clock_get_cycles, - .rating = 400, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .name = "kvm-clock", + .read = kvm_clock_get_cycles, + .rating = 400, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; EXPORT_SYMBOL_GPL(kvm_clock); -int kvm_register_clock(char *txt) +static void kvm_register_clock(char *txt) { - int cpu = smp_processor_id(); - int low, high, ret; - struct pvclock_vcpu_time_info *src; - - if (!hv_clock) - return 0; + struct pvclock_vsyscall_time_info *src = this_cpu_hvclock(); + u64 pa; - src = &hv_clock[cpu].pvti; - low = (int)slow_virt_to_phys(src) | 1; - high = ((u64)slow_virt_to_phys(src) >> 32); - ret = native_write_msr_safe(msr_kvm_system_time, low, high); - printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", - cpu, high, low, txt); + if (!src) + return; - return ret; + pa = slow_virt_to_phys(&src->pvti) | 0x01ULL; + wrmsrl(msr_kvm_system_time, pa); + pr_info("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt); } static void kvm_save_sched_clock_state(void) @@ -213,11 +207,7 @@ static void kvm_restore_sched_clock_state(void) #ifdef CONFIG_X86_LOCAL_APIC static void kvm_setup_secondary_clock(void) { - /* - * Now that the first cpu already had this clocksource initialized, - * we shouldn't fail. - */ - WARN_ON(kvm_register_clock("secondary cpu clock")); + kvm_register_clock("secondary cpu clock"); } #endif @@ -245,100 +235,84 @@ static void kvm_shutdown(void) native_machine_shutdown(); } -static phys_addr_t __init kvm_memblock_alloc(phys_addr_t size, - phys_addr_t align) +static int __init kvm_setup_vsyscall_timeinfo(void) { - phys_addr_t mem; +#ifdef CONFIG_X86_64 + u8 flags; - mem = memblock_alloc(size, align); - if (!mem) + if (!per_cpu(hv_clock_per_cpu, 0) || !kvmclock_vsyscall) return 0; - if (sev_active()) { - if (early_set_memory_decrypted((unsigned long)__va(mem), size)) - goto e_free; - } + flags = pvclock_read_flags(&hv_clock_boot[0].pvti); + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) + return 0; - return mem; -e_free: - memblock_free(mem, size); + kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; +#endif return 0; } +early_initcall(kvm_setup_vsyscall_timeinfo); -static void __init kvm_memblock_free(phys_addr_t addr, phys_addr_t size) +static int kvmclock_setup_percpu(unsigned int cpu) { - if (sev_active()) - early_set_memory_encrypted((unsigned long)__va(addr), size); + struct pvclock_vsyscall_time_info *p = per_cpu(hv_clock_per_cpu, cpu); - memblock_free(addr, size); + /* + * The per cpu area setup replicates CPU0 data to all cpu + * pointers. So carefully check. CPU0 has been set up in init + * already. + */ + if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0))) + return 0; + + /* Use the static page for the first CPUs, allocate otherwise */ + if (cpu < HVC_BOOT_ARRAY_SIZE) + p = &hv_clock_boot[cpu]; + else + p = kzalloc(sizeof(*p), GFP_KERNEL); + + per_cpu(hv_clock_per_cpu, cpu) = p; + return p ? 0 : -ENOMEM; } void __init kvmclock_init(void) { - struct pvclock_vcpu_time_info *vcpu_time; - unsigned long mem, mem_wall_clock; - int size, cpu, wall_clock_size; u8 flags; - size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); - - if (!kvm_para_available()) + if (!kvm_para_available() || !kvmclock) return; - if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) { + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) { msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; - } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))) - return; - - wall_clock_size = PAGE_ALIGN(sizeof(struct pvclock_wall_clock)); - mem_wall_clock = kvm_memblock_alloc(wall_clock_size, PAGE_SIZE); - if (!mem_wall_clock) - return; - - wall_clock = __va(mem_wall_clock); - memset(wall_clock, 0, wall_clock_size); - - mem = kvm_memblock_alloc(size, PAGE_SIZE); - if (!mem) { - kvm_memblock_free(mem_wall_clock, wall_clock_size); - wall_clock = NULL; + } else if (!kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { return; } - hv_clock = __va(mem); - memset(hv_clock, 0, size); - - if (kvm_register_clock("primary cpu clock")) { - hv_clock = NULL; - kvm_memblock_free(mem, size); - kvm_memblock_free(mem_wall_clock, wall_clock_size); - wall_clock = NULL; + if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu", + kvmclock_setup_percpu, NULL) < 0) { return; } - printk(KERN_INFO "kvm-clock: Using msrs %x and %x", + pr_info("kvm-clock: Using msrs %x and %x", msr_kvm_system_time, msr_kvm_wall_clock); - pvclock_set_pvti_cpu0_va(hv_clock); + this_cpu_write(hv_clock_per_cpu, &hv_clock_boot[0]); + kvm_register_clock("primary cpu clock"); + pvclock_set_pvti_cpu0_va(hv_clock_boot); if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); - cpu = get_cpu(); - vcpu_time = &hv_clock[cpu].pvti; - flags = pvclock_read_flags(vcpu_time); - + flags = pvclock_read_flags(&hv_clock_boot[0].pvti); kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT); - put_cpu(); x86_platform.calibrate_tsc = kvm_get_tsc_khz; x86_platform.calibrate_cpu = kvm_get_tsc_khz; x86_platform.get_wallclock = kvm_get_wallclock; x86_platform.set_wallclock = kvm_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC - x86_cpuinit.early_percpu_clock_init = - kvm_setup_secondary_clock; + x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock; #endif x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; @@ -350,31 +324,3 @@ void __init kvmclock_init(void) clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); pv_info.name = "KVM"; } - -int __init kvm_setup_vsyscall_timeinfo(void) -{ -#ifdef CONFIG_X86_64 - int cpu; - u8 flags; - struct pvclock_vcpu_time_info *vcpu_time; - unsigned int size; - - if (!hv_clock) - return 0; - - size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); - - cpu = get_cpu(); - - vcpu_time = &hv_clock[cpu].pvti; - flags = pvclock_read_flags(vcpu_time); - - put_cpu(); - - if (!(flags & PVCLOCK_TSC_STABLE_BIT)) - return 1; - - kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; -#endif - return 0; -} diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 2f86d883dd95..5d32c55aeb8b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -866,6 +866,8 @@ void __init setup_arch(char **cmdline_p) idt_setup_early_traps(); early_cpu_init(); + arch_init_ideal_nops(); + jump_label_init(); early_ioremap_init(); setup_olpc_ofw_pgd(); @@ -1012,6 +1014,7 @@ void __init setup_arch(char **cmdline_p) */ init_hypervisor_platform(); + tsc_early_init(); x86_init.resources.probe_roms(); /* after parse_early_param, so could debug it */ @@ -1197,11 +1200,6 @@ void __init setup_arch(char **cmdline_p) memblock_find_dma_reserve(); -#ifdef CONFIG_KVM_GUEST - kvmclock_init(); -#endif - - tsc_early_delay_calibrate(); if (!early_xdbc_setup_hardware()) early_xdbc_register_console(); @@ -1272,8 +1270,6 @@ void __init setup_arch(char **cmdline_p) mcheck_init(); - arch_init_ideal_nops(); - register_refined_jiffies(CLOCK_TICK_RATE); #ifdef CONFIG_EFI diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 74392d9d51e0..1463468ba9a0 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -33,16 +33,13 @@ EXPORT_SYMBOL(cpu_khz); unsigned int __read_mostly tsc_khz; EXPORT_SYMBOL(tsc_khz); +#define KHZ 1000 + /* * TSC can be unstable due to cpufreq or due to unsynced TSCs */ static int __read_mostly tsc_unstable; -/* native_sched_clock() is called before tsc_init(), so - we must start with the TSC soft disabled to prevent - erroneous rdtsc usage on !boot_cpu_has(X86_FEATURE_TSC) processors */ -static int __read_mostly tsc_disabled = -1; - static DEFINE_STATIC_KEY_FALSE(__use_tsc); int tsc_clocksource_reliable; @@ -106,23 +103,6 @@ void cyc2ns_read_end(void) * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -static void cyc2ns_data_init(struct cyc2ns_data *data) -{ - data->cyc2ns_mul = 0; - data->cyc2ns_shift = 0; - data->cyc2ns_offset = 0; -} - -static void __init cyc2ns_init(int cpu) -{ - struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); - - cyc2ns_data_init(&c2n->data[0]); - cyc2ns_data_init(&c2n->data[1]); - - seqcount_init(&c2n->seq); -} - static inline unsigned long long cycles_2_ns(unsigned long long cyc) { struct cyc2ns_data data; @@ -138,18 +118,11 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc) return ns; } -static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) +static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) { unsigned long long ns_now; struct cyc2ns_data data; struct cyc2ns *c2n; - unsigned long flags; - - local_irq_save(flags); - sched_clock_idle_sleep_event(); - - if (!khz) - goto done; ns_now = cycles_2_ns(tsc_now); @@ -181,13 +154,56 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_ c2n->data[0] = data; raw_write_seqcount_latch(&c2n->seq); c2n->data[1] = data; +} + +static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) +{ + unsigned long flags; + + local_irq_save(flags); + sched_clock_idle_sleep_event(); + + if (khz) + __set_cyc2ns_scale(khz, cpu, tsc_now); -done: sched_clock_idle_wakeup_event(); local_irq_restore(flags); } /* + * Initialize cyc2ns for boot cpu + */ +static void __init cyc2ns_init_boot_cpu(void) +{ + struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); + + seqcount_init(&c2n->seq); + __set_cyc2ns_scale(tsc_khz, smp_processor_id(), rdtsc()); +} + +/* + * Secondary CPUs do not run through tsc_init(), so set up + * all the scale factors for all CPUs, assuming the same + * speed as the bootup CPU. (cpufreq notifiers will fix this + * up if their speed diverges) + */ +static void __init cyc2ns_init_secondary_cpus(void) +{ + unsigned int cpu, this_cpu = smp_processor_id(); + struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); + struct cyc2ns_data *data = c2n->data; + + for_each_possible_cpu(cpu) { + if (cpu != this_cpu) { + seqcount_init(&c2n->seq); + c2n = per_cpu_ptr(&cyc2ns, cpu); + c2n->data[0] = data[0]; + c2n->data[1] = data[1]; + } + } +} + +/* * Scheduler clock - returns current time in nanosec units. */ u64 native_sched_clock(void) @@ -248,8 +264,7 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable); #ifdef CONFIG_X86_TSC int __init notsc_setup(char *str) { - pr_warn("Kernel compiled with CONFIG_X86_TSC, cannot disable TSC completely\n"); - tsc_disabled = 1; + mark_tsc_unstable("boot parameter notsc"); return 1; } #else @@ -665,30 +680,17 @@ static unsigned long cpu_khz_from_cpuid(void) return eax_base_mhz * 1000; } -/** - * native_calibrate_cpu - calibrate the cpu on boot +/* + * calibrate cpu using pit, hpet, and ptimer methods. They are available + * later in boot after acpi is initialized. */ -unsigned long native_calibrate_cpu(void) +static unsigned long pit_hpet_ptimer_calibrate_cpu(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags, latch, ms, fast_calibrate; + unsigned long flags, latch, ms; int hpet = is_hpet_enabled(), i, loopmin; - fast_calibrate = cpu_khz_from_cpuid(); - if (fast_calibrate) - return fast_calibrate; - - fast_calibrate = cpu_khz_from_msr(); - if (fast_calibrate) - return fast_calibrate; - - local_irq_save(flags); - fast_calibrate = quick_pit_calibrate(); - local_irq_restore(flags); - if (fast_calibrate) - return fast_calibrate; - /* * Run 5 calibration loops to get the lowest frequency value * (the best estimate). We use two different calibration modes @@ -831,6 +833,37 @@ unsigned long native_calibrate_cpu(void) return tsc_pit_min; } +/** + * native_calibrate_cpu_early - can calibrate the cpu early in boot + */ +unsigned long native_calibrate_cpu_early(void) +{ + unsigned long flags, fast_calibrate = cpu_khz_from_cpuid(); + + if (!fast_calibrate) + fast_calibrate = cpu_khz_from_msr(); + if (!fast_calibrate) { + local_irq_save(flags); + fast_calibrate = quick_pit_calibrate(); + local_irq_restore(flags); + } + return fast_calibrate; +} + + +/** + * native_calibrate_cpu - calibrate the cpu + */ +static unsigned long native_calibrate_cpu(void) +{ + unsigned long tsc_freq = native_calibrate_cpu_early(); + + if (!tsc_freq) + tsc_freq = pit_hpet_ptimer_calibrate_cpu(); + + return tsc_freq; +} + void recalibrate_cpu_khz(void) { #ifndef CONFIG_SMP @@ -1307,7 +1340,7 @@ unreg: static int __init init_tsc_clocksource(void) { - if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_disabled > 0 || !tsc_khz) + if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz) return 0; if (tsc_unstable) @@ -1341,40 +1374,22 @@ unreg: */ device_initcall(init_tsc_clocksource); -void __init tsc_early_delay_calibrate(void) +static bool __init determine_cpu_tsc_frequencies(bool early) { - unsigned long lpj; - - if (!boot_cpu_has(X86_FEATURE_TSC)) - return; - - cpu_khz = x86_platform.calibrate_cpu(); - tsc_khz = x86_platform.calibrate_tsc(); - - tsc_khz = tsc_khz ? : cpu_khz; - if (!tsc_khz) - return; - - lpj = tsc_khz * 1000; - do_div(lpj, HZ); - loops_per_jiffy = lpj; -} - -void __init tsc_init(void) -{ - u64 lpj, cyc; - int cpu; - - if (!boot_cpu_has(X86_FEATURE_TSC)) { - setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); - return; + /* Make sure that cpu and tsc are not already calibrated */ + WARN_ON(cpu_khz || tsc_khz); + + if (early) { + cpu_khz = x86_platform.calibrate_cpu(); + tsc_khz = x86_platform.calibrate_tsc(); + } else { + /* We should not be here with non-native cpu calibration */ + WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu); + cpu_khz = pit_hpet_ptimer_calibrate_cpu(); } - cpu_khz = x86_platform.calibrate_cpu(); - tsc_khz = x86_platform.calibrate_tsc(); - /* - * Trust non-zero tsc_khz as authorative, + * Trust non-zero tsc_khz as authoritative, * and use it to sanity check cpu_khz, * which will be off if system timer is off. */ @@ -1383,52 +1398,78 @@ void __init tsc_init(void) else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) cpu_khz = tsc_khz; - if (!tsc_khz) { - mark_tsc_unstable("could not calculate TSC khz"); - setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); - return; - } + if (tsc_khz == 0) + return false; pr_info("Detected %lu.%03lu MHz processor\n", - (unsigned long)cpu_khz / 1000, - (unsigned long)cpu_khz % 1000); + (unsigned long)cpu_khz / KHZ, + (unsigned long)cpu_khz % KHZ); if (cpu_khz != tsc_khz) { pr_info("Detected %lu.%03lu MHz TSC", - (unsigned long)tsc_khz / 1000, - (unsigned long)tsc_khz % 1000); + (unsigned long)tsc_khz / KHZ, + (unsigned long)tsc_khz % KHZ); } + return true; +} + +static unsigned long __init get_loops_per_jiffy(void) +{ + unsigned long lpj = tsc_khz * KHZ; + do_div(lpj, HZ); + return lpj; +} + +static void __init tsc_enable_sched_clock(void) +{ /* Sanitize TSC ADJUST before cyc2ns gets initialized */ tsc_store_and_check_tsc_adjust(true); + cyc2ns_init_boot_cpu(); + static_branch_enable(&__use_tsc); +} + +void __init tsc_early_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_TSC)) + return; + if (!determine_cpu_tsc_frequencies(true)) + return; + loops_per_jiffy = get_loops_per_jiffy(); + tsc_enable_sched_clock(); +} + +void __init tsc_init(void) +{ /* - * Secondary CPUs do not run through tsc_init(), so set up - * all the scale factors for all CPUs, assuming the same - * speed as the bootup CPU. (cpufreq notifiers will fix this - * up if their speed diverges) + * native_calibrate_cpu_early can only calibrate using methods that are + * available early in boot. */ - cyc = rdtsc(); - for_each_possible_cpu(cpu) { - cyc2ns_init(cpu); - set_cyc2ns_scale(tsc_khz, cpu, cyc); - } + if (x86_platform.calibrate_cpu == native_calibrate_cpu_early) + x86_platform.calibrate_cpu = native_calibrate_cpu; - if (tsc_disabled > 0) + if (!boot_cpu_has(X86_FEATURE_TSC)) { + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; + } - /* now allow native_sched_clock() to use rdtsc */ + if (!tsc_khz) { + /* We failed to determine frequencies earlier, try again */ + if (!determine_cpu_tsc_frequencies(false)) { + mark_tsc_unstable("could not calculate TSC khz"); + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); + return; + } + tsc_enable_sched_clock(); + } - tsc_disabled = 0; - static_branch_enable(&__use_tsc); + cyc2ns_init_secondary_cpus(); if (!no_sched_irq_time) enable_sched_clock_irqtime(); - lpj = ((u64)tsc_khz * 1000); - do_div(lpj, HZ); - lpj_fine = lpj; - + lpj_fine = get_loops_per_jiffy(); use_tsc_delay(); check_system_tsc_reliable(); @@ -1455,7 +1496,7 @@ unsigned long calibrate_delay_is_known(void) int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC); const struct cpumask *mask = topology_core_cpumask(cpu); - if (tsc_disabled || !constant_tsc || !mask) + if (!constant_tsc || !mask) return 0; sibling = cpumask_any_but(mask, cpu); diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 19afdbd7d0a7..27ef714d886c 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -1,17 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * tsc_msr.c - TSC frequency enumeration via MSR + * TSC frequency enumeration via MSR * - * Copyright (C) 2013 Intel Corporation + * Copyright (C) 2013, 2018 Intel Corporation * Author: Bin Gao <bin.gao@intel.com> - * - * This file is released under the GPLv2. */ #include <linux/kernel.h> -#include <asm/processor.h> -#include <asm/setup.h> + #include <asm/apic.h> +#include <asm/cpu_device_id.h> +#include <asm/intel-family.h> +#include <asm/msr.h> #include <asm/param.h> +#include <asm/tsc.h> #define MAX_NUM_FREQS 9 @@ -23,44 +25,48 @@ * field msr_plat does. */ struct freq_desc { - u8 x86_family; /* CPU family */ - u8 x86_model; /* model */ u8 msr_plat; /* 1: use MSR_PLATFORM_INFO, 0: MSR_IA32_PERF_STATUS */ u32 freqs[MAX_NUM_FREQS]; }; -static struct freq_desc freq_desc_tables[] = { - /* PNW */ - { 6, 0x27, 0, { 0, 0, 0, 0, 0, 99840, 0, 83200 } }, - /* CLV+ */ - { 6, 0x35, 0, { 0, 133200, 0, 0, 0, 99840, 0, 83200 } }, - /* TNG - Intel Atom processor Z3400 series */ - { 6, 0x4a, 1, { 0, 100000, 133300, 0, 0, 0, 0, 0 } }, - /* VLV2 - Intel Atom processor E3000, Z3600, Z3700 series */ - { 6, 0x37, 1, { 83300, 100000, 133300, 116700, 80000, 0, 0, 0 } }, - /* ANN - Intel Atom processor Z3500 series */ - { 6, 0x5a, 1, { 83300, 100000, 133300, 100000, 0, 0, 0, 0 } }, - /* AMT - Intel Atom processor X7-Z8000 and X5-Z8000 series */ - { 6, 0x4c, 1, { 83300, 100000, 133300, 116700, - 80000, 93300, 90000, 88900, 87500 } }, +/* + * Penwell and Clovertrail use spread spectrum clock, + * so the freq number is not exactly the same as reported + * by MSR based on SDM. + */ +static const struct freq_desc freq_desc_pnw = { + 0, { 0, 0, 0, 0, 0, 99840, 0, 83200 } }; -static int match_cpu(u8 family, u8 model) -{ - int i; +static const struct freq_desc freq_desc_clv = { + 0, { 0, 133200, 0, 0, 0, 99840, 0, 83200 } +}; - for (i = 0; i < ARRAY_SIZE(freq_desc_tables); i++) { - if ((family == freq_desc_tables[i].x86_family) && - (model == freq_desc_tables[i].x86_model)) - return i; - } +static const struct freq_desc freq_desc_byt = { + 1, { 83300, 100000, 133300, 116700, 80000, 0, 0, 0 } +}; - return -1; -} +static const struct freq_desc freq_desc_cht = { + 1, { 83300, 100000, 133300, 116700, 80000, 93300, 90000, 88900, 87500 } +}; -/* Map CPU reference clock freq ID(0-7) to CPU reference clock freq(KHz) */ -#define id_to_freq(cpu_index, freq_id) \ - (freq_desc_tables[cpu_index].freqs[freq_id]) +static const struct freq_desc freq_desc_tng = { + 1, { 0, 100000, 133300, 0, 0, 0, 0, 0 } +}; + +static const struct freq_desc freq_desc_ann = { + 1, { 83300, 100000, 133300, 100000, 0, 0, 0, 0 } +}; + +static const struct x86_cpu_id tsc_msr_cpu_ids[] = { + INTEL_CPU_FAM6(ATOM_PENWELL, freq_desc_pnw), + INTEL_CPU_FAM6(ATOM_CLOVERVIEW, freq_desc_clv), + INTEL_CPU_FAM6(ATOM_SILVERMONT1, freq_desc_byt), + INTEL_CPU_FAM6(ATOM_AIRMONT, freq_desc_cht), + INTEL_CPU_FAM6(ATOM_MERRIFIELD, freq_desc_tng), + INTEL_CPU_FAM6(ATOM_MOOREFIELD, freq_desc_ann), + {} +}; /* * MSR-based CPU/TSC frequency discovery for certain CPUs. @@ -70,18 +76,17 @@ static int match_cpu(u8 family, u8 model) */ unsigned long cpu_khz_from_msr(void) { - u32 lo, hi, ratio, freq_id, freq; + u32 lo, hi, ratio, freq; + const struct freq_desc *freq_desc; + const struct x86_cpu_id *id; unsigned long res; - int cpu_index; - - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) - return 0; - cpu_index = match_cpu(boot_cpu_data.x86, boot_cpu_data.x86_model); - if (cpu_index < 0) + id = x86_match_cpu(tsc_msr_cpu_ids); + if (!id) return 0; - if (freq_desc_tables[cpu_index].msr_plat) { + freq_desc = (struct freq_desc *)id->driver_data; + if (freq_desc->msr_plat) { rdmsr(MSR_PLATFORM_INFO, lo, hi); ratio = (lo >> 8) & 0xff; } else { @@ -91,8 +96,9 @@ unsigned long cpu_khz_from_msr(void) /* Get FSB FREQ ID */ rdmsr(MSR_FSB_FREQ, lo, hi); - freq_id = lo & 0x7; - freq = id_to_freq(cpu_index, freq_id); + + /* Map CPU reference clock freq ID(0-7) to CPU reference clock freq(KHz) */ + freq = freq_desc->freqs[lo & 0x7]; /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */ res = freq * ratio; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 3ab867603e81..2792b5573818 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -109,7 +109,7 @@ struct x86_cpuinit_ops x86_cpuinit = { static void default_nmi_init(void) { }; struct x86_platform_ops x86_platform __ro_after_init = { - .calibrate_cpu = native_calibrate_cpu, + .calibrate_cpu = native_calibrate_cpu_early, .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, diff --git a/arch/x86/platform/intel-mid/Makefile b/arch/x86/platform/intel-mid/Makefile index fa021dfab088..5cf886c867c2 100644 --- a/arch/x86/platform/intel-mid/Makefile +++ b/arch/x86/platform/intel-mid/Makefile @@ -1,4 +1,4 @@ -obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o intel_mid_vrtc.o mfld.o mrfld.o pwr.o +obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o intel_mid_vrtc.o pwr.o # SFI specific code ifdef CONFIG_X86_INTEL_MID diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index 2ebdf31d9996..56f66eafb94f 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -36,8 +36,6 @@ #include <asm/apb_timer.h> #include <asm/reboot.h> -#include "intel_mid_weak_decls.h" - /* * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock, * cmdline option x86_intel_mid_timer can be used to override the configuration @@ -61,10 +59,6 @@ enum intel_mid_timer_options intel_mid_timer_options; -/* intel_mid_ops to store sub arch ops */ -static struct intel_mid_ops *intel_mid_ops; -/* getter function for sub arch ops*/ -static void *(*get_intel_mid_ops[])(void) = INTEL_MID_OPS_INIT; enum intel_mid_cpu_type __intel_mid_cpu_chip; EXPORT_SYMBOL_GPL(__intel_mid_cpu_chip); @@ -82,11 +76,6 @@ static void intel_mid_reboot(void) intel_scu_ipc_simple_command(IPCMSG_COLD_RESET, 0); } -static unsigned long __init intel_mid_calibrate_tsc(void) -{ - return 0; -} - static void __init intel_mid_setup_bp_timer(void) { apbt_time_init(); @@ -133,6 +122,7 @@ static void intel_mid_arch_setup(void) case 0x3C: case 0x4A: __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_TANGIER; + x86_platform.legacy.rtc = 1; break; case 0x27: default: @@ -140,17 +130,7 @@ static void intel_mid_arch_setup(void) break; } - if (__intel_mid_cpu_chip < MAX_CPU_OPS(get_intel_mid_ops)) - intel_mid_ops = get_intel_mid_ops[__intel_mid_cpu_chip](); - else { - intel_mid_ops = get_intel_mid_ops[INTEL_MID_CPU_CHIP_PENWELL](); - pr_info("ARCH: Unknown SoC, assuming Penwell!\n"); - } - out: - if (intel_mid_ops->arch_setup) - intel_mid_ops->arch_setup(); - /* * Intel MID platforms are using explicitly defined regulators. * @@ -191,7 +171,6 @@ void __init x86_intel_mid_early_setup(void) x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock; - x86_platform.calibrate_tsc = intel_mid_calibrate_tsc; x86_platform.get_nmi_reason = intel_mid_get_nmi_reason; x86_init.pci.arch_init = intel_mid_pci_init; diff --git a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h deleted file mode 100644 index 3c1c3866d82b..000000000000 --- a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * intel_mid_weak_decls.h: Weak declarations of intel-mid.c - * - * (C) Copyright 2013 Intel Corporation - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - - -/* For every CPU addition a new get_<cpuname>_ops interface needs - * to be added. - */ -extern void *get_penwell_ops(void); -extern void *get_cloverview_ops(void); -extern void *get_tangier_ops(void); diff --git a/arch/x86/platform/intel-mid/mfld.c b/arch/x86/platform/intel-mid/mfld.c deleted file mode 100644 index e42978d4deaf..000000000000 --- a/arch/x86/platform/intel-mid/mfld.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * mfld.c: Intel Medfield platform setup code - * - * (C) Copyright 2013 Intel Corporation - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - -#include <linux/init.h> - -#include <asm/apic.h> -#include <asm/intel-mid.h> -#include <asm/intel_mid_vrtc.h> - -#include "intel_mid_weak_decls.h" - -static unsigned long __init mfld_calibrate_tsc(void) -{ - unsigned long fast_calibrate; - u32 lo, hi, ratio, fsb; - - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - pr_debug("IA32 perf status is 0x%x, 0x%0x\n", lo, hi); - ratio = (hi >> 8) & 0x1f; - pr_debug("ratio is %d\n", ratio); - if (!ratio) { - pr_err("read a zero ratio, should be incorrect!\n"); - pr_err("force tsc ratio to 16 ...\n"); - ratio = 16; - } - rdmsr(MSR_FSB_FREQ, lo, hi); - if ((lo & 0x7) == 0x7) - fsb = FSB_FREQ_83SKU; - else - fsb = FSB_FREQ_100SKU; - fast_calibrate = ratio * fsb; - pr_debug("read penwell tsc %lu khz\n", fast_calibrate); - lapic_timer_frequency = fsb * 1000 / HZ; - - /* - * TSC on Intel Atom SoCs is reliable and of known frequency. - * See tsc_msr.c for details. - */ - setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); - setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); - - return fast_calibrate; -} - -static void __init penwell_arch_setup(void) -{ - x86_platform.calibrate_tsc = mfld_calibrate_tsc; -} - -static struct intel_mid_ops penwell_ops = { - .arch_setup = penwell_arch_setup, -}; - -void *get_penwell_ops(void) -{ - return &penwell_ops; -} - -void *get_cloverview_ops(void) -{ - return &penwell_ops; -} diff --git a/arch/x86/platform/intel-mid/mrfld.c b/arch/x86/platform/intel-mid/mrfld.c deleted file mode 100644 index ae7bdeb0e507..000000000000 --- a/arch/x86/platform/intel-mid/mrfld.c +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Intel Merrifield platform specific setup code - * - * (C) Copyright 2013 Intel Corporation - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - -#include <linux/init.h> - -#include <asm/apic.h> -#include <asm/intel-mid.h> - -#include "intel_mid_weak_decls.h" - -static unsigned long __init tangier_calibrate_tsc(void) -{ - unsigned long fast_calibrate; - u32 lo, hi, ratio, fsb, bus_freq; - - /* *********************** */ - /* Compute TSC:Ratio * FSB */ - /* *********************** */ - - /* Compute Ratio */ - rdmsr(MSR_PLATFORM_INFO, lo, hi); - pr_debug("IA32 PLATFORM_INFO is 0x%x : %x\n", hi, lo); - - ratio = (lo >> 8) & 0xFF; - pr_debug("ratio is %d\n", ratio); - if (!ratio) { - pr_err("Read a zero ratio, force tsc ratio to 4 ...\n"); - ratio = 4; - } - - /* Compute FSB */ - rdmsr(MSR_FSB_FREQ, lo, hi); - pr_debug("Actual FSB frequency detected by SOC 0x%x : %x\n", - hi, lo); - - bus_freq = lo & 0x7; - pr_debug("bus_freq = 0x%x\n", bus_freq); - - if (bus_freq == 0) - fsb = FSB_FREQ_100SKU; - else if (bus_freq == 1) - fsb = FSB_FREQ_100SKU; - else if (bus_freq == 2) - fsb = FSB_FREQ_133SKU; - else if (bus_freq == 3) - fsb = FSB_FREQ_167SKU; - else if (bus_freq == 4) - fsb = FSB_FREQ_83SKU; - else if (bus_freq == 5) - fsb = FSB_FREQ_400SKU; - else if (bus_freq == 6) - fsb = FSB_FREQ_267SKU; - else if (bus_freq == 7) - fsb = FSB_FREQ_333SKU; - else { - BUG(); - pr_err("Invalid bus_freq! Setting to minimal value!\n"); - fsb = FSB_FREQ_100SKU; - } - - /* TSC = FSB Freq * Resolved HFM Ratio */ - fast_calibrate = ratio * fsb; - pr_debug("calculate tangier tsc %lu KHz\n", fast_calibrate); - - /* ************************************ */ - /* Calculate Local APIC Timer Frequency */ - /* ************************************ */ - lapic_timer_frequency = (fsb * 1000) / HZ; - - pr_debug("Setting lapic_timer_frequency = %d\n", - lapic_timer_frequency); - - /* - * TSC on Intel Atom SoCs is reliable and of known frequency. - * See tsc_msr.c for details. - */ - setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); - setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); - - return fast_calibrate; -} - -static void __init tangier_arch_setup(void) -{ - x86_platform.calibrate_tsc = tangier_calibrate_tsc; - x86_platform.legacy.rtc = 1; -} - -/* tangier arch ops */ -static struct intel_mid_ops tangier_ops = { - .arch_setup = tangier_arch_setup, -}; - -void *get_tangier_ops(void) -{ - return &tangier_ops; -} diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 439a94bf89ad..105a57d73701 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -119,6 +119,27 @@ static void __init xen_banner(void) version >> 16, version & 0xffff, extra.extraversion, xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); } + +static void __init xen_pv_init_platform(void) +{ + set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_start_info->shared_info); + HYPERVISOR_shared_info = (void *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); + + /* xen clock uses per-cpu vcpu_info, need to init it for boot cpu */ + xen_vcpu_info_reset(0); + + /* pvclock is in shared info area */ + xen_init_time_ops(); +} + +static void __init xen_pv_guest_late_init(void) +{ +#ifndef CONFIG_SMP + /* Setup shared vcpu info for non-smp configurations */ + xen_setup_vcpu_info_placement(); +#endif +} + /* Check if running on Xen version (major, minor) or later */ bool xen_running_on_version_or_later(unsigned int major, unsigned int minor) @@ -947,34 +968,8 @@ static void xen_write_msr(unsigned int msr, unsigned low, unsigned high) xen_write_msr_safe(msr, low, high); } -void xen_setup_shared_info(void) -{ - set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_start_info->shared_info); - - HYPERVISOR_shared_info = - (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); - - xen_setup_mfn_list_list(); - - if (system_state == SYSTEM_BOOTING) { -#ifndef CONFIG_SMP - /* - * In UP this is as good a place as any to set up shared info. - * Limit this to boot only, at restore vcpu setup is done via - * xen_vcpu_restore(). - */ - xen_setup_vcpu_info_placement(); -#endif - /* - * Now that shared info is set up we can start using routines - * that point to pvclock area. - */ - xen_init_time_ops(); - } -} - /* This is called once we have the cpu_possible_mask */ -void __ref xen_setup_vcpu_info_placement(void) +void __init xen_setup_vcpu_info_placement(void) { int cpu; @@ -1228,6 +1223,8 @@ asmlinkage __visible void __init xen_start_kernel(void) x86_init.irqs.intr_mode_init = x86_init_noop; x86_init.oem.arch_setup = xen_arch_setup; x86_init.oem.banner = xen_banner; + x86_init.hyper.init_platform = xen_pv_init_platform; + x86_init.hyper.guest_late_init = xen_pv_guest_late_init; /* * Set up some pagetable state before starting to set any ptes. diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 2c30cabfda90..52206ad81e4b 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1230,8 +1230,7 @@ static void __init xen_pagetable_p2m_free(void) * We roundup to the PMD, which means that if anybody at this stage is * using the __ka address of xen_start_info or * xen_start_info->shared_info they are in going to crash. Fortunatly - * we have already revectored in xen_setup_kernel_pagetable and in - * xen_setup_shared_info. + * we have already revectored in xen_setup_kernel_pagetable. */ size = roundup(size, PMD_SIZE); @@ -1292,8 +1291,7 @@ static void __init xen_pagetable_init(void) /* Remap memory freed due to conflicts with E820 map */ xen_remap_memory(); - - xen_setup_shared_info(); + xen_setup_mfn_list_list(); } static void xen_write_cr2(unsigned long cr2) { diff --git a/arch/x86/xen/suspend_pv.c b/arch/x86/xen/suspend_pv.c index a2e0f110af56..8303b58c79a9 100644 --- a/arch/x86/xen/suspend_pv.c +++ b/arch/x86/xen/suspend_pv.c @@ -27,8 +27,9 @@ void xen_pv_pre_suspend(void) void xen_pv_post_suspend(int suspend_cancelled) { xen_build_mfn_list_list(); - - xen_setup_shared_info(); + set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_start_info->shared_info); + HYPERVISOR_shared_info = (void *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); + xen_setup_mfn_list_list(); if (suspend_cancelled) { xen_start_info->store_mfn = diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index e0f1bcf01d63..c84f1e039d84 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -31,6 +31,8 @@ /* Xen may fire a timer up to this many ns early */ #define TIMER_SLOP 100000 +static u64 xen_sched_clock_offset __read_mostly; + /* Get the TSC speed from Xen */ static unsigned long xen_tsc_khz(void) { @@ -40,7 +42,7 @@ static unsigned long xen_tsc_khz(void) return pvclock_tsc_khz(info); } -u64 xen_clocksource_read(void) +static u64 xen_clocksource_read(void) { struct pvclock_vcpu_time_info *src; u64 ret; @@ -57,6 +59,11 @@ static u64 xen_clocksource_get_cycles(struct clocksource *cs) return xen_clocksource_read(); } +static u64 xen_sched_clock(void) +{ + return xen_clocksource_read() - xen_sched_clock_offset; +} + static void xen_read_wallclock(struct timespec64 *ts) { struct shared_info *s = HYPERVISOR_shared_info; @@ -367,7 +374,7 @@ void xen_timer_resume(void) } static const struct pv_time_ops xen_time_ops __initconst = { - .sched_clock = xen_clocksource_read, + .sched_clock = xen_sched_clock, .steal_clock = xen_steal_clock, }; @@ -503,8 +510,9 @@ static void __init xen_time_init(void) pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier); } -void __ref xen_init_time_ops(void) +void __init xen_init_time_ops(void) { + xen_sched_clock_offset = xen_clocksource_read(); pv_time_ops = xen_time_ops; x86_init.timers.timer_init = xen_time_init; @@ -542,11 +550,11 @@ void __init xen_hvm_init_time_ops(void) return; if (!xen_feature(XENFEAT_hvm_safe_pvclock)) { - printk(KERN_INFO "Xen doesn't support pvclock on HVM," - "disable pv timer\n"); + pr_info("Xen doesn't support pvclock on HVM, disable pv timer"); return; } + xen_sched_clock_offset = xen_clocksource_read(); pv_time_ops = xen_time_ops; x86_init.timers.setup_percpu_clockev = xen_time_init; x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents; diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 3b34745d0a52..e78684597f57 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -31,7 +31,6 @@ extern struct shared_info xen_dummy_shared_info; extern struct shared_info *HYPERVISOR_shared_info; void xen_setup_mfn_list_list(void); -void xen_setup_shared_info(void); void xen_build_mfn_list_list(void); void xen_setup_machphys_mapping(void); void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); @@ -68,12 +67,11 @@ void xen_init_irq_ops(void); void xen_setup_timer(int cpu); void xen_setup_runstate_info(int cpu); void xen_teardown_timer(int cpu); -u64 xen_clocksource_read(void); void xen_setup_cpu_clockevents(void); void xen_save_time_memory_area(void); void xen_restore_time_memory_area(void); -void __ref xen_init_time_ops(void); -void __init xen_hvm_init_time_ops(void); +void xen_init_time_ops(void); +void xen_hvm_init_time_ops(void); irqreturn_t xen_debug_interrupt(int irq, void *dev_id); diff --git a/drivers/clocksource/tegra20_timer.c b/drivers/clocksource/tegra20_timer.c index dabf0a103567..aa624885e0e2 100644 --- a/drivers/clocksource/tegra20_timer.c +++ b/drivers/clocksource/tegra20_timer.c @@ -259,6 +259,6 @@ static int __init tegra20_init_rtc(struct device_node *np) else clk_prepare_enable(clk); - return register_persistent_clock(NULL, tegra_read_persistent_clock64); + return register_persistent_clock(tegra_read_persistent_clock64); } TIMER_OF_DECLARE(tegra20_rtc, "nvidia,tegra20-rtc", tegra20_init_rtc); diff --git a/include/linux/sched_clock.h b/include/linux/sched_clock.h index 411b52e424e1..abe28d5cb3f4 100644 --- a/include/linux/sched_clock.h +++ b/include/linux/sched_clock.h @@ -9,17 +9,16 @@ #define LINUX_SCHED_CLOCK #ifdef CONFIG_GENERIC_SCHED_CLOCK -extern void sched_clock_postinit(void); +extern void generic_sched_clock_init(void); extern void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate); #else -static inline void sched_clock_postinit(void) { } +static inline void generic_sched_clock_init(void) { } static inline void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) { - ; } #endif diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index edace6b656e9..e79861418fd7 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -243,7 +243,8 @@ extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot); extern int persistent_clock_is_local; extern void read_persistent_clock64(struct timespec64 *ts); -extern void read_boot_clock64(struct timespec64 *ts); +void read_persistent_clock_and_boot_offset(struct timespec64 *wall_clock, + struct timespec64 *boot_offset); extern int update_persistent_clock64(struct timespec64 now); /* diff --git a/init/main.c b/init/main.c index f86b64c35d38..38c68b593d0d 100644 --- a/init/main.c +++ b/init/main.c @@ -79,7 +79,7 @@ #include <linux/pti.h> #include <linux/blkdev.h> #include <linux/elevator.h> -#include <linux/sched_clock.h> +#include <linux/sched/clock.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/context_tracking.h> @@ -642,7 +642,6 @@ asmlinkage __visible void __init start_kernel(void) softirq_init(); timekeeping_init(); time_init(); - sched_clock_postinit(); printk_safe_init(); perf_event_init(); profile_init(); @@ -697,6 +696,7 @@ asmlinkage __visible void __init start_kernel(void) acpi_early_init(); if (late_time_init) late_time_init(); + sched_clock_init(); calibrate_delay(); pid_idr_init(); anon_vma_init(); diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 10c83e73837a..e3e3b979f9bd 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -53,6 +53,7 @@ * */ #include "sched.h" +#include <linux/sched_clock.h> /* * Scheduler clock - returns current time in nanosec units. @@ -66,12 +67,7 @@ unsigned long long __weak sched_clock(void) } EXPORT_SYMBOL_GPL(sched_clock); -__read_mostly int sched_clock_running; - -void sched_clock_init(void) -{ - sched_clock_running = 1; -} +static DEFINE_STATIC_KEY_FALSE(sched_clock_running); #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK /* @@ -195,17 +191,40 @@ void clear_sched_clock_stable(void) smp_mb(); /* matches sched_clock_init_late() */ - if (sched_clock_running == 2) + if (static_key_count(&sched_clock_running.key) == 2) __clear_sched_clock_stable(); } +static void __sched_clock_gtod_offset(void) +{ + struct sched_clock_data *scd = this_scd(); + + __scd_stamp(scd); + __gtod_offset = (scd->tick_raw + __sched_clock_offset) - scd->tick_gtod; +} + +void __init sched_clock_init(void) +{ + /* + * Set __gtod_offset such that once we mark sched_clock_running, + * sched_clock_tick() continues where sched_clock() left off. + * + * Even if TSC is buggered, we're still UP at this point so it + * can't really be out of sync. + */ + local_irq_disable(); + __sched_clock_gtod_offset(); + local_irq_enable(); + + static_branch_inc(&sched_clock_running); +} /* * We run this as late_initcall() such that it runs after all built-in drivers, * notably: acpi_processor and intel_idle, which can mark the TSC as unstable. */ static int __init sched_clock_init_late(void) { - sched_clock_running = 2; + static_branch_inc(&sched_clock_running); /* * Ensure that it is impossible to not do a static_key update. * @@ -350,8 +369,8 @@ u64 sched_clock_cpu(int cpu) if (sched_clock_stable()) return sched_clock() + __sched_clock_offset; - if (unlikely(!sched_clock_running)) - return 0ull; + if (!static_branch_unlikely(&sched_clock_running)) + return sched_clock(); preempt_disable_notrace(); scd = cpu_sdc(cpu); @@ -373,7 +392,7 @@ void sched_clock_tick(void) if (sched_clock_stable()) return; - if (unlikely(!sched_clock_running)) + if (!static_branch_unlikely(&sched_clock_running)) return; lockdep_assert_irqs_disabled(); @@ -385,8 +404,6 @@ void sched_clock_tick(void) void sched_clock_tick_stable(void) { - u64 gtod, clock; - if (!sched_clock_stable()) return; @@ -398,9 +415,7 @@ void sched_clock_tick_stable(void) * TSC to be unstable, any computation will be computing crap. */ local_irq_disable(); - gtod = ktime_get_ns(); - clock = sched_clock(); - __gtod_offset = (clock + __sched_clock_offset) - gtod; + __sched_clock_gtod_offset(); local_irq_enable(); } @@ -434,9 +449,17 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ +void __init sched_clock_init(void) +{ + static_branch_inc(&sched_clock_running); + local_irq_disable(); + generic_sched_clock_init(); + local_irq_enable(); +} + u64 sched_clock_cpu(int cpu) { - if (unlikely(!sched_clock_running)) + if (!static_branch_unlikely(&sched_clock_running)) return 0; return sched_clock(); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d17d99778356..c45de46fdf10 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5916,7 +5916,6 @@ void __init sched_init(void) int i, j; unsigned long alloc_size = 0, ptr; - sched_clock_init(); wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 870d4f3da285..60caf1fb94e0 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -622,8 +622,6 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) #undef PU } -extern __read_mostly int sched_clock_running; - static void print_cpu(struct seq_file *m, int cpu) { struct rq *rq = cpu_rq(cpu); diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 2d8f05aad442..cbc72c2c1fca 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -237,7 +237,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) pr_debug("Registered %pF as sched_clock source\n", read); } -void __init sched_clock_postinit(void) +void __init generic_sched_clock_init(void) { /* * If no sched_clock() function has been provided at that point, diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d9e659a12c76..f3b22f456fac 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -17,6 +17,7 @@ #include <linux/nmi.h> #include <linux/sched.h> #include <linux/sched/loadavg.h> +#include <linux/sched/clock.h> #include <linux/syscore_ops.h> #include <linux/clocksource.h> #include <linux/jiffies.h> @@ -1505,18 +1506,23 @@ void __weak read_persistent_clock64(struct timespec64 *ts64) } /** - * read_boot_clock64 - Return time of the system start. + * read_persistent_wall_and_boot_offset - Read persistent clock, and also offset + * from the boot. * * Weak dummy function for arches that do not yet support it. - * Function to read the exact time the system has been started. - * Returns a timespec64 with tv_sec=0 and tv_nsec=0 if unsupported. - * - * XXX - Do be sure to remove it once all arches implement it. + * wall_time - current time as returned by persistent clock + * boot_offset - offset that is defined as wall_time - boot_time + * The default function calculates offset based on the current value of + * local_clock(). This way architectures that support sched_clock() but don't + * support dedicated boot time clock will provide the best estimate of the + * boot time. */ -void __weak read_boot_clock64(struct timespec64 *ts) +void __weak __init +read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, + struct timespec64 *boot_offset) { - ts->tv_sec = 0; - ts->tv_nsec = 0; + read_persistent_clock64(wall_time); + *boot_offset = ns_to_timespec64(local_clock()); } /* @@ -1542,28 +1548,29 @@ static bool persistent_clock_exists; */ void __init timekeeping_init(void) { + struct timespec64 wall_time, boot_offset, wall_to_mono; struct timekeeper *tk = &tk_core.timekeeper; struct clocksource *clock; unsigned long flags; - struct timespec64 now, boot, tmp; - - read_persistent_clock64(&now); - if (!timespec64_valid_strict(&now)) { - pr_warn("WARNING: Persistent clock returned invalid value!\n" - " Check your CMOS/BIOS settings.\n"); - now.tv_sec = 0; - now.tv_nsec = 0; - } else if (now.tv_sec || now.tv_nsec) - persistent_clock_exists = true; - read_boot_clock64(&boot); - if (!timespec64_valid_strict(&boot)) { - pr_warn("WARNING: Boot clock returned invalid value!\n" - " Check your CMOS/BIOS settings.\n"); - boot.tv_sec = 0; - boot.tv_nsec = 0; + read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); + if (timespec64_valid_strict(&wall_time) && + timespec64_to_ns(&wall_time) > 0) { + persistent_clock_exists = true; + } else if (timespec64_to_ns(&wall_time) != 0) { + pr_warn("Persistent clock returned invalid value"); + wall_time = (struct timespec64){0}; } + if (timespec64_compare(&wall_time, &boot_offset) < 0) + boot_offset = (struct timespec64){0}; + + /* + * We want set wall_to_mono, so the following is true: + * wall time + wall_to_mono = boot time + */ + wall_to_mono = timespec64_sub(boot_offset, wall_time); + raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); ntp_init(); @@ -1573,13 +1580,10 @@ void __init timekeeping_init(void) clock->enable(clock); tk_setup_internals(tk, clock); - tk_set_xtime(tk, &now); + tk_set_xtime(tk, &wall_time); tk->raw_sec = 0; - if (boot.tv_sec == 0 && boot.tv_nsec == 0) - boot = tk_xtime(tk); - set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec); - tk_set_wall_to_mono(tk, tmp); + tk_set_wall_to_mono(tk, wall_to_mono); timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); |