Diffstat (limited to 'arch/x86/kernel')
54 files changed, 1658 insertions, 553 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9b294c13809a..bb5abfef0256 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -28,7 +28,6 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n KASAN_SANITIZE_stacktrace.o := n KASAN_SANITIZE_paravirt.o := n -OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y OBJECT_FILES_NON_STANDARD_test_nx.o := y OBJECT_FILES_NON_STANDARD_paravirt_patch.o := y @@ -53,6 +52,8 @@ obj-y += setup.o x86_init.o i8259.o irqinit.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o +obj-$(CONFIG_X86_32) += sys_ia32.o +obj-$(CONFIG_IA32_EMULATION) += sys_ia32.o obj-$(CONFIG_X86_64) += sys_x86_64.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 04205ce127a1..1ae5439a9a85 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -45,6 +45,7 @@ EXPORT_SYMBOL(acpi_disabled); #define PREFIX "ACPI: " int acpi_noirq; /* skip ACPI IRQ initialization */ +int acpi_nobgrt; /* skip ACPI BGRT */ int acpi_pci_disabled; /* skip ACPI PCI scan and IRQ initialization */ EXPORT_SYMBOL(acpi_pci_disabled); @@ -1619,7 +1620,7 @@ int __init acpi_boot_init(void) acpi_process_madt(); acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet); - if (IS_ENABLED(CONFIG_ACPI_BGRT)) + if (IS_ENABLED(CONFIG_ACPI_BGRT) && !acpi_nobgrt) acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt); if (!acpi_noirq) @@ -1671,6 +1672,13 @@ static int __init parse_acpi(char *arg) } early_param("acpi", parse_acpi); +static int __init parse_acpi_bgrt(char *arg) +{ + acpi_nobgrt = true; + return 0; +} +early_param("bgrt_disable", parse_acpi_bgrt); + /* FIXME: Using pci= for an ACPI parameter is a travesty. */ static int __init parse_pci(char *arg) { @@ -1740,7 +1748,7 @@ int __acpi_acquire_global_lock(unsigned int *lock) new = (((old & ~0x3) + 2) + ((old >> 1) & 0x1)); val = cmpxchg(lock, old, new); } while (unlikely (val != old)); - return (new < 3) ? -1 : 0; + return ((new & 0x3) < 3) ? -1 : 0; } int __acpi_release_global_lock(unsigned int *lock) diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 26b7256f590f..ed3b04483972 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -43,7 +43,7 @@ unsigned long acpi_get_wakeup_address(void) * * Wrapper around acpi_enter_sleep_state() to be called by assmebly. 
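The `__acpi_acquire_global_lock()` hunk above masks the result down to the two architectural lock bits (bit 0 = pending, bit 1 = owned, per the ACPI FACS definition) so that firmware-defined upper bits can no longer make a successful acquisition look like contention. A minimal userspace model of the arithmetic, with the cmpxchg retry loop elided:

```c
/* Userspace model of the FACS global-lock arithmetic patched above.
 * Bit 0 of the lock dword is "pending", bit 1 is "owned"; remaining
 * bits are firmware-defined and must be ignored when deciding whether
 * the lock was acquired. */
#include <assert.h>
#include <stdio.h>

/* Returns -1 (truthy: acquired) or 0 (must wait for release). */
static int acquire_global_lock(unsigned int *lock)
{
	unsigned int old = *lock, new;

	/* Set "owned"; also set "pending" if it was already owned. */
	new = ((old & ~0x3u) + 2) + ((old >> 1) & 0x1);
	*lock = new;	/* the kernel does this with cmpxchg in a loop */

	/* Only the low two bits matter: owned-without-pending == acquired. */
	return ((new & 0x3) < 3) ? -1 : 0;
}

int main(void)
{
	unsigned int lock;

	lock = 0x0;	/* free */
	assert(acquire_global_lock(&lock) == -1);

	lock = 0x2;	/* owned by firmware: we only set "pending" */
	assert(acquire_global_lock(&lock) == 0);

	lock = 0x10;	/* free, but a firmware-defined bit is set */
	assert(acquire_global_lock(&lock) == -1);	/* old code returned 0 here */

	puts("ok");
	return 0;
}
```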
*/ -acpi_status asmlinkage __visible x86_acpi_enter_sleep_state(u8 state) +asmlinkage acpi_status __visible x86_acpi_enter_sleep_state(u8 state) { return acpi_enter_sleep_state(state); } diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h index d06c2079b6c1..171a40c74db6 100644 --- a/arch/x86/kernel/acpi/sleep.h +++ b/arch/x86/kernel/acpi/sleep.h @@ -19,4 +19,4 @@ extern void do_suspend_lowlevel(void); extern int x86_acpi_suspend_lowlevel(void); -acpi_status asmlinkage x86_acpi_enter_sleep_state(u8 state); +asmlinkage acpi_status x86_acpi_enter_sleep_state(u8 state); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 15ac0d5f4b40..7867dfb3963e 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1167,8 +1167,8 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries atomic_cond_read_acquire(&desc.refs, !VAL); } -void text_poke_loc_init(struct text_poke_loc *tp, void *addr, - const void *opcode, size_t len, const void *emulate) +static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, + const void *opcode, size_t len, const void *emulate) { struct insn insn; diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 69aed0ebbdfc..b6b3297851f3 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -36,10 +36,9 @@ static const struct pci_device_id amd_root_ids[] = { {} }; - #define PCI_DEVICE_ID_AMD_CNB17H_F4 0x1704 -const struct pci_device_id amd_nb_misc_ids[] = { +static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, @@ -56,7 +55,6 @@ const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) }, {} }; -EXPORT_SYMBOL_GPL(amd_nb_misc_ids); static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 5f973fed3c9f..81b9c63dae1b 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -546,12 +546,6 @@ static struct clock_event_device lapic_clockevent = { }; static DEFINE_PER_CPU(struct clock_event_device, lapic_events); -#define DEADLINE_MODEL_MATCH_FUNC(model, func) \ - { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&func } - -#define DEADLINE_MODEL_MATCH_REV(model, rev) \ - { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)rev } - static u32 hsx_deadline_rev(void) { switch (boot_cpu_data.x86_stepping) { @@ -588,23 +582,23 @@ static u32 skx_deadline_rev(void) } static const struct x86_cpu_id deadline_match[] = { - DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X, hsx_deadline_rev), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X, 0x0b000020), - DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_D, bdx_deadline_rev), - DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_SKYLAKE_X, skx_deadline_rev), + X86_MATCH_INTEL_FAM6_MODEL( HASWELL_X, &hsx_deadline_rev), + X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_X, 0x0b000020), + X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_D, &bdx_deadline_rev), + X86_MATCH_INTEL_FAM6_MODEL( SKYLAKE_X, &skx_deadline_rev), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL, 0x22), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_L, 0x20), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_G, 0x17), + X86_MATCH_INTEL_FAM6_MODEL( 
HASWELL, 0x22), + X86_MATCH_INTEL_FAM6_MODEL( HASWELL_L, 0x20), + X86_MATCH_INTEL_FAM6_MODEL( HASWELL_G, 0x17), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL, 0x25), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_G, 0x17), + X86_MATCH_INTEL_FAM6_MODEL( BROADWELL, 0x25), + X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_G, 0x17), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_L, 0xb2), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE, 0xb2), + X86_MATCH_INTEL_FAM6_MODEL( SKYLAKE_L, 0xb2), + X86_MATCH_INTEL_FAM6_MODEL( SKYLAKE, 0xb2), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_L, 0x52), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE, 0x52), + X86_MATCH_INTEL_FAM6_MODEL( KABYLAKE_L, 0x52), + X86_MATCH_INTEL_FAM6_MODEL( KABYLAKE, 0x52), {}, }; diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 48293d15f1e1..67768e54438b 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -557,6 +557,12 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, irqd->hwirq = virq + i; irqd_set_single_target(irqd); /* + * Prevent that any of these interrupts is invoked in + * non interrupt context via e.g. generic_handle_irq() + * as that can corrupt the affinity move state. + */ + irqd_set_handle_enforce_irqctx(irqd); + /* * Legacy vectors are already assigned when the IOAPIC * takes them over. They stay on the same vector. This is * required for check_timer() to work correctly as it might diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 5c7ee3df4d0b..3ca07ad552ae 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -88,7 +88,6 @@ static void __used common(void) OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); OFFSET(BP_init_size, boot_params, hdr.init_size); OFFSET(BP_pref_address, boot_params, hdr.pref_address); - OFFSET(BP_code32_start, boot_params, hdr.code32_start); BLANK(); DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 82826f2275cc..6e043f295a60 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -3,12 +3,9 @@ # error "Please do not build this file directly, build asm-offsets.c instead" #endif -#include <asm/ucontext.h> +#include <linux/efi.h> -#define __SYSCALL_I386(nr, sym, qual) [nr] = 1, -static char syscalls[] = { -#include <asm/syscalls_32.h> -}; +#include <asm/ucontext.h> /* workaround for a warning with -Wmissing-prototypes */ void foo(void); @@ -62,6 +59,5 @@ void foo(void) #endif BLANK(); - DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); - DEFINE(NR_syscalls, sizeof(syscalls)); + DEFINE(EFI_svam, offsetof(efi_runtime_services_t, set_virtual_address_map)); } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 24d2fde30d00..c2a47016f243 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -5,30 +5,6 @@ #include <asm/ia32.h> -#define __SYSCALL_64(nr, sym, qual) [nr] = 1, -#define __SYSCALL_X32(nr, sym, qual) -static char syscalls_64[] = { -#include <asm/syscalls_64.h> -}; -#undef __SYSCALL_64 -#undef __SYSCALL_X32 - -#ifdef CONFIG_X86_X32_ABI -#define __SYSCALL_64(nr, sym, qual) -#define __SYSCALL_X32(nr, sym, qual) [nr] = 1, -static char syscalls_x32[] = { -#include <asm/syscalls_64.h> -}; -#undef __SYSCALL_64 -#undef __SYSCALL_X32 -#endif - -#define __SYSCALL_I386(nr, sym, qual) [nr] = 1, -static char syscalls_ia32[] = { -#include 
<asm/syscalls_32.h> -}; -#undef __SYSCALL_I386 - #if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS) #include <asm/kvm_para.h> #endif @@ -90,17 +66,5 @@ int main(void) DEFINE(stack_canary_offset, offsetof(struct fixed_percpu_data, stack_canary)); BLANK(); #endif - - DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); - DEFINE(NR_syscalls, sizeof(syscalls_64)); - -#ifdef CONFIG_X86_X32_ABI - DEFINE(__NR_syscall_x32_max, sizeof(syscalls_x32) - 1); - DEFINE(X32_NR_syscalls, sizeof(syscalls_x32)); -#endif - - DEFINE(__NR_syscall_compat_max, sizeof(syscalls_ia32) - 1); - DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32)); - return 0; } diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 1f875fbe1384..547ad7bbf0e0 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -394,6 +394,35 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c) per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; } +static void amd_detect_ppin(struct cpuinfo_x86 *c) +{ + unsigned long long val; + + if (!cpu_has(c, X86_FEATURE_AMD_PPIN)) + return; + + /* When PPIN is defined in CPUID, still need to check PPIN_CTL MSR */ + if (rdmsrl_safe(MSR_AMD_PPIN_CTL, &val)) + goto clear_ppin; + + /* PPIN is locked in disabled mode, clear feature bit */ + if ((val & 3UL) == 1UL) + goto clear_ppin; + + /* If PPIN is disabled, try to enable it */ + if (!(val & 2UL)) { + wrmsrl_safe(MSR_AMD_PPIN_CTL, val | 2UL); + rdmsrl_safe(MSR_AMD_PPIN_CTL, &val); + } + + /* If PPIN_EN bit is 1, return from here; otherwise fall through */ + if (val & 2UL) + return; + +clear_ppin: + clear_cpu_cap(c, X86_FEATURE_AMD_PPIN); +} + u16 amd_get_nb_id(int cpu) { return per_cpu(cpu_llc_id, cpu); @@ -926,7 +955,8 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x12: init_amd_ln(c); break; case 0x15: init_amd_bd(c); break; case 0x16: init_amd_jg(c); break; - case 0x17: init_amd_zn(c); break; + case 0x17: fallthrough; + case 0x19: init_amd_zn(c); break; } /* @@ -941,6 +971,7 @@ static void init_amd(struct cpuinfo_x86 *c) amd_detect_cmp(c); amd_get_topology(c); srat_detect_node(c); + amd_detect_ppin(c); init_amd_cacheinfo(c); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4cdb123ff66a..bed0cb83fe24 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1008,8 +1008,8 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #define NO_ITLB_MULTIHIT BIT(7) #define NO_SPECTRE_V2 BIT(8) -#define VULNWL(_vendor, _family, _model, _whitelist) \ - { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } +#define VULNWL(vendor, family, model, whitelist) \ + X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist) #define VULNWL_INTEL(model, whitelist) \ VULNWL(INTEL, 6, INTEL_FAM6_##model, whitelist) @@ -1224,6 +1224,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) cpu_set_bug_bits(c); + cpu_set_core_cap_bits(c); + fpu__init_system(c); #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index 0268185bef94..29a3bedabd06 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -5,6 +5,7 @@ #include <asm/msr-index.h> #include <asm/processor.h> #include <asm/vmx.h> +#include "cpu.h" #undef pr_fmt #define pr_fmt(fmt) "x86/cpu: " fmt diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index be82cd5841c3..9a26e972cdea 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -19,6 +19,8 @@ #include <asm/microcode_intel.h> 
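The new `amd_detect_ppin()` above walks a small state machine over MSR_AMD_PPIN_CTL before trusting the CPUID bit. A userspace model of that decision, assuming the bit meanings from the hunk's own comments (bit 0 = lockout, bit 1 = PPIN_EN):

```c
/* Model of the MSR_AMD_PPIN_CTL decision in amd_detect_ppin() above.
 * Returns 1 if the X86_FEATURE_AMD_PPIN bit should stay set. */
#include <assert.h>

static int ppin_usable(unsigned long long ctl, int can_write)
{
	if ((ctl & 3ULL) == 1ULL)	/* locked in disabled mode */
		return 0;
	if (!(ctl & 2ULL) && can_write)	/* disabled but unlocked: enable */
		ctl |= 2ULL;
	return (ctl & 2ULL) != 0;	/* usable only if PPIN_EN ends up set */
}

int main(void)
{
	assert(ppin_usable(2, 0) == 1);	/* already enabled */
	assert(ppin_usable(1, 1) == 0);	/* locked off: clear feature bit */
	assert(ppin_usable(0, 1) == 1);	/* off but unlocked: enable it */
	assert(ppin_usable(0, 0) == 0);	/* off, write failed: clear it */
	return 0;
}
```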
#include <asm/hwcap2.h> #include <asm/elf.h> +#include <asm/cpu_device_id.h> +#include <asm/cmdline.h> #ifdef CONFIG_X86_64 #include <linux/topology.h> @@ -31,6 +33,20 @@ #include <asm/apic.h> #endif +enum split_lock_detect_state { + sld_off = 0, + sld_warn, + sld_fatal, +}; + +/* + * Default to sld_off because most systems do not support split lock detection + * split_lock_setup() will switch this to sld_warn on systems that support + * split lock detect, unless there is a command line override. + */ +static enum split_lock_detect_state sld_state __ro_after_init = sld_off; +static u64 msr_test_ctrl_cache __ro_after_init; + /* * Processors which have self-snooping capability can handle conflicting * memory type across CPUs by snooping its own cache. However, there exists @@ -570,6 +586,8 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c) wrmsrl(MSR_MISC_FEATURES_ENABLES, msr); } +static void split_lock_init(void); + static void init_intel(struct cpuinfo_x86 *c) { early_init_intel(c); @@ -684,6 +702,8 @@ static void init_intel(struct cpuinfo_x86 *c) tsx_enable(); if (tsx_ctrl_state == TSX_CTRL_DISABLE) tsx_disable(); + + split_lock_init(); } #ifdef CONFIG_X86_32 @@ -945,3 +965,166 @@ static const struct cpu_dev intel_cpu_dev = { }; cpu_dev_register(intel_cpu_dev); + +#undef pr_fmt +#define pr_fmt(fmt) "x86/split lock detection: " fmt + +static const struct { + const char *option; + enum split_lock_detect_state state; +} sld_options[] __initconst = { + { "off", sld_off }, + { "warn", sld_warn }, + { "fatal", sld_fatal }, +}; + +static inline bool match_option(const char *arg, int arglen, const char *opt) +{ + int len = strlen(opt); + + return len == arglen && !strncmp(arg, opt, len); +} + +static bool split_lock_verify_msr(bool on) +{ + u64 ctrl, tmp; + + if (rdmsrl_safe(MSR_TEST_CTRL, &ctrl)) + return false; + if (on) + ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + else + ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + if (wrmsrl_safe(MSR_TEST_CTRL, ctrl)) + return false; + rdmsrl(MSR_TEST_CTRL, tmp); + return ctrl == tmp; +} + +static void __init split_lock_setup(void) +{ + enum split_lock_detect_state state = sld_warn; + char arg[20]; + int i, ret; + + if (!split_lock_verify_msr(false)) { + pr_info("MSR access failed: Disabled\n"); + return; + } + + ret = cmdline_find_option(boot_command_line, "split_lock_detect", + arg, sizeof(arg)); + if (ret >= 0) { + for (i = 0; i < ARRAY_SIZE(sld_options); i++) { + if (match_option(arg, ret, sld_options[i].option)) { + state = sld_options[i].state; + break; + } + } + } + + switch (state) { + case sld_off: + pr_info("disabled\n"); + return; + case sld_warn: + pr_info("warning about user-space split_locks\n"); + break; + case sld_fatal: + pr_info("sending SIGBUS on user-space split_locks\n"); + break; + } + + rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + + if (!split_lock_verify_msr(true)) { + pr_info("MSR access failed: Disabled\n"); + return; + } + + sld_state = state; + setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT); +} + +/* + * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking + * is not implemented as one thread could undo the setting of the other + * thread immediately after dropping the lock anyway. 
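`split_lock_setup()` above resolves the `split_lock_detect=` parameter against a small option table; `match_option()` requires an exact-length match so that a prefix such as "warnx" cannot select "warn". A standalone model of the lookup:

```c
/* The option-table lookup from split_lock_setup() above, modeled in
 * userspace. Unrecognized strings fall back to the sld_warn default. */
#include <assert.h>
#include <string.h>

enum sld_state { SLD_OFF, SLD_WARN, SLD_FATAL };

static const struct {
	const char *option;
	enum sld_state state;
} sld_options[] = {
	{ "off",   SLD_OFF   },
	{ "warn",  SLD_WARN  },
	{ "fatal", SLD_FATAL },
};

static int match_option(const char *arg, int arglen, const char *opt)
{
	int len = strlen(opt);

	return len == arglen && !strncmp(arg, opt, len);
}

static enum sld_state parse_sld(const char *arg, int arglen)
{
	unsigned int i;

	for (i = 0; i < sizeof(sld_options) / sizeof(sld_options[0]); i++)
		if (match_option(arg, arglen, sld_options[i].option))
			return sld_options[i].state;
	return SLD_WARN;	/* default when the option is unrecognized */
}

int main(void)
{
	assert(parse_sld("fatal", 5) == SLD_FATAL);
	assert(parse_sld("warnx", 5) == SLD_WARN);	/* exact match required */
	return 0;
}
```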
+ */ +static void sld_update_msr(bool on) +{ + u64 test_ctrl_val = msr_test_ctrl_cache; + + if (on) + test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + + wrmsrl(MSR_TEST_CTRL, test_ctrl_val); +} + +static void split_lock_init(void) +{ + split_lock_verify_msr(sld_state != sld_off); +} + +bool handle_user_split_lock(struct pt_regs *regs, long error_code) +{ + if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal) + return false; + + pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n", + current->comm, current->pid, regs->ip); + + /* + * Disable the split lock detection for this task so it can make + * progress and set TIF_SLD so the detection is re-enabled via + * switch_to_sld() when the task is scheduled out. + */ + sld_update_msr(false); + set_tsk_thread_flag(current, TIF_SLD); + return true; +} + +/* + * This function is called only when switching between tasks with + * different split-lock detection modes. It sets the MSR for the + * mode of the new task. This is right most of the time, but since + * the MSR is shared by hyperthreads on a physical core there can + * be glitches when the two threads need different modes. + */ +void switch_to_sld(unsigned long tifn) +{ + sld_update_msr(!(tifn & _TIF_SLD)); +} + +#define SPLIT_LOCK_CPU(model) {X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY} + +/* + * The following processors have the split lock detection feature. But + * since they don't have the IA32_CORE_CAPABILITIES MSR, the feature cannot + * be enumerated. Enable it by family and model matching on these + * processors. + */ +static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { + SPLIT_LOCK_CPU(INTEL_FAM6_ICELAKE_X), + SPLIT_LOCK_CPU(INTEL_FAM6_ICELAKE_L), + {} +}; + +void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) +{ + u64 ia32_core_caps = 0; + + if (c->x86_vendor != X86_VENDOR_INTEL) + return; + if (cpu_has(c, X86_FEATURE_CORE_CAPABILITIES)) { + /* Enumerate features reported in IA32_CORE_CAPABILITIES MSR. */ + rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps); + } else if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + /* Enumerate split lock detection by family and model. */ + if (x86_match_cpu(split_lock_cpu_ids)) + ia32_core_caps |= MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT; + } + + if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT) + split_lock_setup(); +} diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index 6dd78d8235e4..d3482eb43ff3 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -16,12 +16,17 @@ * respective wildcard entries. * * A typical table entry would be to match a specific CPU - * { X86_VENDOR_INTEL, 6, 0x12 } - * or to match a specific CPU feature - * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) } + * + * X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_BROADWELL, + * X86_FEATURE_ANY, NULL); * * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY, - * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor) + * %X86_MODEL_ANY, %X86_FEATURE_ANY (except for vendor) + * + * asm/cpu_device_id.h contains a set of useful macros which are shortcuts + * for various common selections. The above can be shortened to: + * + * X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, NULL); * * Arrays used to match for this should also be declared using * MODULE_DEVICE_TABLE(x86cpu, ...) 
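The rewritten match.c comment above documents the new shorthand macros. A hedged sketch of how a driver would consume them (kernel context assumed; the `mydrv` names are hypothetical):

```c
/* Sketch of a driver using the X86_MATCH_INTEL_FAM6_MODEL() shorthand
 * documented above; per-model data travels in the last argument and
 * comes back via id->driver_data. */
#include <linux/module.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>

static const struct x86_cpu_id mydrv_cpu_ids[] = {
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL,	NULL),
	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X,	NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, mydrv_cpu_ids);

static int __init mydrv_init(void)
{
	const struct x86_cpu_id *id = x86_match_cpu(mydrv_cpu_ids);

	if (!id)
		return -ENODEV;	/* not one of the listed models */
	/* id->driver_data carries the per-model argument, if any */
	return 0;
}
module_init(mydrv_init);
```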
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 2c4f949611e4..54165f3569e8 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -142,6 +142,8 @@ void mce_setup(struct mce *m) if (this_cpu_has(X86_FEATURE_INTEL_PPIN)) rdmsrl(MSR_PPIN, m->ppin); + else if (this_cpu_has(X86_FEATURE_AMD_PPIN)) + rdmsrl(MSR_AMD_PPIN, m->ppin); m->microcode = boot_cpu_data.microcode; } @@ -1213,8 +1215,14 @@ static void __mc_scan_banks(struct mce *m, struct mce *final, * On Intel systems this is entered on all CPUs in parallel through * MCE broadcast. However some CPUs might be broken beyond repair, * so be always careful when synchronizing with others. + * + * Tracing and kprobes are disabled: if we interrupted a kernel context + * with IF=1, we need to minimize stack usage. There are also recursion + * issues: if the machine check was due to a failure of the memory + * backing the user stack, tracing that reads the user stack will cause + * potentially infinite recursion. */ -void do_machine_check(struct pt_regs *regs, long error_code) +void notrace do_machine_check(struct pt_regs *regs, long error_code) { DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); DECLARE_BITMAP(toclear, MAX_NR_BANKS); @@ -1360,6 +1368,7 @@ out_ist: ist_exit(regs); } EXPORT_SYMBOL_GPL(do_machine_check); +NOKPROBE_SYMBOL(do_machine_check); #ifndef CONFIG_MEMORY_FAILURE int memory_failure(unsigned long pfn, int flags) @@ -1877,6 +1886,8 @@ bool filter_mce(struct mce *m) { if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) return amd_filter_mce(m); + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + return intel_filter_mce(m); return false; } @@ -1892,10 +1903,11 @@ static void unexpected_machine_check(struct pt_regs *regs, long error_code) void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; -dotraplinkage void do_mce(struct pt_regs *regs, long error_code) +dotraplinkage notrace void do_mce(struct pt_regs *regs, long error_code) { machine_check_vector(regs, error_code); } +NOKPROBE_SYMBOL(do_mce); /* * Called for each booted CPU to set up machine checks. diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c index 7c8958dee103..d089567a9ce8 100644 --- a/arch/x86/kernel/cpu/mce/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c @@ -29,11 +29,7 @@ static char *mce_helper_argv[2] = { mce_helper, NULL }; * separate MCEs from kernel messages to avoid bogus bug reports. */ -static struct mce_log_buffer mcelog = { - .signature = MCE_LOG_SIGNATURE, - .len = MCE_LOG_LEN, - .recordlen = sizeof(struct mce), -}; +static struct mce_log_buffer *mcelog; static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); @@ -45,21 +41,21 @@ static int dev_mce_log(struct notifier_block *nb, unsigned long val, mutex_lock(&mce_chrdev_read_mutex); - entry = mcelog.next; + entry = mcelog->next; /* * When the buffer fills up discard new entries. 
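The comment above states the /dev/mcelog overflow policy: once the buffer fills, new records are discarded and an overflow flag is latched, on the assumption that the earliest errors are the most interesting. A simplified standalone model of that policy:

```c
/* Simplified model of the mcelog overflow behavior described above:
 * keep the oldest records, drop new ones, and latch an overflow flag. */
#include <assert.h>

#define LOG_LEN		4
#define OVERFLOW_FLAG	0x1

struct log {
	unsigned int next, flags;
	int entry[LOG_LEN];
};

static void log_add(struct log *l, int rec)
{
	if (l->next >= LOG_LEN) {
		l->flags |= OVERFLOW_FLAG;	/* discard new, keep old */
		return;
	}
	l->entry[l->next++] = rec;
}

int main(void)
{
	struct log l = { 0 };
	int i;

	for (i = 0; i < 6; i++)
		log_add(&l, i);
	assert(l.next == LOG_LEN);		/* records 0..3 kept */
	assert(l.flags & OVERFLOW_FLAG);	/* records 4..5 dropped */
	return 0;
}
```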
Assume that the * earlier errors are the more interesting ones: */ - if (entry >= MCE_LOG_LEN) { - set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); + if (entry >= mcelog->len) { + set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog->flags); goto unlock; } - mcelog.next = entry + 1; + mcelog->next = entry + 1; - memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); - mcelog.entry[entry].finished = 1; + memcpy(mcelog->entry + entry, mce, sizeof(struct mce)); + mcelog->entry[entry].finished = 1; /* wake processes polling /dev/mcelog */ wake_up_interruptible(&mce_chrdev_wait); @@ -214,21 +210,21 @@ static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, /* Only supports full reads right now */ err = -EINVAL; - if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) + if (*off != 0 || usize < mcelog->len * sizeof(struct mce)) goto out; - next = mcelog.next; + next = mcelog->next; err = 0; for (i = 0; i < next; i++) { - struct mce *m = &mcelog.entry[i]; + struct mce *m = &mcelog->entry[i]; err |= copy_to_user(buf, m, sizeof(*m)); buf += sizeof(*m); } - memset(mcelog.entry, 0, next * sizeof(struct mce)); - mcelog.next = 0; + memset(mcelog->entry, 0, next * sizeof(struct mce)); + mcelog->next = 0; if (err) err = -EFAULT; @@ -242,7 +238,7 @@ out: static __poll_t mce_chrdev_poll(struct file *file, poll_table *wait) { poll_wait(file, &mce_chrdev_wait, wait); - if (READ_ONCE(mcelog.next)) + if (READ_ONCE(mcelog->next)) return EPOLLIN | EPOLLRDNORM; if (!mce_apei_read_done && apei_check_mce()) return EPOLLIN | EPOLLRDNORM; @@ -261,13 +257,13 @@ static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, case MCE_GET_RECORD_LEN: return put_user(sizeof(struct mce), p); case MCE_GET_LOG_LEN: - return put_user(MCE_LOG_LEN, p); + return put_user(mcelog->len, p); case MCE_GETCLEAR_FLAGS: { unsigned flags; do { - flags = mcelog.flags; - } while (cmpxchg(&mcelog.flags, flags, 0) != flags); + flags = mcelog->flags; + } while (cmpxchg(&mcelog->flags, flags, 0) != flags); return put_user(flags, p); } @@ -339,8 +335,18 @@ static struct miscdevice mce_chrdev_device = { static __init int dev_mcelog_init_device(void) { + int mce_log_len; int err; + mce_log_len = max(MCE_LOG_MIN_LEN, num_online_cpus()); + mcelog = kzalloc(sizeof(*mcelog) + mce_log_len * sizeof(struct mce), GFP_KERNEL); + if (!mcelog) + return -ENOMEM; + + strncpy(mcelog->signature, MCE_LOG_SIGNATURE, sizeof(mcelog->signature)); + mcelog->len = mce_log_len; + mcelog->recordlen = sizeof(struct mce); + /* register character device /dev/mcelog */ err = misc_register(&mce_chrdev_device); if (err) { @@ -350,6 +356,7 @@ static __init int dev_mcelog_init_device(void) else pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); + kfree(mcelog); return err; } diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index f996ffb887bc..d8f9230d2034 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -521,3 +521,20 @@ void mce_intel_feature_clear(struct cpuinfo_x86 *c) { intel_clear_lmce(); } + +bool intel_filter_mce(struct mce *m) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + /* MCE errata HSD131, HSM142, HSW131, BDM48, and HSM142 */ + if ((c->x86 == 6) && + ((c->x86_model == INTEL_FAM6_HASWELL) || + (c->x86_model == INTEL_FAM6_HASWELL_L) || + (c->x86_model == INTEL_FAM6_BROADWELL) || + (c->x86_model == INTEL_FAM6_HASWELL_G)) && + (m->bank == 0) && + ((m->status & 0xa0000000ffffffff) == 0x80000000000f0005)) + return true; + + return false; +} diff --git 
a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index b785c0d0b590..3b008172ad73 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -8,6 +8,9 @@ #include <linux/device.h> #include <asm/mce.h> +/* Pointer to the installed machine check handler for this CPU setup. */ +extern void (*machine_check_vector)(struct pt_regs *, long error_code); + enum severity_level { MCE_NO_SEVERITY, MCE_DEFERRED_SEVERITY, @@ -48,6 +51,7 @@ void cmci_disable_bank(int bank); void intel_init_cmci(void); void intel_init_lmce(void); void intel_clear_lmce(void); +bool intel_filter_mce(struct mce *m); #else # define cmci_intel_adjust_timer mce_adjust_timer_default static inline bool mce_intel_cmci_poll(void) { return false; } @@ -56,6 +60,7 @@ static inline void cmci_disable_bank(int bank) { } static inline void intel_init_cmci(void) { } static inline void intel_init_lmce(void) { } static inline void intel_clear_lmce(void) { } +static inline bool intel_filter_mce(struct mce *m) { return false; }; #endif void mce_timer_kick(unsigned long interval); diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c index c222f283b456..300e3fd5ade3 100644 --- a/arch/x86/kernel/cpu/umwait.c +++ b/arch/x86/kernel/cpu/umwait.c @@ -4,6 +4,7 @@ #include <linux/cpu.h> #include <asm/msr.h> +#include <asm/mwait.h> #define UMWAIT_C02_ENABLE 0 diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 46d732696c1c..9b6fafa69be9 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -25,6 +25,8 @@ #include <linux/init.h> #include <linux/export.h> #include <linux/clocksource.h> +#include <linux/cpu.h> +#include <linux/reboot.h> #include <asm/div64.h> #include <asm/x86_init.h> #include <asm/hypervisor.h> @@ -47,6 +49,11 @@ #define VMWARE_CMD_GETVCPU_INFO 68 #define VMWARE_CMD_LEGACY_X2APIC 3 #define VMWARE_CMD_VCPU_RESERVED 31 +#define VMWARE_CMD_STEALCLOCK 91 + +#define STEALCLOCK_NOT_AVAILABLE (-1) +#define STEALCLOCK_DISABLED 0 +#define STEALCLOCK_ENABLED 1 #define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \ __asm__("inl (%%dx), %%eax" : \ @@ -86,6 +93,18 @@ } \ } while (0) +struct vmware_steal_time { + union { + uint64_t clock; /* stolen time counter in units of vtsc */ + struct { + /* only for little-endian */ + uint32_t clock_low; + uint32_t clock_high; + }; + }; + uint64_t reserved[7]; +}; + static unsigned long vmware_tsc_khz __ro_after_init; static u8 vmware_hypercall_mode __ro_after_init; @@ -103,15 +122,25 @@ static unsigned long vmware_get_tsc_khz(void) #ifdef CONFIG_PARAVIRT static struct cyc2ns_data vmware_cyc2ns __ro_after_init; -static int vmw_sched_clock __initdata = 1; +static bool vmw_sched_clock __initdata = true; +static DEFINE_PER_CPU_DECRYPTED(struct vmware_steal_time, vmw_steal_time) __aligned(64); +static bool has_steal_clock; +static bool steal_acc __initdata = true; /* steal time accounting */ static __init int setup_vmw_sched_clock(char *s) { - vmw_sched_clock = 0; + vmw_sched_clock = false; return 0; } early_param("no-vmw-sched-clock", setup_vmw_sched_clock); +static __init int parse_no_stealacc(char *arg) +{ + steal_acc = false; + return 0; +} +early_param("no-steal-acc", parse_no_stealacc); + static unsigned long long notrace vmware_sched_clock(void) { unsigned long long ns; @@ -122,7 +151,7 @@ static unsigned long long notrace vmware_sched_clock(void) return ns; } -static void __init vmware_sched_clock_setup(void) +static void __init vmware_cyc2ns_setup(void) { struct cyc2ns_data *d = 
&vmware_cyc2ns; unsigned long long tsc_now = rdtsc(); @@ -132,17 +161,201 @@ static void __init vmware_sched_clock_setup(void) d->cyc2ns_offset = mul_u64_u32_shr(tsc_now, d->cyc2ns_mul, d->cyc2ns_shift); - pv_ops.time.sched_clock = vmware_sched_clock; - pr_info("using sched offset of %llu ns\n", d->cyc2ns_offset); + pr_info("using clock offset of %llu ns\n", d->cyc2ns_offset); +} + +static int vmware_cmd_stealclock(uint32_t arg1, uint32_t arg2) +{ + uint32_t result, info; + + asm volatile (VMWARE_HYPERCALL : + "=a"(result), + "=c"(info) : + "a"(VMWARE_HYPERVISOR_MAGIC), + "b"(0), + "c"(VMWARE_CMD_STEALCLOCK), + "d"(0), + "S"(arg1), + "D"(arg2) : + "memory"); + return result; +} + +static bool stealclock_enable(phys_addr_t pa) +{ + return vmware_cmd_stealclock(upper_32_bits(pa), + lower_32_bits(pa)) == STEALCLOCK_ENABLED; +} + +static int __stealclock_disable(void) +{ + return vmware_cmd_stealclock(0, 1); +} + +static void stealclock_disable(void) +{ + __stealclock_disable(); +} + +static bool vmware_is_stealclock_available(void) +{ + return __stealclock_disable() != STEALCLOCK_NOT_AVAILABLE; +} + +/** + * vmware_steal_clock() - read the per-cpu steal clock + * @cpu: the cpu number whose steal clock we want to read + * + * The function reads the steal clock if we are on a 64-bit system, otherwise + * reads it in parts, checking that the high part didn't change in the + * meantime. + * + * Return: + * The steal clock reading in ns. + */ +static uint64_t vmware_steal_clock(int cpu) +{ + struct vmware_steal_time *steal = &per_cpu(vmw_steal_time, cpu); + uint64_t clock; + + if (IS_ENABLED(CONFIG_64BIT)) + clock = READ_ONCE(steal->clock); + else { + uint32_t initial_high, low, high; + + do { + initial_high = READ_ONCE(steal->clock_high); + /* Do not reorder initial_high and high readings */ + virt_rmb(); + low = READ_ONCE(steal->clock_low); + /* Keep low reading in between */ + virt_rmb(); + high = READ_ONCE(steal->clock_high); + } while (initial_high != high); + + clock = ((uint64_t)high << 32) | low; + } + + return mul_u64_u32_shr(clock, vmware_cyc2ns.cyc2ns_mul, + vmware_cyc2ns.cyc2ns_shift); +} + +static void vmware_register_steal_time(void) +{ + int cpu = smp_processor_id(); + struct vmware_steal_time *st = &per_cpu(vmw_steal_time, cpu); + + if (!has_steal_clock) + return; + + if (!stealclock_enable(slow_virt_to_phys(st))) { + has_steal_clock = false; + return; + } + + pr_info("vmware-stealtime: cpu %d, pa %llx\n", + cpu, (unsigned long long) slow_virt_to_phys(st)); } +static void vmware_disable_steal_time(void) +{ + if (!has_steal_clock) + return; + + stealclock_disable(); +} + +static void vmware_guest_cpu_init(void) +{ + if (has_steal_clock) + vmware_register_steal_time(); +} + +static void vmware_pv_guest_cpu_reboot(void *unused) +{ + vmware_disable_steal_time(); +} + +static int vmware_pv_reboot_notify(struct notifier_block *nb, + unsigned long code, void *unused) +{ + if (code == SYS_RESTART) + on_each_cpu(vmware_pv_guest_cpu_reboot, NULL, 1); + return NOTIFY_DONE; +} + +static struct notifier_block vmware_pv_reboot_nb = { + .notifier_call = vmware_pv_reboot_notify, +}; + +#ifdef CONFIG_SMP +static void __init vmware_smp_prepare_boot_cpu(void) +{ + vmware_guest_cpu_init(); + native_smp_prepare_boot_cpu(); +} + +static int vmware_cpu_online(unsigned int cpu) +{ + local_irq_disable(); + vmware_guest_cpu_init(); + local_irq_enable(); + return 0; +} + +static int vmware_cpu_down_prepare(unsigned int cpu) +{ + local_irq_disable(); + vmware_disable_steal_time(); + local_irq_enable(); + 
return 0; +} +#endif + +static __init int activate_jump_labels(void) +{ + if (has_steal_clock) { + static_key_slow_inc(¶virt_steal_enabled); + if (steal_acc) + static_key_slow_inc(¶virt_steal_rq_enabled); + } + + return 0; +} +arch_initcall(activate_jump_labels); + static void __init vmware_paravirt_ops_setup(void) { pv_info.name = "VMware hypervisor"; pv_ops.cpu.io_delay = paravirt_nop; - if (vmware_tsc_khz && vmw_sched_clock) - vmware_sched_clock_setup(); + if (vmware_tsc_khz == 0) + return; + + vmware_cyc2ns_setup(); + + if (vmw_sched_clock) + pv_ops.time.sched_clock = vmware_sched_clock; + + if (vmware_is_stealclock_available()) { + has_steal_clock = true; + pv_ops.time.steal_clock = vmware_steal_clock; + + /* We use reboot notifier only to disable steal clock */ + register_reboot_notifier(&vmware_pv_reboot_nb); + +#ifdef CONFIG_SMP + smp_ops.smp_prepare_boot_cpu = + vmware_smp_prepare_boot_cpu; + if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "x86/vmware:online", + vmware_cpu_online, + vmware_cpu_down_prepare) < 0) + pr_err("vmware_guest: Failed to install cpu hotplug callbacks\n"); +#else + vmware_guest_cpu_init(); +#endif + } } #else #define vmware_paravirt_ops_setup() do {} while (0) @@ -213,7 +426,7 @@ static void __init vmware_platform_setup(void) vmware_set_capabilities(); } -static u8 vmware_select_hypercall(void) +static u8 __init vmware_select_hypercall(void) { int eax, ebx, ecx, edx; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index a1806598aaa4..32b153d38748 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -120,11 +120,6 @@ static bool xfeature_is_supervisor(int xfeature_nr) return ecx & 1; } -static bool xfeature_is_user(int xfeature_nr) -{ - return !xfeature_is_supervisor(xfeature_nr); -} - /* * When executing XSAVEOPT (or other optimized XSAVE instructions), if * a processor implementation detects that an FPU state component is still @@ -265,21 +260,25 @@ static void __init setup_xstate_features(void) cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); + xstate_sizes[i] = eax; + /* - * If an xfeature is supervisor state, the offset - * in EBX is invalid. We leave it to -1. + * If an xfeature is supervisor state, the offset in EBX is + * invalid, leave it to -1. */ - if (xfeature_is_user(i)) - xstate_offsets[i] = ebx; + if (xfeature_is_supervisor(i)) + continue; + + xstate_offsets[i] = ebx; - xstate_sizes[i] = eax; /* - * In our xstate size checks, we assume that the - * highest-numbered xstate feature has the - * highest offset in the buffer. Ensure it does. + * In our xstate size checks, we assume that the highest-numbered + * xstate feature has the highest offset in the buffer. Ensure + * it does. */ WARN_ONCE(last_good_offset > xstate_offsets[i], - "x86/fpu: misordered xstate at %d\n", last_good_offset); + "x86/fpu: misordered xstate at %d\n", last_good_offset); + last_good_offset = xstate_offsets[i]; } } @@ -326,6 +325,13 @@ static int xfeature_is_aligned(int xfeature_nr) u32 eax, ebx, ecx, edx; CHECK_XFEATURE(xfeature_nr); + + if (!xfeature_enabled(xfeature_nr)) { + WARN_ONCE(1, "Checking alignment of disabled xfeature %d\n", + xfeature_nr); + return 0; + } + cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); /* * The value returned by ECX[1] indicates the alignment @@ -338,11 +344,11 @@ static int xfeature_is_aligned(int xfeature_nr) /* * This function sets up offsets and sizes of all extended states in * xsave area. 
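On 32-bit kernels, `vmware_steal_clock()` above cannot read the hypervisor-updated 64-bit counter atomically, so it reads high/low/high and retries if the high word moved in between. The same torn-read guard in standalone form, with compiler/CPU fences standing in for `virt_rmb()`:

```c
/* The 32-bit read path of vmware_steal_clock() above: retry until the
 * high half is stable, so a 64-bit counter updated concurrently is
 * never observed torn across its two halves. */
#include <stdint.h>
#include <stdio.h>

struct split_clock {
	volatile uint32_t low, high;
};

static uint64_t read_clock(const struct split_clock *c)
{
	uint32_t initial_high, low, high;

	do {
		initial_high = c->high;
		__atomic_thread_fence(__ATOMIC_ACQUIRE); /* order vs. low */
		low = c->low;
		__atomic_thread_fence(__ATOMIC_ACQUIRE); /* order vs. 2nd high */
		high = c->high;
	} while (initial_high != high);	/* high half moved: retry */

	return ((uint64_t)high << 32) | low;
}

int main(void)
{
	struct split_clock c = { .low = 0x89abcdefu, .high = 0x01234567u };

	printf("%#llx\n", (unsigned long long)read_clock(&c));
	return 0;
}
```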
This supports both standard format and compacted format - * of the xsave aread. + * of the xsave area. */ -static void __init setup_xstate_comp(void) +static void __init setup_xstate_comp_offsets(void) { - unsigned int xstate_comp_sizes[XFEATURE_MAX]; + unsigned int next_offset; int i; /* @@ -356,31 +362,23 @@ static void __init setup_xstate_comp(void) if (!boot_cpu_has(X86_FEATURE_XSAVES)) { for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (xfeature_enabled(i)) { + if (xfeature_enabled(i)) xstate_comp_offsets[i] = xstate_offsets[i]; - xstate_comp_sizes[i] = xstate_sizes[i]; - } } return; } - xstate_comp_offsets[FIRST_EXTENDED_XFEATURE] = - FXSAVE_SIZE + XSAVE_HDR_SIZE; + next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (xfeature_enabled(i)) - xstate_comp_sizes[i] = xstate_sizes[i]; - else - xstate_comp_sizes[i] = 0; + if (!xfeature_enabled(i)) + continue; - if (i > FIRST_EXTENDED_XFEATURE) { - xstate_comp_offsets[i] = xstate_comp_offsets[i-1] - + xstate_comp_sizes[i-1]; + if (xfeature_is_aligned(i)) + next_offset = ALIGN(next_offset, 64); - if (xfeature_is_aligned(i)) - xstate_comp_offsets[i] = - ALIGN(xstate_comp_offsets[i], 64); - } + xstate_comp_offsets[i] = next_offset; + next_offset += xstate_sizes[i]; } } @@ -774,7 +772,7 @@ void __init fpu__init_system_xstate(void) fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); - setup_xstate_comp(); + setup_xstate_comp_offsets(); print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", @@ -897,8 +895,6 @@ const void *get_xsave_field_ptr(int xfeature_nr) #ifdef CONFIG_ARCH_HAS_PKEYS -#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2) -#define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1) /* * This will go out and modify PKRU register to set the access * rights for @pkey to @init_val. @@ -917,6 +913,13 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, if (!boot_cpu_has(X86_FEATURE_OSPKE)) return -EINVAL; + /* + * This code should only be called with valid 'pkey' + * values originating from in-kernel users. Complain + * if a bad value is observed. + */ + WARN_ON_ONCE(pkey >= arch_max_pkey()); + /* Set the bits we need in PKRU: */ if (init_val & PKEY_DISABLE_ACCESS) new_pkru_bits |= PKRU_AD_BIT; diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 3923ab4630d7..f66a6b90f954 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -67,11 +67,6 @@ __HEAD SYM_CODE_START(startup_32) movl pa(initial_stack),%ecx - /* test KEEP_SEGMENTS flag to see if the bootloader is asking - us to not reload segments */ - testb $KEEP_SEGMENTS, BP_loadflags(%esi) - jnz 2f - /* * Set segments to known values. 
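The rewritten `setup_xstate_comp_offsets()` above packs enabled xstate features back to back after the 512-byte FXSAVE area and 64-byte XSAVE header, applying 64-byte alignment only to features that request it; disabled features now take no space at all. A standalone model with made-up feature sizes:

```c
/* Userspace model of the compacted-format offset walk above. The
 * feature sizes and alignment flags are illustrative only. */
#include <stdio.h>

#define FXSAVE_SIZE	512
#define XSAVE_HDR_SIZE	64
#define ALIGN64(x)	(((x) + 63) & ~63u)

struct xfeat {
	int enabled, aligned;
	unsigned int size;
};

int main(void)
{
	struct xfeat f[] = {
		{ 1, 0, 200 },	/* enabled, unaligned */
		{ 0, 0, 64  },	/* disabled: consumes no space */
		{ 1, 1, 104 },	/* enabled, wants 64-byte alignment */
	};
	unsigned int next = FXSAVE_SIZE + XSAVE_HDR_SIZE;
	unsigned int i;

	for (i = 0; i < 3; i++) {
		if (!f[i].enabled)
			continue;
		if (f[i].aligned)
			next = ALIGN64(next);	/* 712 rounds up to 768 */
		printf("feature %u at offset %u\n", i, next);
		next += f[i].size;
	}
	return 0;
}
```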
*/ @@ -82,7 +77,6 @@ SYM_CODE_START(startup_32) movl %eax,%fs movl %eax,%gs movl %eax,%ss -2: leal -__PAGE_OFFSET(%ecx),%esp /* diff --git a/arch/x86/kernel/ima_arch.c b/arch/x86/kernel/ima_arch.c index 23054909c8dd..7dfb1e808928 100644 --- a/arch/x86/kernel/ima_arch.c +++ b/arch/x86/kernel/ima_arch.c @@ -17,7 +17,7 @@ static enum efi_secureboot_mode get_sb_mode(void) size = sizeof(secboot); - if (!efi_enabled(EFI_RUNTIME_SERVICES)) { + if (!efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE)) { pr_info("ima: secureboot mode unknown, no efi\n"); return efi_secureboot_mode_unknown; } diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 8abeee0dd7bf..a53e7b4a7419 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -13,6 +13,7 @@ #include <asm/io_bitmap.h> #include <asm/desc.h> +#include <asm/syscalls.h> #ifdef CONFIG_X86_IOPL_IOPERM diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 21efee32e2b1..c7965ff429c5 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -230,7 +230,7 @@ u64 arch_irq_stat(void) * SMP cross-CPU interrupts have their own specific * handlers). */ -__visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) +__visible void __irq_entry do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); struct irq_desc * desc; @@ -263,7 +263,6 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) exiting_irq(); set_irq_regs(old_regs); - return 1; } #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 16919a9671fa..5aa523c2d573 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -44,15 +44,6 @@ * (these are usually mapped into the 0x30-0xff vector range) */ -/* - * IRQ2 is cascade interrupt to second interrupt controller - */ -static struct irqaction irq2 = { - .handler = no_action, - .name = "cascade", - .flags = IRQF_NO_THREAD, -}; - DEFINE_PER_CPU(vector_irq_t, vector_irq) = { [0 ... NR_VECTORS - 1] = VECTOR_UNUSED, }; @@ -84,7 +75,7 @@ void __init init_IRQ(void) * On cpu 0, Assign ISA_IRQ_VECTOR(irq) to IRQ 0..15. * If these IRQ's are handled by legacy interrupt-controllers like PIC, * then this configuration will likely be static after the boot. If - * these IRQ's are handled by more mordern controllers like IO-APIC, + * these IRQs are handled by more modern controllers like IO-APIC, * then this vector space can be freed and re-used dynamically as the * irq's migrate etc. 
*/ @@ -104,6 +95,9 @@ void __init native_init_IRQ(void) idt_setup_apic_and_irq_gates(); lapic_assign_system_vectors(); - if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) - setup_irq(2, &irq2); + if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) { + /* IRQ2 is cascade interrupt to second interrupt controller */ + if (request_irq(2, no_action, IRQF_NO_THREAD, "cascade", NULL)) + pr_err("%s: request_irq() failed\n", "cascade"); + } } diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 9c4498ea0b3c..5ba8477c2cb7 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -58,7 +58,7 @@ __jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type, return code; } -static void inline __jump_label_transform(struct jump_entry *entry, +static inline void __jump_label_transform(struct jump_entry *entry, enum jump_label_type type, int init) { diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index f293d872602a..db6578d45157 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -141,9 +141,8 @@ prepare_add_efi_setup_data(struct boot_params *params, struct setup_data *sd = (void *)params + efi_setup_data_offset; struct efi_setup_data *esd = (void *)sd + sizeof(struct setup_data); - esd->fw_vendor = efi.fw_vendor; - esd->runtime = efi.runtime; - esd->tables = efi.config_table; + esd->fw_vendor = efi_fw_vendor; + esd->tables = efi_config_table; esd->smbios = efi.smbios; sd->type = SETUP_EFI; diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 3f45b5c43a71..ea13f6888284 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -71,6 +71,21 @@ found: return (unsigned long)buf; } +static void synthesize_clac(kprobe_opcode_t *addr) +{ + /* + * Can't be static_cpu_has() due to how objtool treats this feature bit. + * This isn't a fast path anyway. + */ + if (!boot_cpu_has(X86_FEATURE_SMAP)) + return; + + /* Replace the NOP3 with CLAC */ + addr[0] = 0x0f; + addr[1] = 0x01; + addr[2] = 0xca; +} + /* Insert a move instruction which sets a pointer to eax/rdi (1st arg). 
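The `synthesize_clac()` helper above overwrites the optprobe template's 3-byte NOP slot with the CLAC encoding (`0f 01 ca`) when SMAP is present, so a jump-optimized probe cannot run with EFLAGS.AC left set. A standalone model of the byte patching; the slot offset here is illustrative, not the kernel's `TMPL_CLAC_IDX`:

```c
/* Model of the template patching above: copy the template into a
 * buffer, then replace the NOP3 at the CLAC slot when SMAP exists. */
#include <assert.h>
#include <string.h>

#define TMPL_CLAC_IDX 2	/* hypothetical offset of the NOP3 slot */

static void synthesize_clac(unsigned char *addr, int have_smap)
{
	if (!have_smap)
		return;		/* leave the 3-byte NOP in place */
	addr[0] = 0x0f;		/* CLAC: clears EFLAGS.AC so the probe */
	addr[1] = 0x01;		/* never executes with a user-access    */
	addr[2] = 0xca;		/* window accidentally left open        */
}

int main(void)
{
	unsigned char buf[16];

	memset(buf, 0x90, sizeof(buf));	/* stand-in NOP template */
	synthesize_clac(buf + TMPL_CLAC_IDX, 1);
	assert(buf[TMPL_CLAC_IDX] == 0x0f && buf[TMPL_CLAC_IDX + 2] == 0xca);
	return 0;
}
```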
*/ static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val) { @@ -92,6 +107,9 @@ asm ( /* We don't bother saving the ss register */ " pushq %rsp\n" " pushfq\n" + ".global optprobe_template_clac\n" + "optprobe_template_clac:\n" + ASM_NOP3 SAVE_REGS_STRING " movq %rsp, %rsi\n" ".global optprobe_template_val\n" @@ -111,6 +129,9 @@ asm ( #else /* CONFIG_X86_32 */ " pushl %esp\n" " pushfl\n" + ".global optprobe_template_clac\n" + "optprobe_template_clac:\n" + ASM_NOP3 SAVE_REGS_STRING " movl %esp, %edx\n" ".global optprobe_template_val\n" @@ -134,6 +155,8 @@ asm ( void optprobe_template_func(void); STACK_FRAME_NON_STANDARD(optprobe_template_func); +#define TMPL_CLAC_IDX \ + ((long)optprobe_template_clac - (long)optprobe_template_entry) #define TMPL_MOVE_IDX \ ((long)optprobe_template_val - (long)optprobe_template_entry) #define TMPL_CALL_IDX \ @@ -389,6 +412,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, op->optinsn.size = ret; len = TMPL_END_IDX + op->optinsn.size; + synthesize_clac(buf + TMPL_CLAC_IDX); + /* Set probe information */ synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op); diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 904494b924c1..34b18f6eeb2c 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -159,12 +159,19 @@ bool kvm_check_and_clear_guest_paused(void) return ret; } +static int kvm_cs_enable(struct clocksource *cs) +{ + vclocks_set_used(VDSO_CLOCKMODE_PVCLOCK); + return 0; +} + struct clocksource kvm_clock = { .name = "kvm-clock", .read = kvm_clock_get_cycles, .rating = 400, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .enable = kvm_cs_enable, }; EXPORT_SYMBOL_GPL(kvm_clock); @@ -272,7 +279,7 @@ static int __init kvm_setup_vsyscall_timeinfo(void) if (!(flags & PVCLOCK_TSC_STABLE_BIT)) return 0; - kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; + kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; #endif kvmclock_init_mem(); diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index c57e1ca70fd1..84c3ba32f211 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -27,7 +27,6 @@ #include <asm/tlb.h> #include <asm/desc.h> #include <asm/mmu_context.h> -#include <asm/syscalls.h> #include <asm/pgtable_areas.h> /* This is a multiple of PAGE_SIZE. */ diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 54c21d6abd5a..6407ea21fa1b 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -403,9 +403,9 @@ static void default_do_nmi(struct pt_regs *regs) * a 'real' unknown NMI. For example, while processing * a perf NMI another perf NMI comes in along with a * 'real' unknown NMI. These two NMIs get combined into - * one (as descibed above). When the next NMI gets + * one (as described above). When the next NMI gets * processed, it will be flagged by perf as handled, but - * noone will know that there was a 'real' unknown NMI sent + * no one will know that there was a 'real' unknown NMI sent * also. As a result it gets swallowed. 
Or if the first * perf NMI returns two events handled then the second * NMI will get eaten by the logic below, again losing a diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 3053c85e0e42..9da70b279dad 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -28,7 +28,6 @@ #include <linux/hw_breakpoint.h> #include <asm/cpu.h> #include <asm/apic.h> -#include <asm/syscalls.h> #include <linux/uaccess.h> #include <asm/mwait.h> #include <asm/fpu/internal.h> @@ -650,6 +649,9 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) /* Enforce MSR update to ensure consistent state */ __speculation_ctrl_update(~tifn, tifn); } + + if ((tifp ^ tifn) & _TIF_SLD) + switch_to_sld(tifn); } /* diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 5052ced43373..954b013cc585 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -49,7 +49,6 @@ #include <asm/tlbflush.h> #include <asm/cpu.h> -#include <asm/syscalls.h> #include <asm/debugreg.h> #include <asm/switch_to.h> #include <asm/vm86.h> diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ffd497804dbc..5ef9d8f25b0e 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -48,7 +48,6 @@ #include <asm/desc.h> #include <asm/proto.h> #include <asm/ia32.h> -#include <asm/syscalls.h> #include <asm/debugreg.h> #include <asm/switch_to.h> #include <asm/xen/hypervisor.h> diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 10125358b9c4..11065dc03f5b 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -145,7 +145,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti) { - WARN_ON(vclock_was_used(VCLOCK_PVCLOCK)); + WARN_ON(vclock_was_used(VDSO_CLOCKMODE_PVCLOCK)); pvti_cpu0_va = pvti; } diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 0cc7c0b106bb..3ca43be4f9cf 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -531,7 +531,7 @@ static void emergency_vmx_disable_all(void) /* * We need to disable VMX on all CPUs before rebooting, otherwise - * we risk hanging up the machine, because the CPU ignore INIT + * we risk hanging up the machine, because the CPU ignores INIT * signals when VMX is enabled. 
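`__switch_to_xtra()` above tests `(tifp ^ tifn) & _TIF_SLD` so the split-lock MSR is rewritten only when the flag actually differs between the outgoing and incoming task. The XOR transition test in isolation (the bit position below is illustrative):

```c
/* The flag-transition pattern used by __switch_to_xtra() above: XOR of
 * the two tasks' flag words yields exactly the bits that changed. */
#include <assert.h>

#define TIF_SLD_BIT	5	/* illustrative bit position */
#define _TIF_SLD	(1UL << TIF_SLD_BIT)

static int sld_needs_update(unsigned long tifp, unsigned long tifn)
{
	return (tifp ^ tifn) & _TIF_SLD;	/* differs between tasks? */
}

int main(void)
{
	assert(!sld_needs_update(_TIF_SLD, _TIF_SLD));	/* no change: skip MSR write */
	assert(sld_needs_update(0, _TIF_SLD));		/* transition: update MSR */
	return 0;
}
```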
* * We can't take any locks and we may be on an inconsistent diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index ef3ba99068d3..a4d9a261425b 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -9,6 +9,8 @@ #include <asm/kexec.h> #include <asm/processor-flags.h> #include <asm/pgtable_types.h> +#include <asm/nospec-branch.h> +#include <asm/unwind_hints.h> /* * Must be relocatable PIC code callable as a C function @@ -39,6 +41,7 @@ .align PAGE_SIZE .code64 SYM_CODE_START_NOALIGN(relocate_kernel) + UNWIND_HINT_EMPTY /* * %rdi indirection_page * %rsi page_list @@ -105,6 +108,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel) SYM_CODE_END(relocate_kernel) SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) + UNWIND_HINT_EMPTY /* set return address to 0 if not preserving context */ pushq $0 /* store the start address on the stack */ @@ -192,14 +196,12 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) 1: popq %rdx leaq PAGE_SIZE(%r10), %rsp + ANNOTATE_RETPOLINE_SAFE call *%rdx /* get the re-entry point of the peer system */ movq 0(%rsp), %rbp - call 1f -1: - popq %r8 - subq $(1b - relocate_kernel), %r8 + leaq relocate_kernel(%rip), %r8 movq CP_PA_SWAP_PAGE(%r8), %r10 movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi movq CP_PA_TABLE_PAGE(%r8), %rax @@ -212,6 +214,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) SYM_CODE_END(identity_mapped) SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) + UNWIND_HINT_EMPTY movq RSP(%r8), %rsp movq CR4(%r8), %rax movq %rax, %cr4 @@ -233,6 +236,7 @@ SYM_CODE_END(virtual_mapped) /* Do the copies */ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) + UNWIND_HINT_EMPTY movq %rdi, %rcx /* Put the page_list in %rcx */ xorl %edi, %edi xorl %esi, %esi diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index a74262c71484..e6b545047f38 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -64,7 +64,6 @@ RESERVE_BRK(dmi_alloc, 65536); * at link time, with RESERVE_BRK*() facility reserving additional * chunks. */ -static __initdata unsigned long _brk_start = (unsigned long)__brk_base; unsigned long _brk_end = (unsigned long)__brk_base; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 8a29573851a3..83b74fb38c8f 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -42,29 +42,9 @@ #endif /* CONFIG_X86_64 */ #include <asm/syscall.h> -#include <asm/syscalls.h> - #include <asm/sigframe.h> #include <asm/signal.h> -#define COPY(x) do { \ - get_user_ex(regs->x, &sc->x); \ -} while (0) - -#define GET_SEG(seg) ({ \ - unsigned short tmp; \ - get_user_ex(tmp, &sc->seg); \ - tmp; \ -}) - -#define COPY_SEG(seg) do { \ - regs->seg = GET_SEG(seg); \ -} while (0) - -#define COPY_SEG_CPL3(seg) do { \ - regs->seg = GET_SEG(seg) | 3; \ -} while (0) - #ifdef CONFIG_X86_64 /* * If regs->ss will cause an IRET fault, change it. 
Otherwise leave it @@ -92,53 +72,58 @@ static void force_valid_ss(struct pt_regs *regs) ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN)) regs->ss = __USER_DS; } +# define CONTEXT_COPY_SIZE offsetof(struct sigcontext, reserved1) +#else +# define CONTEXT_COPY_SIZE sizeof(struct sigcontext) #endif static int restore_sigcontext(struct pt_regs *regs, - struct sigcontext __user *sc, + struct sigcontext __user *usc, unsigned long uc_flags) { - unsigned long buf_val; - void __user *buf; - unsigned int tmpflags; - unsigned int err = 0; + struct sigcontext sc; /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; - get_user_try { + if (copy_from_user(&sc, usc, CONTEXT_COPY_SIZE)) + return -EFAULT; #ifdef CONFIG_X86_32 - set_user_gs(regs, GET_SEG(gs)); - COPY_SEG(fs); - COPY_SEG(es); - COPY_SEG(ds); + set_user_gs(regs, sc.gs); + regs->fs = sc.fs; + regs->es = sc.es; + regs->ds = sc.ds; #endif /* CONFIG_X86_32 */ - COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); - COPY(dx); COPY(cx); COPY(ip); COPY(ax); + regs->bx = sc.bx; + regs->cx = sc.cx; + regs->dx = sc.dx; + regs->si = sc.si; + regs->di = sc.di; + regs->bp = sc.bp; + regs->ax = sc.ax; + regs->sp = sc.sp; + regs->ip = sc.ip; #ifdef CONFIG_X86_64 - COPY(r8); - COPY(r9); - COPY(r10); - COPY(r11); - COPY(r12); - COPY(r13); - COPY(r14); - COPY(r15); + regs->r8 = sc.r8; + regs->r9 = sc.r9; + regs->r10 = sc.r10; + regs->r11 = sc.r11; + regs->r12 = sc.r12; + regs->r13 = sc.r13; + regs->r14 = sc.r14; + regs->r15 = sc.r15; #endif /* CONFIG_X86_64 */ - COPY_SEG_CPL3(cs); - COPY_SEG_CPL3(ss); - - get_user_ex(tmpflags, &sc->flags); - regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); - regs->orig_ax = -1; /* disable syscall checks */ + /* Get CS/SS and force CPL3 */ + regs->cs = sc.cs | 0x03; + regs->ss = sc.ss | 0x03; - get_user_ex(buf_val, &sc->fpstate); - buf = (void __user *)buf_val; - } get_user_catch(err); + regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS); + /* disable syscall checks */ + regs->orig_ax = -1; #ifdef CONFIG_X86_64 /* @@ -149,70 +134,78 @@ static int restore_sigcontext(struct pt_regs *regs, force_valid_ss(regs); #endif - err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32)); - - return err; + return fpu__restore_sig((void __user *)sc.fpstate, + IS_ENABLED(CONFIG_X86_32)); } -int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, +static __always_inline int +__unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask) { - int err = 0; - - put_user_try { - #ifdef CONFIG_X86_32 - put_user_ex(get_user_gs(regs), (unsigned int __user *)&sc->gs); - put_user_ex(regs->fs, (unsigned int __user *)&sc->fs); - put_user_ex(regs->es, (unsigned int __user *)&sc->es); - put_user_ex(regs->ds, (unsigned int __user *)&sc->ds); + unsafe_put_user(get_user_gs(regs), + (unsigned int __user *)&sc->gs, Efault); + unsafe_put_user(regs->fs, (unsigned int __user *)&sc->fs, Efault); + unsafe_put_user(regs->es, (unsigned int __user *)&sc->es, Efault); + unsafe_put_user(regs->ds, (unsigned int __user *)&sc->ds, Efault); #endif /* CONFIG_X86_32 */ - put_user_ex(regs->di, &sc->di); - put_user_ex(regs->si, &sc->si); - put_user_ex(regs->bp, &sc->bp); - put_user_ex(regs->sp, &sc->sp); - put_user_ex(regs->bx, &sc->bx); - put_user_ex(regs->dx, &sc->dx); - put_user_ex(regs->cx, &sc->cx); - put_user_ex(regs->ax, &sc->ax); + unsafe_put_user(regs->di, &sc->di, Efault); + 
unsafe_put_user(regs->si, &sc->si, Efault); + unsafe_put_user(regs->bp, &sc->bp, Efault); + unsafe_put_user(regs->sp, &sc->sp, Efault); + unsafe_put_user(regs->bx, &sc->bx, Efault); + unsafe_put_user(regs->dx, &sc->dx, Efault); + unsafe_put_user(regs->cx, &sc->cx, Efault); + unsafe_put_user(regs->ax, &sc->ax, Efault); #ifdef CONFIG_X86_64 - put_user_ex(regs->r8, &sc->r8); - put_user_ex(regs->r9, &sc->r9); - put_user_ex(regs->r10, &sc->r10); - put_user_ex(regs->r11, &sc->r11); - put_user_ex(regs->r12, &sc->r12); - put_user_ex(regs->r13, &sc->r13); - put_user_ex(regs->r14, &sc->r14); - put_user_ex(regs->r15, &sc->r15); + unsafe_put_user(regs->r8, &sc->r8, Efault); + unsafe_put_user(regs->r9, &sc->r9, Efault); + unsafe_put_user(regs->r10, &sc->r10, Efault); + unsafe_put_user(regs->r11, &sc->r11, Efault); + unsafe_put_user(regs->r12, &sc->r12, Efault); + unsafe_put_user(regs->r13, &sc->r13, Efault); + unsafe_put_user(regs->r14, &sc->r14, Efault); + unsafe_put_user(regs->r15, &sc->r15, Efault); #endif /* CONFIG_X86_64 */ - put_user_ex(current->thread.trap_nr, &sc->trapno); - put_user_ex(current->thread.error_code, &sc->err); - put_user_ex(regs->ip, &sc->ip); + unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault); + unsafe_put_user(current->thread.error_code, &sc->err, Efault); + unsafe_put_user(regs->ip, &sc->ip, Efault); #ifdef CONFIG_X86_32 - put_user_ex(regs->cs, (unsigned int __user *)&sc->cs); - put_user_ex(regs->flags, &sc->flags); - put_user_ex(regs->sp, &sc->sp_at_signal); - put_user_ex(regs->ss, (unsigned int __user *)&sc->ss); + unsafe_put_user(regs->cs, (unsigned int __user *)&sc->cs, Efault); + unsafe_put_user(regs->flags, &sc->flags, Efault); + unsafe_put_user(regs->sp, &sc->sp_at_signal, Efault); + unsafe_put_user(regs->ss, (unsigned int __user *)&sc->ss, Efault); #else /* !CONFIG_X86_32 */ - put_user_ex(regs->flags, &sc->flags); - put_user_ex(regs->cs, &sc->cs); - put_user_ex(0, &sc->gs); - put_user_ex(0, &sc->fs); - put_user_ex(regs->ss, &sc->ss); + unsafe_put_user(regs->flags, &sc->flags, Efault); + unsafe_put_user(regs->cs, &sc->cs, Efault); + unsafe_put_user(0, &sc->gs, Efault); + unsafe_put_user(0, &sc->fs, Efault); + unsafe_put_user(regs->ss, &sc->ss, Efault); #endif /* CONFIG_X86_32 */ - put_user_ex(fpstate, (unsigned long __user *)&sc->fpstate); - - /* non-iBCS2 extensions.. */ - put_user_ex(mask, &sc->oldmask); - put_user_ex(current->thread.cr2, &sc->cr2); - } put_user_catch(err); + unsafe_put_user(fpstate, (unsigned long __user *)&sc->fpstate, Efault); - return err; + /* non-iBCS2 extensions.. */ + unsafe_put_user(mask, &sc->oldmask, Efault); + unsafe_put_user(current->thread.cr2, &sc->cr2, Efault); + return 0; +Efault: + return -EFAULT; } +#define unsafe_put_sigcontext(sc, fp, regs, set, label) \ +do { \ + if (__unsafe_setup_sigcontext(sc, fp, regs, set->sig[0])) \ + goto label; \ +} while(0); + +#define unsafe_put_sigmask(set, frame, label) \ + unsafe_put_user(*(__u64 *)(set), \ + (__u64 __user *)&(frame)->uc.uc_sigmask, \ + label) + /* * Set up a signal frame. 
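The signal code above is being converted from `put_user_try`/`put_user_ex` to the `user_access_begin()`/`unsafe_put_user()` pattern: a single access check opens the user window (STAC on x86), every store shares one `Efault` label, and both exit paths must close the window with `user_access_end()` (CLAC). A kernel-style sketch with a made-up two-field frame:

```c
/* Kernel-style sketch of the unsafe_put_user() pattern above; the
 * frame layout is hypothetical, not the real sigframe. */
#include <linux/errno.h>
#include <linux/uaccess.h>

struct frame {
	int sig;
	long mask;
};

static int fill_frame(struct frame __user *frame, int sig, long mask)
{
	if (!user_access_begin(frame, sizeof(*frame)))
		return -EFAULT;

	unsafe_put_user(sig, &frame->sig, Efault);
	unsafe_put_user(mask, &frame->mask, Efault);

	user_access_end();
	return 0;

Efault:
	user_access_end();	/* must close the window on failure, too */
	return -EFAULT;
}
```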
*/ @@ -312,26 +305,16 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, { struct sigframe __user *frame; void __user *restorer; - int err = 0; - void __user *fpstate = NULL; - - frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate); - - if (!access_ok(frame, sizeof(*frame))) - return -EFAULT; + void __user *fp = NULL; - if (__put_user(sig, &frame->sig)) - return -EFAULT; + frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); - if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0])) + if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; - if (_NSIG_WORDS > 1) { - if (__copy_to_user(&frame->extramask, &set->sig[1], - sizeof(frame->extramask))) - return -EFAULT; - } - + unsafe_put_user(sig, &frame->sig, Efault); + unsafe_put_sigcontext(&frame->sc, fp, regs, set, Efault); + unsafe_put_user(set->sig[1], &frame->extramask[0], Efault); if (current->mm->context.vdso) restorer = current->mm->context.vdso + vdso_image_32.sym___kernel_sigreturn; @@ -341,7 +324,7 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, restorer = ksig->ka.sa.sa_restorer; /* Set up to return from userspace. */ - err |= __put_user(restorer, &frame->pretcode); + unsafe_put_user(restorer, &frame->pretcode, Efault); /* * This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80 @@ -350,10 +333,8 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, * reasons and because gdb uses it as a signature to notice * signal handler stack frames. */ - err |= __put_user(*((u64 *)&retcode), (u64 *)frame->retcode); - - if (err) - return -EFAULT; + unsafe_put_user(*((u64 *)&retcode), (u64 *)frame->retcode, Efault); + user_access_end(); /* Set up registers for signal handler */ regs->sp = (unsigned long)frame; @@ -368,6 +349,10 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, regs->cs = __USER_CS; return 0; + +Efault: + user_access_end(); + return -EFAULT; } static int __setup_rt_frame(int sig, struct ksignal *ksig, @@ -375,50 +360,45 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, { struct rt_sigframe __user *frame; void __user *restorer; - int err = 0; - void __user *fpstate = NULL; + void __user *fp = NULL; - frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate); + frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); - if (!access_ok(frame, sizeof(*frame))) + if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; - put_user_try { - put_user_ex(sig, &frame->sig); - put_user_ex(&frame->info, &frame->pinfo); - put_user_ex(&frame->uc, &frame->puc); + unsafe_put_user(sig, &frame->sig, Efault); + unsafe_put_user(&frame->info, &frame->pinfo, Efault); + unsafe_put_user(&frame->uc, &frame->puc, Efault); - /* Create the ucontext. */ - if (static_cpu_has(X86_FEATURE_XSAVE)) - put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); - else - put_user_ex(0, &frame->uc.uc_flags); - put_user_ex(0, &frame->uc.uc_link); - save_altstack_ex(&frame->uc.uc_stack, regs->sp); + /* Create the ucontext. */ + if (static_cpu_has(X86_FEATURE_XSAVE)) + unsafe_put_user(UC_FP_XSTATE, &frame->uc.uc_flags, Efault); + else + unsafe_put_user(0, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); - /* Set up to return from userspace. */ - restorer = current->mm->context.vdso + - vdso_image_32.sym___kernel_rt_sigreturn; - if (ksig->ka.sa.sa_flags & SA_RESTORER) - restorer = ksig->ka.sa.sa_restorer; - put_user_ex(restorer, &frame->pretcode); + /* Set up to return from userspace. 
*/ + restorer = current->mm->context.vdso + + vdso_image_32.sym___kernel_rt_sigreturn; + if (ksig->ka.sa.sa_flags & SA_RESTORER) + restorer = ksig->ka.sa.sa_restorer; + unsafe_put_user(restorer, &frame->pretcode, Efault); - /* - * This is movl $__NR_rt_sigreturn, %ax ; int $0x80 - * - * WE DO NOT USE IT ANY MORE! It's only left here for historical - * reasons and because gdb uses it as a signature to notice - * signal handler stack frames. - */ - put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode); - } put_user_catch(err); + /* + * This is movl $__NR_rt_sigreturn, %ax ; int $0x80 + * + * WE DO NOT USE IT ANY MORE! It's only left here for historical + * reasons and because gdb uses it as a signature to notice + * signal handler stack frames. + */ + unsafe_put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode, Efault); + unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_sigmask(set, frame, Efault); + user_access_end(); - err |= copy_siginfo_to_user(&frame->info, &ksig->info); - err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, - regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - - if (err) + if (copy_siginfo_to_user(&frame->info, &ksig->info)) return -EFAULT; /* Set up registers for signal handler */ @@ -434,6 +414,9 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, regs->cs = __USER_CS; return 0; +Efault: + user_access_end(); + return -EFAULT; } #else /* !CONFIG_X86_32 */ static unsigned long frame_uc_flags(struct pt_regs *regs) @@ -457,43 +440,34 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, struct rt_sigframe __user *frame; void __user *fp = NULL; unsigned long uc_flags; - int err = 0; + + /* x86-64 should always use SA_RESTORER. */ + if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) + return -EFAULT; frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp); + uc_flags = frame_uc_flags(regs); - if (!access_ok(frame, sizeof(*frame))) + if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; + /* Create the ucontext. */ + unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); + + /* Set up to return from userspace. If provided, use a stub + already in userspace. */ + unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault); + unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_sigmask(set, frame, Efault); + user_access_end(); + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { if (copy_siginfo_to_user(&frame->info, &ksig->info)) return -EFAULT; } - uc_flags = frame_uc_flags(regs); - - put_user_try { - /* Create the ucontext. */ - put_user_ex(uc_flags, &frame->uc.uc_flags); - put_user_ex(0, &frame->uc.uc_link); - save_altstack_ex(&frame->uc.uc_stack, regs->sp); - - /* Set up to return from userspace. If provided, use a stub - already in userspace. */ - /* x86-64 should always use SA_RESTORER. 
*/ - if (ksig->ka.sa.sa_flags & SA_RESTORER) { - put_user_ex(ksig->ka.sa.sa_restorer, &frame->pretcode); - } else { - /* could use a vstub here */ - err |= -EFAULT; - } - } put_user_catch(err); - - err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - - if (err) - return -EFAULT; - /* Set up registers for signal handler */ regs->di = sig; /* In case the signal handler was declared without prototypes */ @@ -530,6 +504,10 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, force_valid_ss(regs); return 0; + +Efault: + user_access_end(); + return -EFAULT; } #endif /* CONFIG_X86_32 */ @@ -541,44 +519,33 @@ static int x32_setup_rt_frame(struct ksignal *ksig, struct rt_sigframe_x32 __user *frame; unsigned long uc_flags; void __user *restorer; - int err = 0; - void __user *fpstate = NULL; - - frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate); + void __user *fp = NULL; - if (!access_ok(frame, sizeof(*frame))) + if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) return -EFAULT; - if (ksig->ka.sa.sa_flags & SA_SIGINFO) { - if (__copy_siginfo_to_user32(&frame->info, &ksig->info, true)) - return -EFAULT; - } + frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); uc_flags = frame_uc_flags(regs); - put_user_try { - /* Create the ucontext. */ - put_user_ex(uc_flags, &frame->uc.uc_flags); - put_user_ex(0, &frame->uc.uc_link); - compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); - put_user_ex(0, &frame->uc.uc__pad0); - - if (ksig->ka.sa.sa_flags & SA_RESTORER) { - restorer = ksig->ka.sa.sa_restorer; - } else { - /* could use a vstub here */ - restorer = NULL; - err |= -EFAULT; - } - put_user_ex(restorer, (unsigned long __user *)&frame->pretcode); - } put_user_catch(err); + if (!user_access_begin(frame, sizeof(*frame))) + return -EFAULT; - err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, - regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + /* Create the ucontext. 
*/ + unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); + unsafe_put_user(0, &frame->uc.uc__pad0, Efault); + restorer = ksig->ka.sa.sa_restorer; + unsafe_put_user(restorer, (unsigned long __user *)&frame->pretcode, Efault); + unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_sigmask(set, frame, Efault); + user_access_end(); - if (err) - return -EFAULT; + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { + if (__copy_siginfo_to_user32(&frame->info, &ksig->info, true)) + return -EFAULT; + } /* Set up registers for signal handler */ regs->sp = (unsigned long) frame; @@ -597,6 +564,11 @@ static int x32_setup_rt_frame(struct ksignal *ksig, #endif /* CONFIG_X86_X32_ABI */ return 0; +#ifdef CONFIG_X86_X32_ABI +Efault: + user_access_end(); + return -EFAULT; +#endif } /* @@ -613,9 +585,8 @@ SYSCALL_DEFINE0(sigreturn) if (!access_ok(frame, sizeof(*frame))) goto badframe; - if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1 - && __copy_from_user(&set.sig[1], &frame->extramask, - sizeof(frame->extramask)))) + if (__get_user(set.sig[0], &frame->sc.oldmask) || + __get_user(set.sig[1], &frame->extramask[0])) goto badframe; set_current_blocked(&set); @@ -645,7 +616,7 @@ SYSCALL_DEFINE0(rt_sigreturn) frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); if (!access_ok(frame, sizeof(*frame))) goto badframe; - if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) + if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask)) goto badframe; if (__get_user(uc_flags, &frame->uc.uc_flags)) goto badframe; @@ -859,7 +830,7 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) } #ifdef CONFIG_X86_X32_ABI -asmlinkage long sys32_x32_rt_sigreturn(void) +COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe_x32 __user *frame; @@ -870,7 +841,7 @@ asmlinkage long sys32_x32_rt_sigreturn(void) if (!access_ok(frame, sizeof(*frame))) goto badframe; - if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) + if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask)) goto badframe; if (__get_user(uc_flags, &frame->uc.uc_flags)) goto badframe; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 69881b2d446c..fe3ab9632f3b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -147,6 +147,8 @@ static inline void smpboot_restore_warm_reset_vector(void) *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; } +static void init_freq_invariance(void); + /* * Report back to the Boot Processor during boot time or to the caller processor * during CPU online. @@ -183,6 +185,8 @@ static void smp_callin(void) */ set_cpu_sibling_map(raw_smp_processor_id()); + init_freq_invariance(); + /* * Get our bogomips. * Update loops_per_jiffy in cpu_data. 
Previous call to @@ -466,7 +470,7 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) */ static const struct x86_cpu_id snc_cpu[] = { - { X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_X }, + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, NULL), {} }; @@ -1337,7 +1341,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) set_sched_topology(x86_topology); set_cpu_sibling_map(0); - + init_freq_invariance(); smp_sanity_check(); switch (apic_intr_mode) { @@ -1434,7 +1438,7 @@ early_param("possible_cpus", _setup_possible_cpus); /* * cpu_possible_mask should be static, it cannot change as cpu's * are onlined, or offlined. The reason is per-cpu data-structures - * are allocated by some modules at init time, and dont expect to + * are allocated by some modules at init time, and don't expect to * do this dynamically on cpu arrival/departure. * cpu_present_mask on the other hand can change dynamically. * In case when cpu_hotplug is not compiled, then we resort to current @@ -1764,3 +1768,287 @@ void native_play_dead(void) } #endif + +/* + * APERF/MPERF frequency ratio computation. + * + * The scheduler wants to do frequency invariant accounting and needs a <1 + * ratio to account for the 'current' frequency, corresponding to + * freq_curr / freq_max. + * + * Since the frequency freq_curr on x86 is controlled by a micro-controller and + * our P-state setting is little more than a request/hint, we need to observe + * the effective frequency 'BusyMHz', i.e. the average frequency over a time + * interval after discarding idle time. This is given by: + * + * BusyMHz = delta_APERF / delta_MPERF * freq_base + * + * where freq_base is the max non-turbo P-state. + * + * The freq_max term has to be set to a somewhat arbitrary value, because we + * can't know which turbo states will be available at a given point in time: + * it all depends on the thermal headroom of the entire package. We set it to + * the turbo level with 4 cores active. + * + * Benchmarks show that's a good compromise between the 1C turbo ratio + * (freq_curr/freq_max would rarely reach 1) and something close to freq_base, + * which would ignore the entire turbo range (a conspicuous part, making + * freq_curr/freq_max always maxed out). + * + * An exception to the heuristic above is the Atom uarch, where we choose the + * highest turbo level for freq_max since Atoms are generally oriented towards + * power efficiency. + * + * Setting freq_max to anything less than the 1C turbo ratio makes the ratio + * freq_curr / freq_max eventually grow >1, in which case we clip it to 1. + */ + +DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key); + +static DEFINE_PER_CPU(u64, arch_prev_aperf); +static DEFINE_PER_CPU(u64, arch_prev_mperf); +static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE; +static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE; + +void arch_set_max_freq_ratio(bool turbo_disabled) +{ + arch_max_freq_ratio = turbo_disabled ? 
SCHED_CAPACITY_SCALE : + arch_turbo_freq_ratio; +} + +static bool turbo_disabled(void) +{ + u64 misc_en; + int err; + + err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en); + if (err) + return false; + + return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); +} + +static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) +{ + int err; + + err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq); + if (err) + return false; + + err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */ + *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */ + + return true; +} + +#include <asm/cpu_device_id.h> +#include <asm/intel-family.h> + +#define ICPU(model) \ + {X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF, 0} + +static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = { + ICPU(INTEL_FAM6_XEON_PHI_KNL), + ICPU(INTEL_FAM6_XEON_PHI_KNM), + {} +}; + +static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = { + ICPU(INTEL_FAM6_SKYLAKE_X), + {} +}; + +static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = { + ICPU(INTEL_FAM6_ATOM_GOLDMONT), + ICPU(INTEL_FAM6_ATOM_GOLDMONT_D), + ICPU(INTEL_FAM6_ATOM_GOLDMONT_PLUS), + {} +}; + +static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, + int num_delta_fratio) +{ + int fratio, delta_fratio, found; + int err, i; + u64 msr; + + if (!x86_match_cpu(has_knl_turbo_ratio_limits)) + return false; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); + if (err) + return false; + + fratio = (msr >> 8) & 0xFF; + i = 16; + found = 0; + do { + if (found >= num_delta_fratio) { + *turbo_freq = fratio; + return true; + } + + delta_fratio = (msr >> (i + 5)) & 0x7; + + if (delta_fratio) { + found += 1; + fratio -= delta_fratio; + } + + i += 8; + } while (i < 64); + + return true; +} + +static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) +{ + u64 ratios, counts; + u32 group_size; + int err, i; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios); + if (err) + return false; + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts); + if (err) + return false; + + for (i = 0; i < 64; i += 8) { + group_size = (counts >> i) & 0xFF; + if (group_size >= size) { + *turbo_freq = (ratios >> i) & 0xFF; + return true; + } + } + + return false; +} + +static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) +{ + int err; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, turbo_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + *turbo_freq = (*turbo_freq >> 24) & 0xFF; /* 4C turbo */ + + return true; +} + +static bool intel_set_max_freq_ratio(void) +{ + u64 base_freq, turbo_freq; + + if (slv_set_max_freq_ratio(&base_freq, &turbo_freq)) + goto out; + + if (x86_match_cpu(has_glm_turbo_ratio_limits) && + skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) + goto out; + + if (knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) + goto out; + + if (x86_match_cpu(has_skx_turbo_ratio_limits) && + skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4)) + goto out; + + if (core_set_max_freq_ratio(&base_freq, &turbo_freq)) + goto out; + + return false; + +out: + 
arch_turbo_freq_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, + base_freq); + arch_set_max_freq_ratio(turbo_disabled()); + return true; +} + +static void init_counter_refs(void *arg) +{ + u64 aperf, mperf; + + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + + this_cpu_write(arch_prev_aperf, aperf); + this_cpu_write(arch_prev_mperf, mperf); +} + +static void init_freq_invariance(void) +{ + bool ret = false; + + if (smp_processor_id() != 0 || !boot_cpu_has(X86_FEATURE_APERFMPERF)) + return; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + ret = intel_set_max_freq_ratio(); + + if (ret) { + on_each_cpu(init_counter_refs, NULL, 1); + static_branch_enable(&arch_scale_freq_key); + } else { + pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n"); + } +} + +DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; + +void arch_scale_freq_tick(void) +{ + u64 freq_scale; + u64 aperf, mperf; + u64 acnt, mcnt; + + if (!arch_scale_freq_invariant()) + return; + + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + + acnt = aperf - this_cpu_read(arch_prev_aperf); + mcnt = mperf - this_cpu_read(arch_prev_mperf); + if (!mcnt) + return; + + this_cpu_write(arch_prev_aperf, aperf); + this_cpu_write(arch_prev_mperf, mperf); + + acnt <<= 2*SCHED_CAPACITY_SHIFT; + mcnt *= arch_max_freq_ratio; + + freq_scale = div64_u64(acnt, mcnt); + + if (freq_scale > SCHED_CAPACITY_SCALE) + freq_scale = SCHED_CAPACITY_SCALE; + + this_cpu_write(arch_freq_scale, freq_scale); +} diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 2d6898c2cb64..6ad43fc44556 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -96,7 +96,8 @@ struct stack_frame_user { }; static int -copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) +copy_stack_frame(const struct stack_frame_user __user *fp, + struct stack_frame_user *frame) { int ret; @@ -105,7 +106,8 @@ copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) ret = 1; pagefault_disable(); - if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) + if (__get_user(frame->next_fp, &fp->next_fp) || + __get_user(frame->ret_addr, &fp->ret_addr)) ret = 0; pagefault_enable(); diff --git a/arch/x86/kernel/sys_ia32.c b/arch/x86/kernel/sys_ia32.c new file mode 100644 index 000000000000..ab03fede1422 --- /dev/null +++ b/arch/x86/kernel/sys_ia32.c @@ -0,0 +1,255 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * sys_ia32.c: Conversion between 32bit and 64bit native syscalls. Based on + * sys_sparc32 + * + * Copyright (C) 2000 VA Linux Co + * Copyright (C) 2000 Don Dugger <n0ano@valinux.com> + * Copyright (C) 1999 Arun Sharma <arun.sharma@intel.com> + * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) + * Copyright (C) 2000 Hewlett-Packard Co. + * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com> + * Copyright (C) 2000,2001,2002 Andi Kleen, SuSE Labs (x86-64 port) + * + * These routines maintain argument size conversion between 32bit and 64bit + * environment. In 2.5 most of this should be moved to a generic directory. + * + * This file assumes that there is a hole at the end of user address space. + * + * Some of the functions are LE specific currently. These are + * hopefully all marked. This should be fixed. 
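Returning to the frequency-invariance code added in smpboot.c above: the fixed-point math in arch_scale_freq_tick() is easiest to check with concrete numbers. A standalone sketch with assumed values (base ratio 20, i.e. 2.0 GHz; 4C turbo ratio 30, i.e. 3.0 GHz; made-up APERF/MPERF deltas) — everything here is illustrative, not kernel code:

#include <stdint.h>
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10
#define SCHED_CAPACITY_SCALE	(1ULL << SCHED_CAPACITY_SHIFT)

int main(void)
{
	/* Assumed platform: base ratio 20 (2.0 GHz), 4C turbo ratio 30 (3.0 GHz). */
	uint64_t base_freq = 20, turbo_freq = 30;
	/* Matches arch_turbo_freq_ratio = div_u64(turbo * SCALE, base): 1536. */
	uint64_t max_freq_ratio = turbo_freq * SCHED_CAPACITY_SCALE / base_freq;

	/* Assumed tick deltas: APERF ran 1.2x MPERF, i.e. BusyMHz = 2400. */
	uint64_t acnt = 120000, mcnt = 100000;

	/* The same shift/multiply arch_scale_freq_tick() performs. */
	acnt <<= 2 * SCHED_CAPACITY_SHIFT;
	mcnt *= max_freq_ratio;

	uint64_t freq_scale = acnt / mcnt;	/* 819 == 2400/3000 * 1024 */
	if (freq_scale > SCHED_CAPACITY_SCALE)
		freq_scale = SCHED_CAPACITY_SCALE;

	printf("freq_scale = %llu\n", (unsigned long long)freq_scale);
	return 0;
}

That is, a CPU observed busy at 2.4 GHz against a 3.0 GHz freq_max reports capacity 819/1024, and the final clip to SCHED_CAPACITY_SCALE covers the freq_curr/freq_max > 1 case the comment block describes.
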
+ */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/signal.h> +#include <linux/syscalls.h> +#include <linux/times.h> +#include <linux/utsname.h> +#include <linux/mm.h> +#include <linux/uio.h> +#include <linux/poll.h> +#include <linux/personality.h> +#include <linux/stat.h> +#include <linux/rwsem.h> +#include <linux/compat.h> +#include <linux/vfs.h> +#include <linux/ptrace.h> +#include <linux/highuid.h> +#include <linux/sysctl.h> +#include <linux/slab.h> +#include <linux/sched/task.h> +#include <asm/mman.h> +#include <asm/types.h> +#include <linux/uaccess.h> +#include <linux/atomic.h> +#include <asm/vgtod.h> +#include <asm/ia32.h> + +#define AA(__x) ((unsigned long)(__x)) + +SYSCALL_DEFINE3(ia32_truncate64, const char __user *, filename, + unsigned long, offset_low, unsigned long, offset_high) +{ + return ksys_truncate(filename, + ((loff_t) offset_high << 32) | offset_low); +} + +SYSCALL_DEFINE3(ia32_ftruncate64, unsigned int, fd, + unsigned long, offset_low, unsigned long, offset_high) +{ + return ksys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low); +} + +/* warning: next two assume little endian */ +SYSCALL_DEFINE5(ia32_pread64, unsigned int, fd, char __user *, ubuf, + u32, count, u32, poslo, u32, poshi) +{ + return ksys_pread64(fd, ubuf, count, + ((loff_t)AA(poshi) << 32) | AA(poslo)); +} + +SYSCALL_DEFINE5(ia32_pwrite64, unsigned int, fd, const char __user *, ubuf, + u32, count, u32, poslo, u32, poshi) +{ + return ksys_pwrite64(fd, ubuf, count, + ((loff_t)AA(poshi) << 32) | AA(poslo)); +} + + +/* + * Some system calls that need sign extended arguments. This could be + * done by a generic wrapper. + */ +SYSCALL_DEFINE6(ia32_fadvise64_64, int, fd, __u32, offset_low, + __u32, offset_high, __u32, len_low, __u32, len_high, + int, advice) +{ + return ksys_fadvise64_64(fd, + (((u64)offset_high)<<32) | offset_low, + (((u64)len_high)<<32) | len_low, + advice); +} + +SYSCALL_DEFINE4(ia32_readahead, int, fd, unsigned int, off_lo, + unsigned int, off_hi, size_t, count) +{ + return ksys_readahead(fd, ((u64)off_hi << 32) | off_lo, count); +} + +SYSCALL_DEFINE6(ia32_sync_file_range, int, fd, unsigned int, off_low, + unsigned int, off_hi, unsigned int, n_low, + unsigned int, n_hi, int, flags) +{ + return ksys_sync_file_range(fd, + ((u64)off_hi << 32) | off_low, + ((u64)n_hi << 32) | n_low, flags); +} + +SYSCALL_DEFINE5(ia32_fadvise64, int, fd, unsigned int, offset_lo, + unsigned int, offset_hi, size_t, len, int, advice) +{ + return ksys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo, + len, advice); +} + +SYSCALL_DEFINE6(ia32_fallocate, int, fd, int, mode, + unsigned int, offset_lo, unsigned int, offset_hi, + unsigned int, len_lo, unsigned int, len_hi) +{ + return ksys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo, + ((u64)len_hi << 32) | len_lo); +} + +#ifdef CONFIG_IA32_EMULATION +/* + * Another set for IA32/LFS -- x86_64 struct stat is different due to + * support for 64bit inode numbers. 
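All of the sys_ia32.c wrappers above rebuild one 64-bit value from the two 32-bit halves that a 32-bit process passes in separate registers. The idiom in isolation, as a sketch (the helper name is hypothetical):

#include <stdint.h>

/*
 * Widen the high half before shifting; shifting a 32-bit value left
 * by 32 would be undefined, and the cast is what makes the merge work.
 */
static inline uint64_t merge_u64(uint32_t lo, uint32_t hi)
{
	return ((uint64_t)hi << 32) | lo;
}
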
+ */ +static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat) +{ + typeof(ubuf->st_uid) uid = 0; + typeof(ubuf->st_gid) gid = 0; + SET_UID(uid, from_kuid_munged(current_user_ns(), stat->uid)); + SET_GID(gid, from_kgid_munged(current_user_ns(), stat->gid)); + if (!access_ok(ubuf, sizeof(struct stat64)) || + __put_user(huge_encode_dev(stat->dev), &ubuf->st_dev) || + __put_user(stat->ino, &ubuf->__st_ino) || + __put_user(stat->ino, &ubuf->st_ino) || + __put_user(stat->mode, &ubuf->st_mode) || + __put_user(stat->nlink, &ubuf->st_nlink) || + __put_user(uid, &ubuf->st_uid) || + __put_user(gid, &ubuf->st_gid) || + __put_user(huge_encode_dev(stat->rdev), &ubuf->st_rdev) || + __put_user(stat->size, &ubuf->st_size) || + __put_user(stat->atime.tv_sec, &ubuf->st_atime) || + __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec) || + __put_user(stat->mtime.tv_sec, &ubuf->st_mtime) || + __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec) || + __put_user(stat->ctime.tv_sec, &ubuf->st_ctime) || + __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec) || + __put_user(stat->blksize, &ubuf->st_blksize) || + __put_user(stat->blocks, &ubuf->st_blocks)) + return -EFAULT; + return 0; +} + +COMPAT_SYSCALL_DEFINE2(ia32_stat64, const char __user *, filename, + struct stat64 __user *, statbuf) +{ + struct kstat stat; + int ret = vfs_stat(filename, &stat); + + if (!ret) + ret = cp_stat64(statbuf, &stat); + return ret; +} + +COMPAT_SYSCALL_DEFINE2(ia32_lstat64, const char __user *, filename, + struct stat64 __user *, statbuf) +{ + struct kstat stat; + int ret = vfs_lstat(filename, &stat); + if (!ret) + ret = cp_stat64(statbuf, &stat); + return ret; +} + +COMPAT_SYSCALL_DEFINE2(ia32_fstat64, unsigned int, fd, + struct stat64 __user *, statbuf) +{ + struct kstat stat; + int ret = vfs_fstat(fd, &stat); + if (!ret) + ret = cp_stat64(statbuf, &stat); + return ret; +} + +COMPAT_SYSCALL_DEFINE4(ia32_fstatat64, unsigned int, dfd, + const char __user *, filename, + struct stat64 __user *, statbuf, int, flag) +{ + struct kstat stat; + int error; + + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_stat64(statbuf, &stat); +} + +/* + * Linux/i386 didn't use to be able to handle more than + * 4 system call parameters, so these system calls used a memory + * block for parameter passing.. 
+ */ + +struct mmap_arg_struct32 { + unsigned int addr; + unsigned int len; + unsigned int prot; + unsigned int flags; + unsigned int fd; + unsigned int offset; +}; + +COMPAT_SYSCALL_DEFINE1(ia32_mmap, struct mmap_arg_struct32 __user *, arg) +{ + struct mmap_arg_struct32 a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + + if (a.offset & ~PAGE_MASK) + return -EINVAL; + + return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset>>PAGE_SHIFT); +} + +/* + * The 32-bit clone ABI is CONFIG_CLONE_BACKWARDS + */ +COMPAT_SYSCALL_DEFINE5(ia32_clone, unsigned long, clone_flags, + unsigned long, newsp, int __user *, parent_tidptr, + unsigned long, tls_val, int __user *, child_tidptr) +{ + struct kernel_clone_args args = { + .flags = (clone_flags & ~CSIGNAL), + .pidfd = parent_tidptr, + .child_tid = child_tidptr, + .parent_tid = parent_tidptr, + .exit_signal = (clone_flags & CSIGNAL), + .stack = newsp, + .tls = tls_val, + }; + + if (!legacy_clone_args_valid(&args)) + return -EINVAL; + + return _do_fork(&args); +} +#endif /* CONFIG_IA32_EMULATION */ diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index ca3c11a17b5a..504fa5425bce 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -21,7 +21,6 @@ #include <asm/elf.h> #include <asm/ia32.h> -#include <asm/syscalls.h> /* * Align a virtual address to avoid aliasing in the I$ on AMD F15h. diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index d8673d8a779b..106e7f87f534 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -62,19 +62,16 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static struct irqaction irq0 = { - .handler = timer_interrupt, - .flags = IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, - .name = "timer" -}; - static void __init setup_default_timer_irq(void) { + unsigned long flags = IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER; + /* - * Unconditionally register the legacy timer; even without legacy - * PIC/PIT we need this for the HPET0 in legacy replacement mode. + * Unconditionally register the legacy timer interrupt; even + * without legacy PIC/PIT we need this for the HPET0 in legacy + * replacement mode. */ - if (setup_irq(0, &irq0)) + if (request_irq(0, timer_interrupt, flags, "timer", NULL)) pr_info("Failed to register legacy timer interrupt\n"); } @@ -122,18 +119,12 @@ void __init time_init(void) */ void clocksource_arch_init(struct clocksource *cs) { - if (cs->archdata.vclock_mode == VCLOCK_NONE) + if (cs->vdso_clock_mode == VDSO_CLOCKMODE_NONE) return; - if (cs->archdata.vclock_mode > VCLOCK_MAX) { - pr_warn("clocksource %s registered with invalid vclock_mode %d. Disabling vclock.\n", - cs->name, cs->archdata.vclock_mode); - cs->archdata.vclock_mode = VCLOCK_NONE; - } - if (cs->mask != CLOCKSOURCE_MASK(64)) { - pr_warn("clocksource %s registered with invalid mask %016llx. Disabling vclock.\n", + pr_warn("clocksource %s registered with invalid mask %016llx for VDSO. 
Disabling VDSO support.\n", cs->name, cs->mask); - cs->archdata.vclock_mode = VCLOCK_NONE; + cs->vdso_clock_mode = VDSO_CLOCKMODE_NONE; } } diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index be5bc2e47c71..b8810ebbc8ae 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -59,39 +59,29 @@ __setup("cpu0_hotplug", enable_cpu0_hotplug); */ int _debug_hotplug_cpu(int cpu, int action) { - struct device *dev = get_cpu_device(cpu); int ret; if (!cpu_is_hotpluggable(cpu)) return -EINVAL; - lock_device_hotplug(); - switch (action) { case 0: - ret = cpu_down(cpu); - if (!ret) { + ret = remove_cpu(cpu); + if (!ret) pr_info("DEBUG_HOTPLUG_CPU0: CPU %u is now offline\n", cpu); - dev->offline = true; - kobject_uevent(&dev->kobj, KOBJ_OFFLINE); - } else + else pr_debug("Can't offline CPU%d.\n", cpu); break; case 1: - ret = cpu_up(cpu); - if (!ret) { - dev->offline = false; - kobject_uevent(&dev->kobj, KOBJ_ONLINE); - } else { + ret = add_cpu(cpu); + if (ret) pr_debug("Can't online CPU%d.\n", cpu); - } + break; default: ret = -EINVAL; } - unlock_device_hotplug(); - return ret; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 6ef00eb6fbb9..d54cffdc7cac 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -46,6 +46,7 @@ #include <asm/traps.h> #include <asm/desc.h> #include <asm/fpu/internal.h> +#include <asm/cpu.h> #include <asm/cpu_entry_area.h> #include <asm/mce.h> #include <asm/fixmap.h> @@ -242,7 +243,6 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, { struct task_struct *tsk = current; - if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code)) return; @@ -288,9 +288,29 @@ DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, 0, NULL, "coprocessor segment overru DO_ERROR(X86_TRAP_TS, SIGSEGV, 0, NULL, "invalid TSS", invalid_TSS) DO_ERROR(X86_TRAP_NP, SIGBUS, 0, NULL, "segment not present", segment_not_present) DO_ERROR(X86_TRAP_SS, SIGBUS, 0, NULL, "stack segment", stack_segment) -DO_ERROR(X86_TRAP_AC, SIGBUS, BUS_ADRALN, NULL, "alignment check", alignment_check) #undef IP +dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code) +{ + char *str = "alignment check"; + + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); + + if (notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_AC, SIGBUS) == NOTIFY_STOP) + return; + + if (!user_mode(regs)) + die("Split lock detected\n", regs, error_code); + + local_irq_enable(); + + if (handle_user_split_lock(regs, error_code)) + return; + + do_trap(X86_TRAP_AC, SIGBUS, "alignment check", regs, + error_code, BUS_ADRALN, NULL); +} + #ifdef CONFIG_VMAP_STACK __visible void __noreturn handle_stack_overflow(const char *message, struct pt_regs *regs, @@ -572,14 +592,20 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) return; /* - * Use ist_enter despite the fact that we don't use an IST stack. - * We can be called from a kprobe in non-CONTEXT_KERNEL kernel - * mode or even during context tracking state changes. + * Unlike any other non-IST entry, we can be called from a kprobe in + * non-CONTEXT_KERNEL kernel mode or even during context tracking + * state changes. Make sure that we wake up RCU even if we're coming + * from kernel code. * - * This means that we can't schedule. That's okay. + * This means that we can't schedule even if we came from a + * preemptible kernel context. That's okay. 
*/ - ist_enter(regs); + if (!user_mode(regs)) { + rcu_nmi_enter(); + preempt_disable(); + } RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); + #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) @@ -600,7 +626,10 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) cond_local_irq_disable(regs); exit: - ist_exit(regs); + if (!user_mode(regs)) { + preempt_enable_no_resched(); + rcu_nmi_exit(); + } } NOKPROBE_SYMBOL(do_int3); @@ -862,7 +891,25 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code) dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { - cond_local_irq_enable(regs); + /* + * This addresses a Pentium Pro Erratum: + * + * PROBLEM: If the APIC subsystem is configured in mixed mode with + * Virtual Wire mode implemented through the local APIC, an + * interrupt vector of 0Fh (Intel reserved encoding) may be + * generated by the local APIC (Int 15). This vector may be + * generated upon receipt of a spurious interrupt (an interrupt + * which is removed before the system receives the INTA sequence) + * instead of the programmed 8259 spurious interrupt vector. + * + * IMPLICATION: The spurious interrupt vector programmed in the + * 8259 is normally handled by an operating system's spurious + * interrupt handler. However, a vector of 0Fh is unknown to some + * operating systems, which would crash if this erratum occurred. + * + * In theory this could be limited to 32bit, but the handler is not + * hurting and who knows which other CPUs suffer from this. + */ } dotraplinkage void diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 7e322e2daaf5..fdd4c1078632 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -477,7 +477,7 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) * transition from one expected value to another with a fairly * high accuracy, and we didn't miss any events. We can thus * use the TSC value at the transitions to calculate a pretty - * good value for the TSC frequencty. + * good value for the TSC frequency. */ static inline int pit_verify_msb(unsigned char val) { @@ -1108,17 +1108,24 @@ static void tsc_cs_tick_stable(struct clocksource *cs) sched_clock_tick_stable(); } +static int tsc_cs_enable(struct clocksource *cs) +{ + vclocks_set_used(VDSO_CLOCKMODE_TSC); + return 0; +} + /* * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc() */ static struct clocksource clocksource_tsc_early = { - .name = "tsc-early", - .rating = 299, - .read = read_tsc, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS | + .name = "tsc-early", + .rating = 299, + .read = read_tsc, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, - .archdata = { .vclock_mode = VCLOCK_TSC }, + .vdso_clock_mode = VDSO_CLOCKMODE_TSC, + .enable = tsc_cs_enable, .resume = tsc_resume, .mark_unstable = tsc_cs_mark_unstable, .tick_stable = tsc_cs_tick_stable, @@ -1131,14 +1138,15 @@ static struct clocksource clocksource_tsc_early = { * been found good. 
*/ static struct clocksource clocksource_tsc = { - .name = "tsc", - .rating = 300, - .read = read_tsc, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS | + .name = "tsc", + .rating = 300, + .read = read_tsc, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_MUST_VERIFY, - .archdata = { .vclock_mode = VCLOCK_TSC }, + .vdso_clock_mode = VDSO_CLOCKMODE_TSC, + .enable = tsc_cs_enable, .resume = tsc_resume, .mark_unstable = tsc_cs_mark_unstable, .tick_stable = tsc_cs_tick_stable, diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index e0cbe4f2af49..4fec6f3a1858 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -15,18 +15,46 @@ #include <asm/param.h> #include <asm/tsc.h> -#define MAX_NUM_FREQS 9 +#define MAX_NUM_FREQS 16 /* 4 bits to select the frequency */ + +/* + * The frequency numbers in the SDM are e.g. 83.3 MHz, which is not very + * accurate and leads to clock drift. As far as we know Bay Trail SoCs use a + * 25 MHz crystal and Cherry Trail uses a 19.2 MHz crystal; the crystal is the + * source clk for a root PLL which outputs 1600 and 100 MHz. It is unclear if + * the root PLL outputs are used directly by the CPU clock PLL or if there is + * another PLL in between. + * This does not matter though: we can model the chain of PLLs as a single PLL + * with a quotient equal to the quotients of all PLLs in the chain multiplied. + * So we can create a simplified model of the CPU clock setup using a reference + * clock of 100 MHz plus a quotient which gets us as close to the frequency + * from the SDM as possible. + * For the 83.3 MHz example from above this would give us 100 MHz * 5 / 6 = + * 83 and 1/3 MHz, which matches exactly what has been measured on actual hw. + */ +#define TSC_REFERENCE_KHZ 100000 + +struct muldiv { + u32 multiplier; + u32 divider; +}; /* * If MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be * read in MSR_PLATFORM_ID[12:8], otherwise in MSR_PERF_STAT[44:40]. * Unfortunately some Intel Atom SoCs aren't quite compliant to this, * so we need manually differentiate SoC families. This is what the - * field msr_plat does. + * field use_msr_plat does. */ struct freq_desc { - u8 msr_plat; /* 1: use MSR_PLATFORM_INFO, 0: MSR_IA32_PERF_STATUS */ + bool use_msr_plat; + struct muldiv muldiv[MAX_NUM_FREQS]; + /* + * Some CPU frequencies in the SDM do not map to known PLL freqs; in + * that case the muldiv array is empty and the freqs array is used. + */ u32 freqs[MAX_NUM_FREQS]; + u32 mask; }; /* @@ -35,41 +63,91 @@ struct freq_desc { * by MSR based on SDM. 
*/ static const struct freq_desc freq_desc_pnw = { - 0, { 0, 0, 0, 0, 0, 99840, 0, 83200 } + .use_msr_plat = false, + .freqs = { 0, 0, 0, 0, 0, 99840, 0, 83200 }, + .mask = 0x07, }; static const struct freq_desc freq_desc_clv = { - 0, { 0, 133200, 0, 0, 0, 99840, 0, 83200 } + .use_msr_plat = false, + .freqs = { 0, 133200, 0, 0, 0, 99840, 0, 83200 }, + .mask = 0x07, }; +/* + * Bay Trail SDM MSR_FSB_FREQ frequencies simplified PLL model: + * 000: 100 * 5 / 6 = 83.3333 MHz + * 001: 100 * 1 / 1 = 100.0000 MHz + * 010: 100 * 4 / 3 = 133.3333 MHz + * 011: 100 * 7 / 6 = 116.6667 MHz + * 100: 100 * 4 / 5 = 80.0000 MHz + */ static const struct freq_desc freq_desc_byt = { - 1, { 83300, 100000, 133300, 116700, 80000, 0, 0, 0 } + .use_msr_plat = true, + .muldiv = { { 5, 6 }, { 1, 1 }, { 4, 3 }, { 7, 6 }, + { 4, 5 } }, + .mask = 0x07, }; +/* + * Cherry Trail SDM MSR_FSB_FREQ frequencies simplified PLL model: + * 0000: 100 * 5 / 6 = 83.3333 MHz + * 0001: 100 * 1 / 1 = 100.0000 MHz + * 0010: 100 * 4 / 3 = 133.3333 MHz + * 0011: 100 * 7 / 6 = 116.6667 MHz + * 0100: 100 * 4 / 5 = 80.0000 MHz + * 0101: 100 * 14 / 15 = 93.3333 MHz + * 0110: 100 * 9 / 10 = 90.0000 MHz + * 0111: 100 * 8 / 9 = 88.8889 MHz + * 1000: 100 * 7 / 8 = 87.5000 MHz + */ static const struct freq_desc freq_desc_cht = { - 1, { 83300, 100000, 133300, 116700, 80000, 93300, 90000, 88900, 87500 } + .use_msr_plat = true, + .muldiv = { { 5, 6 }, { 1, 1 }, { 4, 3 }, { 7, 6 }, + { 4, 5 }, { 14, 15 }, { 9, 10 }, { 8, 9 }, + { 7, 8 } }, + .mask = 0x0f, }; +/* + * Merrifield SDM MSR_FSB_FREQ frequencies simplified PLL model: + * 0001: 100 * 1 / 1 = 100.0000 MHz + * 0010: 100 * 4 / 3 = 133.3333 MHz + */ static const struct freq_desc freq_desc_tng = { - 1, { 0, 100000, 133300, 0, 0, 0, 0, 0 } + .use_msr_plat = true, + .muldiv = { { 0, 0 }, { 1, 1 }, { 4, 3 } }, + .mask = 0x07, }; +/* + * Moorefield SDM MSR_FSB_FREQ frequencies simplified PLL model: + * 0000: 100 * 5 / 6 = 83.3333 MHz + * 0001: 100 * 1 / 1 = 100.0000 MHz + * 0010: 100 * 4 / 3 = 133.3333 MHz + * 0011: 100 * 1 / 1 = 100.0000 MHz + */ static const struct freq_desc freq_desc_ann = { - 1, { 83300, 100000, 133300, 100000, 0, 0, 0, 0 } + .use_msr_plat = true, + .muldiv = { { 5, 6 }, { 1, 1 }, { 4, 3 }, { 1, 1 } }, + .mask = 0x0f, }; +/* 24 MHz crystal? 
: 24 * 13 / 4 = 78 MHz */ static const struct freq_desc freq_desc_lgm = { - 1, { 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000 } + .use_msr_plat = true, + .freqs = { 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000 }, + .mask = 0x0f, }; static const struct x86_cpu_id tsc_msr_cpu_ids[] = { - INTEL_CPU_FAM6(ATOM_SALTWELL_MID, freq_desc_pnw), - INTEL_CPU_FAM6(ATOM_SALTWELL_TABLET, freq_desc_clv), - INTEL_CPU_FAM6(ATOM_SILVERMONT, freq_desc_byt), - INTEL_CPU_FAM6(ATOM_SILVERMONT_MID, freq_desc_tng), - INTEL_CPU_FAM6(ATOM_AIRMONT, freq_desc_cht), - INTEL_CPU_FAM6(ATOM_AIRMONT_MID, freq_desc_ann), - INTEL_CPU_FAM6(ATOM_AIRMONT_NP, freq_desc_lgm), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_SALTWELL_MID, &freq_desc_pnw), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_SALTWELL_TABLET,&freq_desc_clv), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT, &freq_desc_byt), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID, &freq_desc_tng), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT, &freq_desc_cht), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_MID, &freq_desc_ann), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_NP, &freq_desc_lgm), {} }; @@ -81,17 +159,19 @@ static const struct x86_cpu_id tsc_msr_cpu_ids[] = { */ unsigned long cpu_khz_from_msr(void) { - u32 lo, hi, ratio, freq; + u32 lo, hi, ratio, freq, tscref; const struct freq_desc *freq_desc; const struct x86_cpu_id *id; + const struct muldiv *md; unsigned long res; + int index; id = x86_match_cpu(tsc_msr_cpu_ids); if (!id) return 0; freq_desc = (struct freq_desc *)id->driver_data; - if (freq_desc->msr_plat) { + if (freq_desc->use_msr_plat) { rdmsr(MSR_PLATFORM_INFO, lo, hi); ratio = (lo >> 8) & 0xff; } else { @@ -101,12 +181,28 @@ unsigned long cpu_khz_from_msr(void) /* Get FSB FREQ ID */ rdmsr(MSR_FSB_FREQ, lo, hi); + index = lo & freq_desc->mask; + md = &freq_desc->muldiv[index]; - /* Map CPU reference clock freq ID(0-7) to CPU reference clock freq(KHz) */ - freq = freq_desc->freqs[lo & 0x7]; + /* + * Note this also catches cases where the index points to an unpopulated + * part of muldiv, in that case the else will set freq and res to 0. + */ + if (md->divider) { + tscref = TSC_REFERENCE_KHZ * md->multiplier; + freq = DIV_ROUND_CLOSEST(tscref, md->divider); + /* + * Multiplying by ratio before the division has better + * accuracy than just calculating freq * ratio. + */ + res = DIV_ROUND_CLOSEST(tscref * ratio, md->divider); + } else { + freq = freq_desc->freqs[index]; + res = freq * ratio; + } - /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */ - res = freq * ratio; + if (freq == 0) + pr_err("Error MSR_FSB_FREQ index %d is unknown\n", index); #ifdef CONFIG_X86_LOCAL_APIC lapic_timer_period = (freq * 1000) / HZ; diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 32a818764e03..3d3c761eb74a 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -295,7 +295,7 @@ static cycles_t check_tsc_warp(unsigned int timeout) * But as the TSC is per-logical CPU and can potentially be modified wrongly * by the bios, TSC sync test for smaller duration should be able * to catch such errors. Also this will catch the condition where all the - * cores in the socket doesn't get reset at the same time. + * cores in the socket don't get reset at the same time. 
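A concrete check of why cpu_khz_from_msr() in the tsc_msr.c hunk above multiplies by the bus ratio before dividing: take the Bay Trail 83.3 MHz muldiv entry { 5, 6 } with an assumed bus ratio of 24 (the ratio value is made up for the example):

#include <stdint.h>
#include <stdio.h>

/* Unsigned-only stand-in for the kernel's DIV_ROUND_CLOSEST(). */
#define DIV_ROUND_CLOSEST(x, d)	(((x) + (d) / 2) / (d))

int main(void)
{
	uint64_t tscref = 100000ULL * 5;			/* kHz, { 5, 6 } entry */
	uint64_t freq = DIV_ROUND_CLOSEST(tscref, 6);		/* 83333 kHz */
	uint64_t res = DIV_ROUND_CLOSEST(tscref * 24, 6);	/* 2000000 kHz, exact */

	/* Dividing first loses the third of a kHz: 83333 * 24 = 1999992. */
	printf("%llu vs %llu kHz\n", (unsigned long long)res,
	       (unsigned long long)(freq * 24));
	return 0;
}

The late division keeps the full 1/3 kHz, so res comes out at exactly 2000000 kHz instead of 8 kHz short.
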
*/ static inline unsigned int loop_timeout(int cpu) { diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 91d55454e702..47a8676c7395 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -98,7 +98,6 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) struct task_struct *tsk = current; struct vm86plus_struct __user *user; struct vm86 *vm86 = current->thread.vm86; - long err = 0; /* * This gets called from entry.S with interrupts disabled, but @@ -114,37 +113,30 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask); user = vm86->user_vm86; - if (!access_ok(user, vm86->vm86plus.is_vm86pus ? + if (!user_access_begin(user, vm86->vm86plus.is_vm86pus ? sizeof(struct vm86plus_struct) : - sizeof(struct vm86_struct))) { - pr_alert("could not access userspace vm86 info\n"); - do_exit(SIGSEGV); - } - - put_user_try { - put_user_ex(regs->pt.bx, &user->regs.ebx); - put_user_ex(regs->pt.cx, &user->regs.ecx); - put_user_ex(regs->pt.dx, &user->regs.edx); - put_user_ex(regs->pt.si, &user->regs.esi); - put_user_ex(regs->pt.di, &user->regs.edi); - put_user_ex(regs->pt.bp, &user->regs.ebp); - put_user_ex(regs->pt.ax, &user->regs.eax); - put_user_ex(regs->pt.ip, &user->regs.eip); - put_user_ex(regs->pt.cs, &user->regs.cs); - put_user_ex(regs->pt.flags, &user->regs.eflags); - put_user_ex(regs->pt.sp, &user->regs.esp); - put_user_ex(regs->pt.ss, &user->regs.ss); - put_user_ex(regs->es, &user->regs.es); - put_user_ex(regs->ds, &user->regs.ds); - put_user_ex(regs->fs, &user->regs.fs); - put_user_ex(regs->gs, &user->regs.gs); - - put_user_ex(vm86->screen_bitmap, &user->screen_bitmap); - } put_user_catch(err); - if (err) { - pr_alert("could not access userspace vm86 info\n"); - do_exit(SIGSEGV); - } + sizeof(struct vm86_struct))) + goto Efault; + + unsafe_put_user(regs->pt.bx, &user->regs.ebx, Efault_end); + unsafe_put_user(regs->pt.cx, &user->regs.ecx, Efault_end); + unsafe_put_user(regs->pt.dx, &user->regs.edx, Efault_end); + unsafe_put_user(regs->pt.si, &user->regs.esi, Efault_end); + unsafe_put_user(regs->pt.di, &user->regs.edi, Efault_end); + unsafe_put_user(regs->pt.bp, &user->regs.ebp, Efault_end); + unsafe_put_user(regs->pt.ax, &user->regs.eax, Efault_end); + unsafe_put_user(regs->pt.ip, &user->regs.eip, Efault_end); + unsafe_put_user(regs->pt.cs, &user->regs.cs, Efault_end); + unsafe_put_user(regs->pt.flags, &user->regs.eflags, Efault_end); + unsafe_put_user(regs->pt.sp, &user->regs.esp, Efault_end); + unsafe_put_user(regs->pt.ss, &user->regs.ss, Efault_end); + unsafe_put_user(regs->es, &user->regs.es, Efault_end); + unsafe_put_user(regs->ds, &user->regs.ds, Efault_end); + unsafe_put_user(regs->fs, &user->regs.fs, Efault_end); + unsafe_put_user(regs->gs, &user->regs.gs, Efault_end); + unsafe_put_user(vm86->screen_bitmap, &user->screen_bitmap, Efault_end); + + user_access_end(); preempt_disable(); tsk->thread.sp0 = vm86->saved_sp0; @@ -159,6 +151,13 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) lazy_load_gs(vm86->regs32.gs); regs->pt.ax = retval; + return; + +Efault_end: + user_access_end(); +Efault: + pr_alert("could not access userspace vm86 info\n"); + do_exit(SIGSEGV); } static void mark_screen_rdonly(struct mm_struct *mm) @@ -243,6 +242,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) struct kernel_vm86_regs vm86regs; struct pt_regs *regs = current_pt_regs(); unsigned long err = 0; + struct vm86_struct v; err = 
security_mmap_addr(0); if (err) { @@ -278,39 +278,32 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) if (vm86->saved_sp0) return -EPERM; - if (!access_ok(user_vm86, plus ? - sizeof(struct vm86_struct) : - sizeof(struct vm86plus_struct))) + if (copy_from_user(&v, user_vm86, + offsetof(struct vm86_struct, int_revectored))) return -EFAULT; memset(&vm86regs, 0, sizeof(vm86regs)); - get_user_try { - unsigned short seg; - get_user_ex(vm86regs.pt.bx, &user_vm86->regs.ebx); - get_user_ex(vm86regs.pt.cx, &user_vm86->regs.ecx); - get_user_ex(vm86regs.pt.dx, &user_vm86->regs.edx); - get_user_ex(vm86regs.pt.si, &user_vm86->regs.esi); - get_user_ex(vm86regs.pt.di, &user_vm86->regs.edi); - get_user_ex(vm86regs.pt.bp, &user_vm86->regs.ebp); - get_user_ex(vm86regs.pt.ax, &user_vm86->regs.eax); - get_user_ex(vm86regs.pt.ip, &user_vm86->regs.eip); - get_user_ex(seg, &user_vm86->regs.cs); - vm86regs.pt.cs = seg; - get_user_ex(vm86regs.pt.flags, &user_vm86->regs.eflags); - get_user_ex(vm86regs.pt.sp, &user_vm86->regs.esp); - get_user_ex(seg, &user_vm86->regs.ss); - vm86regs.pt.ss = seg; - get_user_ex(vm86regs.es, &user_vm86->regs.es); - get_user_ex(vm86regs.ds, &user_vm86->regs.ds); - get_user_ex(vm86regs.fs, &user_vm86->regs.fs); - get_user_ex(vm86regs.gs, &user_vm86->regs.gs); - - get_user_ex(vm86->flags, &user_vm86->flags); - get_user_ex(vm86->screen_bitmap, &user_vm86->screen_bitmap); - get_user_ex(vm86->cpu_type, &user_vm86->cpu_type); - } get_user_catch(err); - if (err) - return err; + + vm86regs.pt.bx = v.regs.ebx; + vm86regs.pt.cx = v.regs.ecx; + vm86regs.pt.dx = v.regs.edx; + vm86regs.pt.si = v.regs.esi; + vm86regs.pt.di = v.regs.edi; + vm86regs.pt.bp = v.regs.ebp; + vm86regs.pt.ax = v.regs.eax; + vm86regs.pt.ip = v.regs.eip; + vm86regs.pt.cs = v.regs.cs; + vm86regs.pt.flags = v.regs.eflags; + vm86regs.pt.sp = v.regs.esp; + vm86regs.pt.ss = v.regs.ss; + vm86regs.es = v.regs.es; + vm86regs.ds = v.regs.ds; + vm86regs.fs = v.regs.fs; + vm86regs.gs = v.regs.gs; + + vm86->flags = v.flags; + vm86->screen_bitmap = v.screen_bitmap; + vm86->cpu_type = v.cpu_type; if (copy_from_user(&vm86->int_revectored, &user_vm86->int_revectored, diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index e3296aa028fe..1bf7e312361f 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -21,6 +21,7 @@ #define LOAD_OFFSET __START_KERNEL_map #endif +#define RUNTIME_DISCARD_EXIT #define EMITS_PT_NOTE #define RO_EXCEPTION_TABLE_ALIGN 16 @@ -313,8 +314,8 @@ SECTIONS . = ALIGN(8); /* - * .exit.text is discard at runtime, not link time, to deal with - * references from .altinstructions and .eh_frame + * .exit.text is discarded at runtime, not link time, to deal with + * references from .altinstructions */ .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { EXIT_TEXT @@ -412,9 +413,6 @@ SECTIONS DWARF_DEBUG DISCARDS - /DISCARD/ : { - *(.eh_frame) - } } |
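
One pattern shows up twice in this series: restore_sigcontext() bounds its copy_from_user() with offsetof(struct sigcontext, reserved1), and do_sys_vm86() with offsetof(struct vm86_struct, int_revectored), so a single bulk copy replaces a word-at-a-time get_user_ex() block while the tail of the structure is left untouched. A user-space flavoured sketch of the technique; the struct and names are hypothetical:

#include <stddef.h>
#include <string.h>

/* Hypothetical layout: only the fields before 'tail' are wanted. */
struct frame {
	unsigned long a, b, c;
	unsigned long tail[64];
};

/* Bulk-copy the leading fields in one go, never touching 'tail'. */
static void copy_head(struct frame *dst, const struct frame *src)
{
	memcpy(dst, src, offsetof(struct frame, tail));
}

In the kernel versions the memcpy is copy_from_user() and the bound is the first field the function does not need.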