Diffstat (limited to 'arch/x86/kernel')
91 files changed, 1760 insertions, 1590 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 2ddf08351f0b..0704c2a94272 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -35,7 +35,6 @@ KASAN_SANITIZE_sev-es.o := n KCSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD_test_nx.o := y -OBJECT_FILES_NON_STANDARD_paravirt_patch.o := y ifdef CONFIG_FRAME_POINTER OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y @@ -121,7 +120,7 @@ obj-$(CONFIG_AMD_NB) += amd_nb.o obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o -obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch.o +obj-$(CONFIG_PARAVIRT) += paravirt.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 7bdc0239a943..e90310cbe73a 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -830,7 +830,7 @@ int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base) EXPORT_SYMBOL(acpi_unregister_ioapic); /** - * acpi_ioapic_registered - Check whether IOAPIC assoicatied with @gsi_base + * acpi_ioapic_registered - Check whether IOAPIC associated with @gsi_base * has been registered * @handle: ACPI handle of the IOAPIC device * @gsi_base: GSI base associated with the IOAPIC @@ -1554,10 +1554,18 @@ void __init acpi_boot_table_init(void) /* * Initialize the ACPI boot-time table parser. */ - if (acpi_table_init()) { + if (acpi_locate_initial_tables()) disable_acpi(); - return; - } + else + acpi_reserve_initial_tables(); +} + +int __init early_acpi_boot_init(void) +{ + if (acpi_disabled) + return 1; + + acpi_table_init_complete(); acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf); @@ -1570,18 +1578,9 @@ void __init acpi_boot_table_init(void) } else { printk(KERN_WARNING PREFIX "Disabling ACPI support\n"); disable_acpi(); - return; + return 1; } } -} - -int __init early_acpi_boot_init(void) -{ - /* - * If acpi_disabled, bail out - */ - if (acpi_disabled) - return 1; /* * Process the Multiple APIC Description Table (MADT), if present @@ -1657,7 +1656,7 @@ static int __init parse_acpi(char *arg) else if (strcmp(arg, "noirq") == 0) { acpi_noirq_set(); } - /* "acpi=copy_dsdt" copys DSDT */ + /* "acpi=copy_dsdt" copies DSDT */ else if (strcmp(arg, "copy_dsdt") == 0) { acpi_gbl_copy_dsdt_locally = 1; } diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index cc1fea76aab0..3f85fcae450c 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -41,7 +41,7 @@ unsigned long acpi_get_wakeup_address(void) * x86_acpi_enter_sleep_state - enter sleep state * @state: Sleep state to enter. * - * Wrapper around acpi_enter_sleep_state() to be called by assmebly. + * Wrapper around acpi_enter_sleep_state() to be called by assembly. */ asmlinkage acpi_status __visible x86_acpi_enter_sleep_state(u8 state) { diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 56b6865afb2a..d5d8a352eafa 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -115,7 +115,7 @@ SYM_FUNC_START(do_suspend_lowlevel) movq pt_regs_r14(%rax), %r14 movq pt_regs_r15(%rax), %r15 -#if defined(CONFIG_KASAN) && CONFIG_KASAN_STACK +#if defined(CONFIG_KASAN) && defined(CONFIG_KASAN_STACK) /* * The suspend path may have poisoned some areas deeper in the stack, * which we now need to unpoison. 
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 8d778e46725d..84ec0ba491e4 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -28,6 +28,7 @@ #include <asm/insn.h> #include <asm/io.h> #include <asm/fixmap.h> +#include <asm/paravirt.h> int __read_mostly alternatives_patched; @@ -74,186 +75,30 @@ do { \ } \ } while (0) -/* - * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes - * that correspond to that nop. Getting from one nop to the next, we - * add to the array the offset that is equal to the sum of all sizes of - * nops preceding the one we are after. - * - * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the - * nice symmetry of sizes of the previous nops. - */ -#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64) -static const unsigned char intelnops[] = -{ - GENERIC_NOP1, - GENERIC_NOP2, - GENERIC_NOP3, - GENERIC_NOP4, - GENERIC_NOP5, - GENERIC_NOP6, - GENERIC_NOP7, - GENERIC_NOP8, - GENERIC_NOP5_ATOMIC -}; -static const unsigned char * const intel_nops[ASM_NOP_MAX+2] = +const unsigned char x86nops[] = { - NULL, - intelnops, - intelnops + 1, - intelnops + 1 + 2, - intelnops + 1 + 2 + 3, - intelnops + 1 + 2 + 3 + 4, - intelnops + 1 + 2 + 3 + 4 + 5, - intelnops + 1 + 2 + 3 + 4 + 5 + 6, - intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7, - intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, + BYTES_NOP1, + BYTES_NOP2, + BYTES_NOP3, + BYTES_NOP4, + BYTES_NOP5, + BYTES_NOP6, + BYTES_NOP7, + BYTES_NOP8, }; -#endif -#ifdef K8_NOP1 -static const unsigned char k8nops[] = -{ - K8_NOP1, - K8_NOP2, - K8_NOP3, - K8_NOP4, - K8_NOP5, - K8_NOP6, - K8_NOP7, - K8_NOP8, - K8_NOP5_ATOMIC -}; -static const unsigned char * const k8_nops[ASM_NOP_MAX+2] = +const unsigned char * const x86_nops[ASM_NOP_MAX+1] = { NULL, - k8nops, - k8nops + 1, - k8nops + 1 + 2, - k8nops + 1 + 2 + 3, - k8nops + 1 + 2 + 3 + 4, - k8nops + 1 + 2 + 3 + 4 + 5, - k8nops + 1 + 2 + 3 + 4 + 5 + 6, - k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, - k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, + x86nops, + x86nops + 1, + x86nops + 1 + 2, + x86nops + 1 + 2 + 3, + x86nops + 1 + 2 + 3 + 4, + x86nops + 1 + 2 + 3 + 4 + 5, + x86nops + 1 + 2 + 3 + 4 + 5 + 6, + x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, }; -#endif - -#if defined(K7_NOP1) && !defined(CONFIG_X86_64) -static const unsigned char k7nops[] = -{ - K7_NOP1, - K7_NOP2, - K7_NOP3, - K7_NOP4, - K7_NOP5, - K7_NOP6, - K7_NOP7, - K7_NOP8, - K7_NOP5_ATOMIC -}; -static const unsigned char * const k7_nops[ASM_NOP_MAX+2] = -{ - NULL, - k7nops, - k7nops + 1, - k7nops + 1 + 2, - k7nops + 1 + 2 + 3, - k7nops + 1 + 2 + 3 + 4, - k7nops + 1 + 2 + 3 + 4 + 5, - k7nops + 1 + 2 + 3 + 4 + 5 + 6, - k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, - k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, -}; -#endif - -#ifdef P6_NOP1 -static const unsigned char p6nops[] = -{ - P6_NOP1, - P6_NOP2, - P6_NOP3, - P6_NOP4, - P6_NOP5, - P6_NOP6, - P6_NOP7, - P6_NOP8, - P6_NOP5_ATOMIC -}; -static const unsigned char * const p6_nops[ASM_NOP_MAX+2] = -{ - NULL, - p6nops, - p6nops + 1, - p6nops + 1 + 2, - p6nops + 1 + 2 + 3, - p6nops + 1 + 2 + 3 + 4, - p6nops + 1 + 2 + 3 + 4 + 5, - p6nops + 1 + 2 + 3 + 4 + 5 + 6, - p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, - p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, -}; -#endif - -/* Initialize these to a safe default */ -#ifdef CONFIG_X86_64 -const unsigned char * const *ideal_nops = p6_nops; -#else -const unsigned char * const *ideal_nops = intel_nops; -#endif - -void __init arch_init_ideal_nops(void) -{ - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - /* - 
* Due to a decoder implementation quirk, some - * specific Intel CPUs actually perform better with - * the "k8_nops" than with the SDM-recommended NOPs. - */ - if (boot_cpu_data.x86 == 6 && - boot_cpu_data.x86_model >= 0x0f && - boot_cpu_data.x86_model != 0x1c && - boot_cpu_data.x86_model != 0x26 && - boot_cpu_data.x86_model != 0x27 && - boot_cpu_data.x86_model < 0x30) { - ideal_nops = k8_nops; - } else if (boot_cpu_has(X86_FEATURE_NOPL)) { - ideal_nops = p6_nops; - } else { -#ifdef CONFIG_X86_64 - ideal_nops = k8_nops; -#else - ideal_nops = intel_nops; -#endif - } - break; - - case X86_VENDOR_HYGON: - ideal_nops = p6_nops; - return; - - case X86_VENDOR_AMD: - if (boot_cpu_data.x86 > 0xf) { - ideal_nops = p6_nops; - return; - } - - fallthrough; - - default: -#ifdef CONFIG_X86_64 - ideal_nops = k8_nops; -#else - if (boot_cpu_has(X86_FEATURE_K8)) - ideal_nops = k8_nops; - else if (boot_cpu_has(X86_FEATURE_K7)) - ideal_nops = k7_nops; - else - ideal_nops = intel_nops; -#endif - } -} /* Use this to add nops to a buffer, then text_poke the whole buffer. */ static void __init_or_module add_nops(void *insns, unsigned int len) @@ -262,7 +107,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) unsigned int noplen = len; if (noplen > ASM_NOP_MAX) noplen = ASM_NOP_MAX; - memcpy(insns, ideal_nops[noplen], noplen); + memcpy(insns, x86_nops[noplen], noplen); insns += noplen; len -= noplen; } @@ -344,19 +189,35 @@ done: static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr) { unsigned long flags; - int i; + struct insn insn; + int nop, i = 0; + + /* + * Jump over the non-NOP insns, the remaining bytes must be single-byte + * NOPs, optimize them. + */ + for (;;) { + if (insn_decode_kernel(&insn, &instr[i])) + return; + + if (insn.length == 1 && insn.opcode.bytes[0] == 0x90) + break; + + if ((i += insn.length) >= a->instrlen) + return; + } - for (i = 0; i < a->padlen; i++) { - if (instr[i] != 0x90) + for (nop = i; i < a->instrlen; i++) { + if (WARN_ONCE(instr[i] != 0x90, "Not a NOP at 0x%px\n", &instr[i])) return; } local_irq_save(flags); - add_nops(instr + (a->instrlen - a->padlen), a->padlen); + add_nops(instr + nop, i - nop); local_irq_restore(flags); DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ", - instr, a->instrlen - a->padlen, a->padlen); + instr, nop, a->instrlen); } /* @@ -388,23 +249,29 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, */ for (a = start; a < end; a++) { int insn_buff_sz = 0; + /* Mask away "NOT" flag bit for feature to test. */ + u16 feature = a->cpuid & ~ALTINSTR_FLAG_INV; instr = (u8 *)&a->instr_offset + a->instr_offset; replacement = (u8 *)&a->repl_offset + a->repl_offset; BUG_ON(a->instrlen > sizeof(insn_buff)); - BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); - if (!boot_cpu_has(a->cpuid)) { - if (a->padlen > 1) - optimize_nops(a, instr); + BUG_ON(feature >= (NCAPINTS + NBUGINTS) * 32); - continue; - } + /* + * Patch if either: + * - feature is present + * - feature not present but ALTINSTR_FLAG_INV is set to mean, + * patch if feature is *NOT* present. + */ + if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV)) + goto next; - DPRINTK("feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d), pad: %d", - a->cpuid >> 5, - a->cpuid & 0x1f, + DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)", + (a->cpuid & ALTINSTR_FLAG_INV) ? "!" 
: "", + feature >> 5, + feature & 0x1f, instr, instr, a->instrlen, - replacement, a->replacementlen, a->padlen); + replacement, a->replacementlen); DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr); DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement); @@ -428,14 +295,15 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, if (a->replacementlen && is_jmp(replacement[0])) recompute_jump(a, instr, replacement, insn_buff); - if (a->instrlen > a->replacementlen) { - add_nops(insn_buff + a->replacementlen, - a->instrlen - a->replacementlen); - insn_buff_sz += a->instrlen - a->replacementlen; - } + for (; insn_buff_sz < a->instrlen; insn_buff_sz++) + insn_buff[insn_buff_sz] = 0x90; + DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr); text_poke_early(instr, insn_buff, insn_buff_sz); + +next: + optimize_nops(a, instr); } } @@ -605,7 +473,7 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start, BUG_ON(p->len > MAX_PATCH_LEN); /* prep the buffer with the original instructions */ memcpy(insn_buff, p->instr, p->len); - used = pv_ops.init.patch(p->type, insn_buff, (unsigned long)p->instr, p->len); + used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len); BUG_ON(used > p->len); @@ -723,6 +591,33 @@ void __init alternative_instructions(void) * patching. */ + /* + * Paravirt patching and alternative patching can be combined to + * replace a function call with a short direct code sequence (e.g. + * by setting a constant return value instead of doing that in an + * external function). + * In order to make this work the following sequence is required: + * 1. set (artificial) features depending on used paravirt + * functions which can later influence alternative patching + * 2. apply paravirt patching (generally replacing an indirect + * function call with a direct one) + * 3. apply alternative patching (e.g. replacing a direct function + * call with a custom code sequence) + * Doing paravirt patching after alternative patching would clobber + * the optimization of the custom code with a function call again. + */ + paravirt_set_cap(); + + /* + * First patch paravirt functions, such that we overwrite the indirect + * call with the direct call. + */ + apply_paravirt(__parainstructions, __parainstructions_end); + + /* + * Then patch alternatives, such that those paravirt calls that are in + * alternatives can be overwritten by their immediate fragments. 
+ */ apply_alternatives(__alt_instructions, __alt_instructions_end); #ifdef CONFIG_SMP @@ -741,8 +636,6 @@ void __init alternative_instructions(void) } #endif - apply_paravirt(__parainstructions, __parainstructions_end); - restart_nmi(); alternatives_patched = 1; } @@ -1274,15 +1167,15 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, const void *opcode, size_t len, const void *emulate) { struct insn insn; + int ret; memcpy((void *)tp->text, opcode, len); if (!emulate) emulate = opcode; - kernel_insn_init(&insn, emulate, MAX_INSN_SIZE); - insn_get_length(&insn); + ret = insn_decode_kernel(&insn, emulate); - BUG_ON(!insn_complete(&insn)); + BUG_ON(ret < 0); BUG_ON(len != insn.length); tp->rel_addr = addr - (void *)_stext; @@ -1302,13 +1195,13 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, default: /* assume NOP */ switch (len) { case 2: /* NOP2 -- emulate as JMP8+0 */ - BUG_ON(memcmp(emulate, ideal_nops[len], len)); + BUG_ON(memcmp(emulate, x86_nops[len], len)); tp->opcode = JMP8_INSN_OPCODE; tp->rel32 = 0; break; case 5: /* NOP5 -- emulate as JMP32+0 */ - BUG_ON(memcmp(emulate, ideal_nops[NOP_ATOMIC5], len)); + BUG_ON(memcmp(emulate, x86_nops[len], len)); tp->opcode = JMP32_INSN_OPCODE; tp->rel32 = 0; break; diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index b4396952c9a6..09083094eb57 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Shared support code for AMD K8 northbridges and derivates. + * Shared support code for AMD K8 northbridges and derivatives. * Copyright 2006 Andi Kleen, SUSE Labs. */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index bda4f2a36868..4a39fb429f15 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -619,7 +619,7 @@ static void setup_APIC_timer(void) if (this_cpu_has(X86_FEATURE_ARAT)) { lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP; - /* Make LAPIC timer preferrable over percpu HPET */ + /* Make LAPIC timer preferable over percpu HPET */ lapic_clockevent.rating = 150; } @@ -666,7 +666,7 @@ void lapic_update_tsc_freq(void) * In this functions we calibrate APIC bus clocks to the external timer. * * We want to do the calibration only once since we want to have local timer - * irqs syncron. CPUs connected by the same APIC bus have the very same bus + * irqs synchronous. CPUs connected by the same APIC bus have the very same bus * frequency. * * This was previously done by reading the PIT/HPET and waiting for a wrap @@ -1532,7 +1532,7 @@ static bool apic_check_and_ack(union apic_ir *irr, union apic_ir *isr) * Most probably by now the CPU has serviced that pending interrupt and it * might not have done the ack_APIC_irq() because it thought, interrupt * came from i8259 as ExtInt. LAPIC did not get EOI so it does not clear - * the ISR bit and cpu thinks it has already serivced the interrupt. Hence + * the ISR bit and cpu thinks it has already serviced the interrupt. Hence * a vector might get locked. It was noticed for timer irq (vector * 0x31). Issue an extra EOI to clear ISR. * @@ -1657,7 +1657,7 @@ static void setup_local_APIC(void) */ /* * Actually disabling the focus CPU check just makes the hang less - * frequent as it makes the interrupt distributon model be more + * frequent as it makes the interrupt distribution model be more * like LRU than MRU (the short-term load is more even across CPUs). 
*/ @@ -1875,7 +1875,7 @@ static __init void try_to_enable_x2apic(int remap_mode) /* * Without IR, all CPUs can be addressed by IOAPIC/MSI only - * in physical mode, and CPUs with an APIC ID that cannnot + * in physical mode, and CPUs with an APIC ID that cannot * be addressed must not be brought online. */ x2apic_set_max_apicid(apic_limit); @@ -2342,6 +2342,11 @@ static int cpuid_to_apicid[] = { [0 ... NR_CPUS - 1] = -1, }; +bool arch_match_cpu_phys_id(int cpu, u64 phys_id) +{ + return phys_id == cpuid_to_apicid[cpu]; +} + #ifdef CONFIG_SMP /** * apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index c3b60c37c728..d5c691a3208b 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -928,7 +928,7 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) /* * setup_IO_APIC_irqs() programs all legacy IRQs with default trigger - * and polarity attirbutes. So allow the first user to reprogram the + * and polarity attributes. So allow the first user to reprogram the * pin with real trigger and polarity attributes. */ if (irq < nr_legacy_irqs() && data->count == 1) { @@ -994,7 +994,7 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain, /* * Legacy ISA IRQ has already been allocated, just add pin to - * the pin list assoicated with this IRQ and program the IOAPIC + * the pin list associated with this IRQ and program the IOAPIC * entry. The IOAPIC entry */ if (irq_data && irq_data->parent_data) { @@ -1032,6 +1032,16 @@ static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) { irq = mp_irqs[idx].srcbusirq; legacy = mp_is_legacy_irq(irq); + /* + * IRQ2 is unusable for historical reasons on systems which + * have a legacy PIC. See the comment vs. IRQ2 further down. + * + * If this gets removed at some point then the related code + * in lapic_assign_system_vectors() needs to be adjusted as + * well. + */ + if (legacy && irq == PIC_CASCADE_IR) + return -EINVAL; } mutex_lock(&ioapic_mutex); @@ -1742,7 +1752,7 @@ static inline void ioapic_finish_move(struct irq_data *data, bool moveit) * with masking the ioapic entry and then polling until * Remote IRR was clear before reprogramming the * ioapic I don't trust the Remote IRR bit to be - * completey accurate. + * completely accurate. * * However there appears to be no other way to plug * this race, so if the Remote IRR bit is not @@ -1820,7 +1830,7 @@ static void ioapic_ack_level(struct irq_data *irq_data) /* * Tail end of clearing remote IRR bit (either by delivering the EOI * message via io-apic EOI register write or simulating it using - * mask+edge followed by unnask+level logic) manually when the + * mask+edge followed by unmask+level logic) manually when the * level triggered interrupt is seen as the edge triggered interrupt * at the cpu. */ diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 3c9c7492252f..6dbdc7c22bb7 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -543,6 +543,14 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1) return -ENOSYS; + /* + * Catch any attempt to touch the cascade interrupt on a PIC + * equipped system. 
+ */ + if (WARN_ON_ONCE(info->flags & X86_IRQ_ALLOC_LEGACY && + virq == PIC_CASCADE_IR)) + return -EINVAL; + for (i = 0; i < nr_irqs; i++) { irqd = irq_domain_get_irq_data(domain, virq + i); BUG_ON(!irqd); @@ -745,6 +753,11 @@ void __init lapic_assign_system_vectors(void) /* Mark the preallocated legacy interrupts */ for (i = 0; i < nr_legacy_irqs(); i++) { + /* + * Don't touch the cascade interrupt. It's unusable + * on PIC equipped machines. See the large comment + * in the IO/APIC code. + */ if (i != PIC_CASCADE_IR) irq_matrix_assign(vector_matrix, ISA_IRQ_VECTOR(i)); } @@ -1045,7 +1058,7 @@ void irq_force_complete_move(struct irq_desc *desc) * * But in case of cpu hotplug this should be a non issue * because if the affinity update happens right before all - * cpus rendevouz in stop machine, there is no way that the + * cpus rendezvous in stop machine, there is no way that the * interrupt can be blocked on the target cpu because all cpus * loops first with interrupts enabled in stop machine, so the * old vector is not yet cleaned up when the interrupt fires. @@ -1054,7 +1067,7 @@ void irq_force_complete_move(struct irq_desc *desc) * of the interrupt on the apic/system bus would be delayed * beyond the point where the target cpu disables interrupts * in stop machine. I doubt that it can happen, but at least - * there is a theroretical chance. Virtualization might be + * there is a theoretical chance. Virtualization might be * able to expose this, but AFAICT the IOAPIC emulation is not * as stupid as the real hardware. * diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 52bc217ca8c3..f5a48e66e4f5 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -369,6 +369,15 @@ static int __init early_get_arch_type(void) return ret; } +/* UV system found, check which APIC MODE BIOS already selected */ +static void __init early_set_apic_mode(void) +{ + if (x2apic_enabled()) + uv_system_type = UV_X2APIC; + else + uv_system_type = UV_LEGACY_APIC; +} + static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id) { /* Save OEM_ID passed from ACPI MADT */ @@ -404,11 +413,12 @@ static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id) else uv_hubless_system |= 0x8; - /* Copy APIC type */ + /* Copy OEM Table ID */ uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id); pr_info("UV: OEM IDs %s/%s, SystemType %d, HUBLESS ID %x\n", oem_id, oem_table_id, uv_system_type, uv_hubless_system); + return 0; } @@ -453,6 +463,7 @@ static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id) early_set_hub_type(); /* Other UV setup functions */ + early_set_apic_mode(); early_get_pnodeid(); early_get_apic_socketid_shift(); x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; @@ -472,29 +483,14 @@ static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id) if (uv_set_system_type(_oem_id, _oem_table_id) == 0) return 0; - /* Save and Decode OEM Table ID */ + /* Save for display of the OEM Table ID */ uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id); - /* This is the most common hardware variant, x2apic mode */ - if (!strcmp(oem_table_id, "UVX")) - uv_system_type = UV_X2APIC; - - /* Only used for very small systems, usually 1 chassis, legacy mode */ - else if (!strcmp(oem_table_id, "UVL")) - uv_system_type = UV_LEGACY_APIC; - - else - goto badbios; - pr_info("UV: OEM IDs %s/%s, System/UVType %d/0x%x, HUB RevID %d\n", oem_id, oem_table_id, uv_system_type, 
is_uv(UV_ANY), uv_min_hub_revision_id); return 0; - -badbios: - pr_err("UV: UVarchtype:%s not supported\n", uv_archtype); - BUG(); } enum uv_system_type get_uv_system_type(void) @@ -1671,6 +1667,9 @@ static __init int uv_system_init_hubless(void) if (rc < 0) return rc; + /* Set section block size for current node memory */ + set_block_size(); + /* Create user access node */ if (rc >= 0) uv_setup_proc_files(1); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 660270359d39..241dda687eb9 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -94,7 +94,7 @@ * Remove APM dependencies in arch/i386/kernel/process.c * Remove APM dependencies in drivers/char/sysrq.c * Reset time across standby. - * Allow more inititialisation on SMP. + * Allow more initialisation on SMP. * Remove CONFIG_APM_POWER_OFF and make it boot time * configurable (default on). * Make debug only a boot time parameter (remove APM_DEBUG). @@ -766,7 +766,7 @@ static int apm_driver_version(u_short *val) * not cleared until it is acknowledged. * * Additional information is returned in the info pointer, providing - * that APM 1.2 is in use. If no messges are pending the value 0x80 + * that APM 1.2 is in use. If no messages are pending the value 0x80 * is returned (No power management events pending). */ static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info) @@ -1025,7 +1025,7 @@ static int apm_enable_power_management(int enable) * status which gives the rough battery status, and current power * source. The bat value returned give an estimate as a percentage * of life and a status value for the battery. The estimated life - * if reported is a lifetime in secodnds/minutes at current powwer + * if reported is a lifetime in seconds/minutes at current power * consumption. 
*/ diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 60b9f42ce3c1..ecd3fd6993d1 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -61,13 +61,6 @@ static void __used common(void) OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext); #endif -#ifdef CONFIG_PARAVIRT_XXL - BLANK(); - OFFSET(PV_IRQ_irq_disable, paravirt_patch_template, irq.irq_disable); - OFFSET(PV_IRQ_irq_enable, paravirt_patch_template, irq.irq_enable); - OFFSET(PV_CPU_iret, paravirt_patch_template, cpu.iret); -#endif - #ifdef CONFIG_XEN BLANK(); OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 6e043f295a60..2b411cd00a4e 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -53,11 +53,6 @@ void foo(void) offsetof(struct cpu_entry_area, tss.x86_tss.sp1) - offsetofend(struct cpu_entry_area, entry_stack_page.stack)); -#ifdef CONFIG_STACKPROTECTOR - BLANK(); - OFFSET(stack_canary_offset, stack_canary, canary); -#endif - BLANK(); DEFINE(EFI_svam, offsetof(efi_runtime_services_t, set_virtual_address_map)); } diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 347a956f71ca..2d11384dc9ab 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -628,11 +628,6 @@ static void early_init_amd(struct cpuinfo_x86 *c) early_init_amd_mc(c); -#ifdef CONFIG_X86_32 - if (c->x86 == 6) - set_cpu_cap(c, X86_FEATURE_K7); -#endif - if (c->x86 >= 0xf) set_cpu_cap(c, X86_FEATURE_K8); diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index 3ca9be482a9e..d66af2950e06 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -877,7 +877,7 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) static int __cache_amd_cpumap_setup(unsigned int cpu, int index, struct _cpuid4_info_regs *base) { - struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + struct cpu_cacheinfo *this_cpu_ci; struct cacheinfo *this_leaf; int i, sibling; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ab640abe26b6..6bdb69a9a7dc 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -161,7 +161,6 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), - GDT_STACK_CANARY_INIT #endif } }; EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); @@ -482,7 +481,7 @@ static __always_inline void setup_pku(struct cpuinfo_x86 *c) if (pk) pk->pkru = init_pkru_value; /* - * Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE + * Setting X86_CR4_PKE will cause the X86_FEATURE_OSPKE * cpuid bit to be set. We need to ensure that we * update that bit in this CPU's "cpu_info". */ @@ -599,7 +598,6 @@ void load_percpu_segment(int cpu) __loadsegment_simple(gs, 0); wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); #endif - load_stack_canary_segment(); } #ifdef CONFIG_X86_32 @@ -1330,7 +1328,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) cpu_set_bug_bits(c); - cpu_set_core_cap_bits(c); + sld_setup(c); fpu__init_system(c); @@ -1404,7 +1402,7 @@ static void detect_null_seg_behavior(struct cpuinfo_x86 *c) * where GS is unused by the prev and next threads. 
* * Since neither vendor documents this anywhere that I can see, - * detect it directly instead of hardcoding the choice by + * detect it directly instead of hard-coding the choice by * vendor. * * I've designated AMD's behavior as the "bug" because it's @@ -1748,6 +1746,8 @@ DEFINE_PER_CPU(bool, hardirq_stack_inuse); DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; EXPORT_PER_CPU_SYMBOL(__preempt_count); +DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK; + /* May not be marked __init: used by software suspend */ void syscall_init(void) { @@ -1796,7 +1796,8 @@ DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack); #ifdef CONFIG_STACKPROTECTOR -DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); +DEFINE_PER_CPU(unsigned long, __stack_chk_guard); +EXPORT_PER_CPU_SYMBOL(__stack_chk_guard); #endif #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 42af31b64c2c..defda61f372d 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -72,6 +72,9 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW }, { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, { X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA }, + { X86_FEATURE_SGX_LC, X86_FEATURE_SGX }, + { X86_FEATURE_SGX1, X86_FEATURE_SGX }, + { X86_FEATURE_SGX2, X86_FEATURE_SGX1 }, {} }; diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 1d9b8aaea06c..7227c15299d0 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -291,7 +291,7 @@ static void init_cyrix(struct cpuinfo_x86 *c) mark_tsc_unstable("cyrix 5510/5520 detected"); } #endif - c->x86_cache_size = 16; /* Yep 16K integrated cache thats it */ + c->x86_cache_size = 16; /* Yep 16K integrated cache that's it */ /* GXm supports extended cpuid levels 'ala' AMD */ if (c->cpuid_level == 2) { diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index 3b1b01f2b248..da696eb4821a 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -93,15 +93,9 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c) } #endif /* CONFIG_X86_VMX_FEATURE_NAMES */ -static void clear_sgx_caps(void) -{ - setup_clear_cpu_cap(X86_FEATURE_SGX); - setup_clear_cpu_cap(X86_FEATURE_SGX_LC); -} - static int __init nosgx(char *str) { - clear_sgx_caps(); + setup_clear_cpu_cap(X86_FEATURE_SGX); return 0; } @@ -110,23 +104,30 @@ early_param("nosgx", nosgx); void init_ia32_feat_ctl(struct cpuinfo_x86 *c) { + bool enable_sgx_kvm = false, enable_sgx_driver = false; bool tboot = tboot_enabled(); - bool enable_sgx; + bool enable_vmx; u64 msr; if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) { clear_cpu_cap(c, X86_FEATURE_VMX); - clear_sgx_caps(); + clear_cpu_cap(c, X86_FEATURE_SGX); return; } - /* - * Enable SGX if and only if the kernel supports SGX and Launch Control - * is supported, i.e. disable SGX if the LE hash MSRs can't be written. - */ - enable_sgx = cpu_has(c, X86_FEATURE_SGX) && - cpu_has(c, X86_FEATURE_SGX_LC) && - IS_ENABLED(CONFIG_X86_SGX); + enable_vmx = cpu_has(c, X86_FEATURE_VMX) && + IS_ENABLED(CONFIG_KVM_INTEL); + + if (cpu_has(c, X86_FEATURE_SGX) && IS_ENABLED(CONFIG_X86_SGX)) { + /* + * Separate out SGX driver enabling from KVM. This allows KVM + * guests to use SGX even if the kernel SGX driver refuses to + * use it. This happens if flexible Launch Control is not + * available. 
+ */ + enable_sgx_driver = cpu_has(c, X86_FEATURE_SGX_LC); + enable_sgx_kvm = enable_vmx && IS_ENABLED(CONFIG_X86_SGX_KVM); + } if (msr & FEAT_CTL_LOCKED) goto update_caps; @@ -142,15 +143,18 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c) * i.e. KVM is enabled, to avoid unnecessarily adding an attack vector * for the kernel, e.g. using VMX to hide malicious code. */ - if (cpu_has(c, X86_FEATURE_VMX) && IS_ENABLED(CONFIG_KVM_INTEL)) { + if (enable_vmx) { msr |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; if (tboot) msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX; } - if (enable_sgx) - msr |= FEAT_CTL_SGX_ENABLED | FEAT_CTL_SGX_LC_ENABLED; + if (enable_sgx_kvm || enable_sgx_driver) { + msr |= FEAT_CTL_SGX_ENABLED; + if (enable_sgx_driver) + msr |= FEAT_CTL_SGX_LC_ENABLED; + } wrmsrl(MSR_IA32_FEAT_CTL, msr); @@ -173,10 +177,29 @@ update_caps: } update_sgx: - if (!(msr & FEAT_CTL_SGX_ENABLED) || - !(msr & FEAT_CTL_SGX_LC_ENABLED) || !enable_sgx) { - if (enable_sgx) - pr_err_once("SGX disabled by BIOS\n"); - clear_sgx_caps(); + if (!(msr & FEAT_CTL_SGX_ENABLED)) { + if (enable_sgx_kvm || enable_sgx_driver) + pr_err_once("SGX disabled by BIOS.\n"); + clear_cpu_cap(c, X86_FEATURE_SGX); + return; + } + + /* + * VMX feature bit may be cleared due to being disabled in BIOS, + * in which case SGX virtualization cannot be supported either. + */ + if (!cpu_has(c, X86_FEATURE_VMX) && enable_sgx_kvm) { + pr_err_once("SGX virtualization disabled due to lack of VMX.\n"); + enable_sgx_kvm = 0; + } + + if (!(msr & FEAT_CTL_SGX_LC_ENABLED) && enable_sgx_driver) { + if (!enable_sgx_kvm) { + pr_err_once("SGX Launch Control is locked. Disable SGX.\n"); + clear_cpu_cap(c, X86_FEATURE_SGX); + } else { + pr_err_once("SGX Launch Control is locked. Support SGX virtualization only.\n"); + clear_cpu_cap(c, X86_FEATURE_SGX_LC); + } } } diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index ae59115d18f9..0bd6c74e3ba1 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -215,12 +215,12 @@ static void bsp_init_hygon(struct cpuinfo_x86 *c) u32 ecx; ecx = cpuid_ecx(0x8000001e); - nodes_per_socket = ((ecx >> 8) & 7) + 1; + __max_die_per_package = nodes_per_socket = ((ecx >> 8) & 7) + 1; } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { u64 value; rdmsrl(MSR_FAM10H_NODE_ID, value); - nodes_per_socket = ((value >> 3) & 7) + 1; + __max_die_per_package = nodes_per_socket = ((value >> 3) & 7) + 1; } if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) && diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 0e422a544835..8adffc17fa8b 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -44,9 +44,9 @@ enum split_lock_detect_state { }; /* - * Default to sld_off because most systems do not support split lock detection - * split_lock_setup() will switch this to sld_warn on systems that support - * split lock detect, unless there is a command line override. + * Default to sld_off because most systems do not support split lock detection. + * sld_state_setup() will switch this to sld_warn on systems that support + * split lock/bus lock detect, unless there is a command line override. 
*/ static enum split_lock_detect_state sld_state __ro_after_init = sld_off; static u64 msr_test_ctrl_cache __ro_after_init; @@ -301,7 +301,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) * The operating system must reload CR3 to cause the TLB to be flushed" * * As a result, boot_cpu_has(X86_FEATURE_PGE) in arch/x86/include/asm/tlbflush.h - * should be false so that __flush_tlb_all() causes CR3 insted of CR4.PGE + * should be false so that __flush_tlb_all() causes CR3 instead of CR4.PGE * to be modified. */ if (c->x86 == 5 && c->x86_model == 9) { @@ -603,6 +603,7 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c) } static void split_lock_init(void); +static void bus_lock_init(void); static void init_intel(struct cpuinfo_x86 *c) { @@ -720,6 +721,7 @@ static void init_intel(struct cpuinfo_x86 *c) tsx_disable(); split_lock_init(); + bus_lock_init(); intel_init_thermal(c); } @@ -1020,16 +1022,15 @@ static bool split_lock_verify_msr(bool on) return ctrl == tmp; } -static void __init split_lock_setup(void) +static void __init sld_state_setup(void) { enum split_lock_detect_state state = sld_warn; char arg[20]; int i, ret; - if (!split_lock_verify_msr(false)) { - pr_info("MSR access failed: Disabled\n"); + if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && + !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) return; - } ret = cmdline_find_option(boot_command_line, "split_lock_detect", arg, sizeof(arg)); @@ -1041,17 +1042,14 @@ static void __init split_lock_setup(void) } } } + sld_state = state; +} - switch (state) { - case sld_off: - pr_info("disabled\n"); +static void __init __split_lock_setup(void) +{ + if (!split_lock_verify_msr(false)) { + pr_info("MSR access failed: Disabled\n"); return; - case sld_warn: - pr_info("warning about user-space split_locks\n"); - break; - case sld_fatal: - pr_info("sending SIGBUS on user-space split_locks\n"); - break; } rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); @@ -1061,7 +1059,9 @@ static void __init split_lock_setup(void) return; } - sld_state = state; + /* Restore the MSR to its cached value. */ + wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT); } @@ -1118,6 +1118,29 @@ bool handle_guest_split_lock(unsigned long ip) } EXPORT_SYMBOL_GPL(handle_guest_split_lock); +static void bus_lock_init(void) +{ + u64 val; + + /* + * Warn and fatal are handled by #AC for split lock if #AC for + * split lock is supported. + */ + if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) || + (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && + (sld_state == sld_warn || sld_state == sld_fatal)) || + sld_state == sld_off) + return; + + /* + * Enable #DB for bus lock. All bus locks are handled in #DB except + * split locks are handled in #AC in the fatal case. + */ + rdmsrl(MSR_IA32_DEBUGCTLMSR, val); + val |= DEBUGCTLMSR_BUS_LOCK_DETECT; + wrmsrl(MSR_IA32_DEBUGCTLMSR, val); +} + bool handle_user_split_lock(struct pt_regs *regs, long error_code) { if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal) @@ -1126,6 +1149,21 @@ bool handle_user_split_lock(struct pt_regs *regs, long error_code) return true; } +void handle_bus_lock(struct pt_regs *regs) +{ + switch (sld_state) { + case sld_off: + break; + case sld_warn: + pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n", + current->comm, current->pid, regs->ip); + break; + case sld_fatal: + force_sig_fault(SIGBUS, BUS_ADRALN, NULL); + break; + } +} + /* * This function is called only when switching between tasks with * different split-lock detection modes. 
It sets the MSR for the @@ -1166,7 +1204,7 @@ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { {} }; -void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) +static void __init split_lock_setup(struct cpuinfo_x86 *c) { const struct x86_cpu_id *m; u64 ia32_core_caps; @@ -1193,5 +1231,56 @@ void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) } cpu_model_supports_sld = true; - split_lock_setup(); + __split_lock_setup(); +} + +static void sld_state_show(void) +{ + if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && + !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + return; + + switch (sld_state) { + case sld_off: + pr_info("disabled\n"); + break; + case sld_warn: + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n"); + else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + pr_info("#DB: warning on user-space bus_locks\n"); + break; + case sld_fatal: + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { + pr_info("#AC: crashing the kernel on kernel split_locks and sending SIGBUS on user-space split_locks\n"); + } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { + pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n", + boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ? + " from non-WB" : ""); + } + break; + } +} + +void __init sld_setup(struct cpuinfo_x86 *c) +{ + split_lock_setup(c); + sld_state_setup(); + sld_state_show(); +} + +#define X86_HYBRID_CPU_TYPE_ID_SHIFT 24 + +/** + * get_this_hybrid_cpu_type() - Get the type of this hybrid CPU + * + * Returns the CPU type [31:24] (i.e., Atom or Core) of a CPU in + * a hybrid processor. If the processor is not hybrid, returns 0. + */ +u8 get_this_hybrid_cpu_type(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) + return 0; + + return cpuid_eax(0x0000001a) >> X86_HYBRID_CPU_TYPE_ID_SHIFT; } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 7962355436da..bf7fe87a7e88 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -529,7 +529,7 @@ static void mce_irq_work_cb(struct irq_work *entry) * Check if the address reported by the CPU is in a format we can parse. * It would be possible to add code for most other cases, but all would * be somewhat complicated (e.g. segment offset would require an instruction - * parser). So only support physical addresses up to page granuality for now. + * parser). So only support physical addresses up to page granularity for now. 
*/ int mce_usable_address(struct mce *m) { diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 7b360731fc2d..4e86d97f9653 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -74,6 +74,7 @@ MCE_INJECT_SET(status); MCE_INJECT_SET(misc); MCE_INJECT_SET(addr); MCE_INJECT_SET(synd); +MCE_INJECT_SET(ipid); #define MCE_INJECT_GET(reg) \ static int inj_##reg##_get(void *data, u64 *val) \ @@ -88,11 +89,13 @@ MCE_INJECT_GET(status); MCE_INJECT_GET(misc); MCE_INJECT_GET(addr); MCE_INJECT_GET(synd); +MCE_INJECT_GET(ipid); DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(ipid_fops, inj_ipid_get, inj_ipid_set, "%llx\n"); static void setup_inj_struct(struct mce *m) { @@ -629,6 +632,8 @@ static const char readme_msg[] = "\t is present in hardware. \n" "\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n" "\t APIC interrupt handler to handle the error. \n" +"\n" +"ipid:\t IPID (AMD-specific)\n" "\n"; static ssize_t @@ -652,6 +657,7 @@ static struct dfs_node { { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "ipid", .fops = &ipid_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR }, diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index e309476743b7..acfd5d9f93c6 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -486,6 +486,7 @@ static void intel_ppin_init(struct cpuinfo_x86 *c) case INTEL_FAM6_BROADWELL_X: case INTEL_FAM6_SKYLAKE_X: case INTEL_FAM6_ICELAKE_X: + case INTEL_FAM6_SAPPHIRERAPIDS_X: case INTEL_FAM6_XEON_PHI_KNL: case INTEL_FAM6_XEON_PHI_KNM: diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 83df991314c5..17e631443116 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -142,7 +142,7 @@ static struct severity { MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR) ), MCESEV( - KEEP, "Non signalled machine check", + KEEP, "Non signaled machine check", SER, BITCLR(MCI_STATUS_S) ), @@ -218,15 +218,15 @@ static struct severity { static bool is_copy_from_user(struct pt_regs *regs) { u8 insn_buf[MAX_INSN_SIZE]; - struct insn insn; unsigned long addr; + struct insn insn; + int ret; if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE)) return false; - kernel_insn_init(&insn, insn_buf, MAX_INSN_SIZE); - insn_get_opcode(&insn); - if (!insn.opcode.got) + ret = insn_decode_kernel(&insn, insn_buf); + if (ret < 0) return false; switch (insn.opcode.value) { @@ -234,10 +234,6 @@ static bool is_copy_from_user(struct pt_regs *regs) case 0x8A: case 0x8B: /* MOVZ mem,reg */ case 0xB60F: case 0xB70F: - insn_get_modrm(&insn); - insn_get_sib(&insn); - if (!insn.modrm.got || !insn.sib.got) - return false; addr = (unsigned long)insn_get_addr_ref(&insn, regs); break; /* REP MOVS */ diff --git a/arch/x86/kernel/cpu/microcode/core.c 
b/arch/x86/kernel/cpu/microcode/core.c index b935e1b5f115..6a6318e9590c 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -629,16 +629,16 @@ static ssize_t reload_store(struct device *dev, if (val != 1) return size; - tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev, true); - if (tmp_ret != UCODE_NEW) - return size; - get_online_cpus(); ret = check_online_cpus(); if (ret) goto put; + tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev, true); + if (tmp_ret != UCODE_NEW) + goto put; + mutex_lock(&microcode_mutex); ret = microcode_reload_late(); mutex_unlock(&microcode_mutex); diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index e88bc296afca..22f13343b5da 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -60,23 +60,18 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback) set_irq_regs(old_regs); } -int hv_setup_vmbus_irq(int irq, void (*handler)(void)) +void hv_setup_vmbus_handler(void (*handler)(void)) { - /* - * The 'irq' argument is ignored on x86/x64 because a hard-coded - * interrupt vector is used for Hyper-V interrupts. - */ vmbus_handler = handler; - return 0; } +EXPORT_SYMBOL_GPL(hv_setup_vmbus_handler); -void hv_remove_vmbus_irq(void) +void hv_remove_vmbus_handler(void) { /* We have no way to deallocate the interrupt gate */ vmbus_handler = NULL; } -EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq); -EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq); +EXPORT_SYMBOL_GPL(hv_remove_vmbus_handler); /* * Routines to do per-architecture handling of stimer0 @@ -95,21 +90,17 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0) set_irq_regs(old_regs); } -int hv_setup_stimer0_irq(int *irq, int *vector, void (*handler)(void)) +/* For x86/x64, override weak placeholders in hyperv_timer.c */ +void hv_setup_stimer0_handler(void (*handler)(void)) { - *vector = HYPERV_STIMER0_VECTOR; - *irq = -1; /* Unused on x86/x64 */ hv_stimer0_handler = handler; - return 0; } -EXPORT_SYMBOL_GPL(hv_setup_stimer0_irq); -void hv_remove_stimer0_irq(int irq) +void hv_remove_stimer0_handler(void) { /* We have no way to deallocate the interrupt gate */ hv_stimer0_handler = NULL; } -EXPORT_SYMBOL_GPL(hv_remove_stimer0_irq); void hv_setup_kexec_handler(void (*handler)(void)) { @@ -197,7 +188,7 @@ static unsigned char hv_get_nmi_reason(void) #ifdef CONFIG_X86_LOCAL_APIC /* * Prior to WS2016 Debug-VM sends NMIs to all CPUs which makes - * it dificult to process CHANNELMSG_UNLOAD in case of crash. Handle + * it difficult to process CHANNELMSG_UNLOAD in case of crash. Handle * unknown NMI on the first CPU which gets it.
*/ static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs) @@ -274,12 +265,13 @@ static void __init ms_hyperv_init_platform(void) * Extract the features and hints */ ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES); - ms_hyperv.features_b = cpuid_ebx(HYPERV_CPUID_FEATURES); + ms_hyperv.priv_high = cpuid_ebx(HYPERV_CPUID_FEATURES); ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); - pr_info("Hyper-V: features 0x%x, hints 0x%x, misc 0x%x\n", - ms_hyperv.features, ms_hyperv.hints, ms_hyperv.misc_features); + pr_info("Hyper-V: privilege flags low 0x%x, high 0x%x, hints 0x%x, misc 0x%x\n", + ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints, + ms_hyperv.misc_features); ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS); ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS); @@ -325,7 +317,7 @@ static void __init ms_hyperv_init_platform(void) x86_platform.calibrate_cpu = hv_get_tsc_khz; } - if (ms_hyperv.features_b & HV_ISOLATION) { + if (ms_hyperv.priv_high & HV_ISOLATION) { ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG); ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG); @@ -428,7 +420,7 @@ static void __init ms_hyperv_init_platform(void) /* * Hyper-V doesn't provide irq remapping for IO-APIC. To enable x2apic, - * set x2apic destination mode to physcial mode when x2apic is available + * set x2apic destination mode to physical mode when x2apic is available * and Hyper-V IOMMU driver makes sure cpus assigned with IO-APIC irqs * have 8-bit APIC id. */ diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 9231640782fa..0c3b372318b7 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -434,7 +434,7 @@ set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, state->range_sizek = sizek - second_sizek; } -/* Mininum size of mtrr block that can take hole: */ +/* Minimum size of mtrr block that can take hole: */ static u64 mtrr_chunk_size __initdata = (256ULL<<20); static int __init parse_mtrr_chunk_size_opt(char *p) diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c index 28c8a23aa42e..a76694bffe86 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.c +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -799,7 +799,7 @@ void mtrr_ap_init(void) * * This routine is called in two cases: * - * 1. very earily time of software resume, when there absolutely + * 1. very early time of software resume, when there absolutely * isn't mtrr entry changes; * * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 698bb26aeb6e..23001ae03e82 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -192,7 +192,7 @@ static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid) * Intel(R) Xeon(R) CPU E5-2608L v3 @ 2.00GHz * Intel(R) Xeon(R) CPU E5-2658A v3 @ 2.20GHz * - * Probe by trying to write the first of the L3 cach mask registers + * Probe by trying to write the first of the L3 cache mask registers * and checking that the bits stick. Max CLOSids is always 4 and max cbm length * is always 20 on hsw server parts. The minimum cache bitmask length * allowed for HSW server is always 2 bits. Hardcode all of them. 
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 7ac31210e452..dbeaa8409313 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -387,7 +387,7 @@ void mon_event_count(void *info) * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so * that: * - * current bandwdith(cur_bw) < user specified bandwidth(user_bw) + * current bandwidth(cur_bw) < user specified bandwidth(user_bw) * * This uses the MBM counters to measure the bandwidth and MBA throttle * MSRs to control the bandwidth for a particular rdtgrp. It builds on the @@ -397,7 +397,7 @@ void mon_event_count(void *info) * timer. Having 1s interval makes the calculation of bandwidth simpler. * * Although MBA's goal is to restrict the bandwidth to a maximum, there may - * be a need to increase the bandwidth to avoid uncecessarily restricting + * be a need to increase the bandwidth to avoid unnecessarily restricting * the L2 <-> L3 traffic. * * Since MBA controls the L2 external bandwidth where as MBM measures the @@ -480,7 +480,7 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) /* * Delta values are updated dynamically package wise for each - * rdtgrp everytime the throttle MSR changes value. + * rdtgrp every time the throttle MSR changes value. * * This is because (1)the increase in bandwidth is not perfectly * linear and only "approximately" linear even when the hardware diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index e916646adc69..935af2ac6b1a 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -1307,7 +1307,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) * If the thread does not get on the CPU for whatever * reason and the process which sets up the region is * interrupted then this will leave the thread in runnable - * state and once it gets on the CPU it will derefence + * state and once it gets on the CPU it will dereference * the cleared, but not freed, plr struct resulting in an * empty pseudo-locking loop. */ @@ -1391,7 +1391,7 @@ out: * group is removed from user space via a "rmdir" from userspace or the * unmount of the resctrl filesystem. On removal the resource group does * not go back to pseudo-locksetup mode before it is removed, instead it is - * removed directly. There is thus assymmetry with the creation where the + * removed directly. There is thus asymmetry with the creation where the * &struct pseudo_lock_region is removed here while it was not created in * rdtgroup_pseudo_lock_create(). * diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index f9190adc52cb..01fd30e7829d 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * User interface for Resource Alloction in Resource Director Technology(RDT) + * User interface for Resource Allocation in Resource Director Technology(RDT) * * Copyright (C) 2016 Intel Corporation * @@ -294,7 +294,7 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, /* * This is safe against resctrl_sched_in() called from __switch_to() * because __switch_to() is executed with interrupts disabled. A local call - * from update_closid_rmid() is proteced against __switch_to() because + * from update_closid_rmid() is protected against __switch_to() because * preemption is disabled. 
*/ static void update_cpu_closid_rmid(void *info) @@ -2555,7 +2555,7 @@ static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, /* * This creates a directory mon_data which contains the monitored data. * - * mon_data has one directory for each domain whic are named + * mon_data has one directory for each domain which are named * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data * with L3 domain looks as below: * ./mon_data: diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 972ec3bfa9c0..21d1f062895a 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -36,6 +36,8 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 }, { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 }, { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, + { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 }, + { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 }, { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile index 91d3dc784a29..9c1656779b2a 100644 --- a/arch/x86/kernel/cpu/sgx/Makefile +++ b/arch/x86/kernel/cpu/sgx/Makefile @@ -3,3 +3,4 @@ obj-y += \ encl.o \ ioctl.o \ main.o +obj-$(CONFIG_X86_SGX_KVM) += virt.o diff --git a/arch/x86/kernel/cpu/sgx/arch.h b/arch/x86/kernel/cpu/sgx/arch.h deleted file mode 100644 index dd7602c44c72..000000000000 --- a/arch/x86/kernel/cpu/sgx/arch.h +++ /dev/null @@ -1,338 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/** - * Copyright(c) 2016-20 Intel Corporation. - * - * Contains data structures defined by the SGX architecture. Data structures - * defined by the Linux software stack should not be placed here. - */ -#ifndef _ASM_X86_SGX_ARCH_H -#define _ASM_X86_SGX_ARCH_H - -#include <linux/bits.h> -#include <linux/types.h> - -/* The SGX specific CPUID function. */ -#define SGX_CPUID 0x12 -/* EPC enumeration. */ -#define SGX_CPUID_EPC 2 -/* An invalid EPC section, i.e. the end marker. */ -#define SGX_CPUID_EPC_INVALID 0x0 -/* A valid EPC section. */ -#define SGX_CPUID_EPC_SECTION 0x1 -/* The bitmask for the EPC section type. */ -#define SGX_CPUID_EPC_MASK GENMASK(3, 0) - -/** - * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV - * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not - * been completed yet. - * %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's - * public key does not match IA32_SGXLEPUBKEYHASH. - * %SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received - */ -enum sgx_return_code { - SGX_NOT_TRACKED = 11, - SGX_INVALID_EINITTOKEN = 16, - SGX_UNMASKED_EVENT = 128, -}; - -/* The modulus size for 3072-bit RSA keys. */ -#define SGX_MODULUS_SIZE 384 - -/** - * enum sgx_miscselect - additional information to an SSA frame - * %SGX_MISC_EXINFO: Report #PF or #GP to the SSA frame. - * - * Save State Area (SSA) is a stack inside the enclave used to store processor - * state when an exception or interrupt occurs. This enum defines additional - * information stored to an SSA frame. 
- */ -enum sgx_miscselect { - SGX_MISC_EXINFO = BIT(0), -}; - -#define SGX_MISC_RESERVED_MASK GENMASK_ULL(63, 1) - -#define SGX_SSA_GPRS_SIZE 184 -#define SGX_SSA_MISC_EXINFO_SIZE 16 - -/** - * enum sgx_attributes - the attributes field in &struct sgx_secs - * %SGX_ATTR_INIT: Enclave can be entered (is initialized). - * %SGX_ATTR_DEBUG: Allow ENCLS(EDBGRD) and ENCLS(EDBGWR). - * %SGX_ATTR_MODE64BIT: Tell that this a 64-bit enclave. - * %SGX_ATTR_PROVISIONKEY: Allow to use provisioning keys for remote - * attestation. - * %SGX_ATTR_KSS: Allow to use key separation and sharing (KSS). - * %SGX_ATTR_EINITTOKENKEY: Allow to use token signing key that is used to - * sign cryptographic tokens that can be passed to - * EINIT as an authorization to run an enclave. - */ -enum sgx_attribute { - SGX_ATTR_INIT = BIT(0), - SGX_ATTR_DEBUG = BIT(1), - SGX_ATTR_MODE64BIT = BIT(2), - SGX_ATTR_PROVISIONKEY = BIT(4), - SGX_ATTR_EINITTOKENKEY = BIT(5), - SGX_ATTR_KSS = BIT(7), -}; - -#define SGX_ATTR_RESERVED_MASK (BIT_ULL(3) | BIT_ULL(6) | GENMASK_ULL(63, 8)) - -/** - * struct sgx_secs - SGX Enclave Control Structure (SECS) - * @size: size of the address space - * @base: base address of the address space - * @ssa_frame_size: size of an SSA frame - * @miscselect: additional information stored to an SSA frame - * @attributes: attributes for enclave - * @xfrm: XSave-Feature Request Mask (subset of XCR0) - * @mrenclave: SHA256-hash of the enclave contents - * @mrsigner: SHA256-hash of the public key used to sign the SIGSTRUCT - * @config_id: a user-defined value that is used in key derivation - * @isv_prod_id: a user-defined value that is used in key derivation - * @isv_svn: a user-defined value that is used in key derivation - * @config_svn: a user-defined value that is used in key derivation - * - * SGX Enclave Control Structure (SECS) is a special enclave page that is not - * visible in the address space. In fact, this structure defines the address - * range and other global attributes for the enclave and it is the first EPC - * page created for any enclave. It is moved from a temporary buffer to an EPC - * by the means of ENCLS[ECREATE] function. - */ -struct sgx_secs { - u64 size; - u64 base; - u32 ssa_frame_size; - u32 miscselect; - u8 reserved1[24]; - u64 attributes; - u64 xfrm; - u32 mrenclave[8]; - u8 reserved2[32]; - u32 mrsigner[8]; - u8 reserved3[32]; - u32 config_id[16]; - u16 isv_prod_id; - u16 isv_svn; - u16 config_svn; - u8 reserved4[3834]; -} __packed; - -/** - * enum sgx_tcs_flags - execution flags for TCS - * %SGX_TCS_DBGOPTIN: If enabled allows single-stepping and breakpoints - * inside an enclave. It is cleared by EADD but can - * be set later with EDBGWR. 
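The sgx_attributes bits listed above are what the driver's attribute masking works on: an enclave may only set attributes it has been granted, and SGX_ATTR_PROVISIONKEY is only added to the allowed mask through the provision ioctl shown later in this patch. A hedged sketch of such a check (illustrative, not the driver's exact code):

/* Illustrative check of requested vs. permitted enclave attributes. */
static int check_requested_attributes(u64 requested, u64 allowed_mask)
{
	if (requested & SGX_ATTR_RESERVED_MASK)
		return -EINVAL;		/* reserved bits must stay clear */
	if (requested & ~allowed_mask)
		return -EACCES;		/* e.g. PROVISIONKEY without the provision fd */
	return 0;
}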
- */ -enum sgx_tcs_flags { - SGX_TCS_DBGOPTIN = 0x01, -}; - -#define SGX_TCS_RESERVED_MASK GENMASK_ULL(63, 1) -#define SGX_TCS_RESERVED_SIZE 4024 - -/** - * struct sgx_tcs - Thread Control Structure (TCS) - * @state: used to mark an entered TCS - * @flags: execution flags (cleared by EADD) - * @ssa_offset: SSA stack offset relative to the enclave base - * @ssa_index: the current SSA frame index (cleard by EADD) - * @nr_ssa_frames: the number of frame in the SSA stack - * @entry_offset: entry point offset relative to the enclave base - * @exit_addr: address outside the enclave to exit on an exception or - * interrupt - * @fs_offset: offset relative to the enclave base to become FS - * segment inside the enclave - * @gs_offset: offset relative to the enclave base to become GS - * segment inside the enclave - * @fs_limit: size to become a new FS-limit (only 32-bit enclaves) - * @gs_limit: size to become a new GS-limit (only 32-bit enclaves) - * - * Thread Control Structure (TCS) is an enclave page visible in its address - * space that defines an entry point inside the enclave. A thread enters inside - * an enclave by supplying address of TCS to ENCLU(EENTER). A TCS can be entered - * by only one thread at a time. - */ -struct sgx_tcs { - u64 state; - u64 flags; - u64 ssa_offset; - u32 ssa_index; - u32 nr_ssa_frames; - u64 entry_offset; - u64 exit_addr; - u64 fs_offset; - u64 gs_offset; - u32 fs_limit; - u32 gs_limit; - u8 reserved[SGX_TCS_RESERVED_SIZE]; -} __packed; - -/** - * struct sgx_pageinfo - an enclave page descriptor - * @addr: address of the enclave page - * @contents: pointer to the page contents - * @metadata: pointer either to a SECINFO or PCMD instance - * @secs: address of the SECS page - */ -struct sgx_pageinfo { - u64 addr; - u64 contents; - u64 metadata; - u64 secs; -} __packed __aligned(32); - - -/** - * enum sgx_page_type - bits in the SECINFO flags defining the page type - * %SGX_PAGE_TYPE_SECS: a SECS page - * %SGX_PAGE_TYPE_TCS: a TCS page - * %SGX_PAGE_TYPE_REG: a regular page - * %SGX_PAGE_TYPE_VA: a VA page - * %SGX_PAGE_TYPE_TRIM: a page in trimmed state - */ -enum sgx_page_type { - SGX_PAGE_TYPE_SECS, - SGX_PAGE_TYPE_TCS, - SGX_PAGE_TYPE_REG, - SGX_PAGE_TYPE_VA, - SGX_PAGE_TYPE_TRIM, -}; - -#define SGX_NR_PAGE_TYPES 5 -#define SGX_PAGE_TYPE_MASK GENMASK(7, 0) - -/** - * enum sgx_secinfo_flags - the flags field in &struct sgx_secinfo - * %SGX_SECINFO_R: allow read - * %SGX_SECINFO_W: allow write - * %SGX_SECINFO_X: allow execution - * %SGX_SECINFO_SECS: a SECS page - * %SGX_SECINFO_TCS: a TCS page - * %SGX_SECINFO_REG: a regular page - * %SGX_SECINFO_VA: a VA page - * %SGX_SECINFO_TRIM: a page in trimmed state - */ -enum sgx_secinfo_flags { - SGX_SECINFO_R = BIT(0), - SGX_SECINFO_W = BIT(1), - SGX_SECINFO_X = BIT(2), - SGX_SECINFO_SECS = (SGX_PAGE_TYPE_SECS << 8), - SGX_SECINFO_TCS = (SGX_PAGE_TYPE_TCS << 8), - SGX_SECINFO_REG = (SGX_PAGE_TYPE_REG << 8), - SGX_SECINFO_VA = (SGX_PAGE_TYPE_VA << 8), - SGX_SECINFO_TRIM = (SGX_PAGE_TYPE_TRIM << 8), -}; - -#define SGX_SECINFO_PERMISSION_MASK GENMASK_ULL(2, 0) -#define SGX_SECINFO_PAGE_TYPE_MASK (SGX_PAGE_TYPE_MASK << 8) -#define SGX_SECINFO_RESERVED_MASK ~(SGX_SECINFO_PERMISSION_MASK | \ - SGX_SECINFO_PAGE_TYPE_MASK) - -/** - * struct sgx_secinfo - describes attributes of an EPC page - * @flags: permissions and type - * - * Used together with ENCLS leaves that add or modify an EPC page to an - * enclave to define page permissions and type. 
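Since the SECINFO flags word packs the permission bits (2:0) and the page type (15:8, i.e. the sgx_page_type value shifted left by 8) into one u64, composing it is a plain OR. A small illustration using only the definitions above:

/* Illustration only: SECINFO flags for a regular, read-write enclave page. */
static u64 example_secinfo_rw_flags(void)
{
	u64 flags = SGX_SECINFO_REG | SGX_SECINFO_R | SGX_SECINFO_W;

	/* Bits in SGX_SECINFO_RESERVED_MASK must remain zero. */
	return flags;
}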
- */ -struct sgx_secinfo { - u64 flags; - u8 reserved[56]; -} __packed __aligned(64); - -#define SGX_PCMD_RESERVED_SIZE 40 - -/** - * struct sgx_pcmd - Paging Crypto Metadata (PCMD) - * @enclave_id: enclave identifier - * @mac: MAC over PCMD, page contents and isvsvn - * - * PCMD is stored for every swapped page to the regular memory. When ELDU loads - * the page back it recalculates the MAC by using a isvsvn number stored in a - * VA page. Together these two structures bring integrity and rollback - * protection. - */ -struct sgx_pcmd { - struct sgx_secinfo secinfo; - u64 enclave_id; - u8 reserved[SGX_PCMD_RESERVED_SIZE]; - u8 mac[16]; -} __packed __aligned(128); - -#define SGX_SIGSTRUCT_RESERVED1_SIZE 84 -#define SGX_SIGSTRUCT_RESERVED2_SIZE 20 -#define SGX_SIGSTRUCT_RESERVED3_SIZE 32 -#define SGX_SIGSTRUCT_RESERVED4_SIZE 12 - -/** - * struct sgx_sigstruct_header - defines author of the enclave - * @header1: constant byte string - * @vendor: must be either 0x0000 or 0x8086 - * @date: YYYYMMDD in BCD - * @header2: costant byte string - * @swdefined: software defined value - */ -struct sgx_sigstruct_header { - u64 header1[2]; - u32 vendor; - u32 date; - u64 header2[2]; - u32 swdefined; - u8 reserved1[84]; -} __packed; - -/** - * struct sgx_sigstruct_body - defines contents of the enclave - * @miscselect: additional information stored to an SSA frame - * @misc_mask: required miscselect in SECS - * @attributes: attributes for enclave - * @xfrm: XSave-Feature Request Mask (subset of XCR0) - * @attributes_mask: required attributes in SECS - * @xfrm_mask: required XFRM in SECS - * @mrenclave: SHA256-hash of the enclave contents - * @isvprodid: a user-defined value that is used in key derivation - * @isvsvn: a user-defined value that is used in key derivation - */ -struct sgx_sigstruct_body { - u32 miscselect; - u32 misc_mask; - u8 reserved2[20]; - u64 attributes; - u64 xfrm; - u64 attributes_mask; - u64 xfrm_mask; - u8 mrenclave[32]; - u8 reserved3[32]; - u16 isvprodid; - u16 isvsvn; -} __packed; - -/** - * struct sgx_sigstruct - an enclave signature - * @header: defines author of the enclave - * @modulus: the modulus of the public key - * @exponent: the exponent of the public key - * @signature: the signature calculated over the fields except modulus, - * @body: defines contents of the enclave - * @q1: a value used in RSA signature verification - * @q2: a value used in RSA signature verification - * - * Header and body are the parts that are actual signed. The remaining fields - * define the signature of the enclave. 
- */ -struct sgx_sigstruct { - struct sgx_sigstruct_header header; - u8 modulus[SGX_MODULUS_SIZE]; - u32 exponent; - u8 signature[SGX_MODULUS_SIZE]; - struct sgx_sigstruct_body body; - u8 reserved4[12]; - u8 q1[SGX_MODULUS_SIZE]; - u8 q2[SGX_MODULUS_SIZE]; -} __packed; - -#define SGX_LAUNCH_TOKEN_SIZE 304 - -#endif /* _ASM_X86_SGX_ARCH_H */ diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c index 8ce6d8371cfb..aa9b8b868867 100644 --- a/arch/x86/kernel/cpu/sgx/driver.c +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -136,10 +136,6 @@ static const struct file_operations sgx_encl_fops = { .get_unmapped_area = sgx_get_unmapped_area, }; -const struct file_operations sgx_provision_fops = { - .owner = THIS_MODULE, -}; - static struct miscdevice sgx_dev_enclave = { .minor = MISC_DYNAMIC_MINOR, .name = "sgx_enclave", @@ -147,13 +143,6 @@ static struct miscdevice sgx_dev_enclave = { .fops = &sgx_encl_fops, }; -static struct miscdevice sgx_dev_provision = { - .minor = MISC_DYNAMIC_MINOR, - .name = "sgx_provision", - .nodename = "sgx_provision", - .fops = &sgx_provision_fops, -}; - int __init sgx_drv_init(void) { unsigned int eax, ebx, ecx, edx; @@ -187,11 +176,5 @@ int __init sgx_drv_init(void) if (ret) return ret; - ret = misc_register(&sgx_dev_provision); - if (ret) { - misc_deregister(&sgx_dev_enclave); - return ret; - } - return 0; } diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 7449ef33f081..3be203297988 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -7,7 +7,7 @@ #include <linux/shmem_fs.h> #include <linux/suspend.h> #include <linux/sched/mm.h> -#include "arch.h" +#include <asm/sgx.h> #include "encl.h" #include "encls.h" #include "sgx.h" @@ -78,7 +78,7 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, ret = __sgx_encl_eldu(encl_page, epc_page, secs_page); if (ret) { - sgx_free_epc_page(epc_page); + sgx_encl_free_epc_page(epc_page); return ERR_PTR(ret); } @@ -404,7 +404,7 @@ void sgx_encl_release(struct kref *ref) if (sgx_unmark_page_reclaimable(entry->epc_page)) continue; - sgx_free_epc_page(entry->epc_page); + sgx_encl_free_epc_page(entry->epc_page); encl->secs_child_cnt--; entry->epc_page = NULL; } @@ -415,7 +415,7 @@ void sgx_encl_release(struct kref *ref) xa_destroy(&encl->page_array); if (!encl->secs_child_cnt && encl->secs.epc_page) { - sgx_free_epc_page(encl->secs.epc_page); + sgx_encl_free_epc_page(encl->secs.epc_page); encl->secs.epc_page = NULL; } @@ -423,7 +423,7 @@ void sgx_encl_release(struct kref *ref) va_page = list_first_entry(&encl->va_pages, struct sgx_va_page, list); list_del(&va_page->list); - sgx_free_epc_page(va_page->epc_page); + sgx_encl_free_epc_page(va_page->epc_page); kfree(va_page); } @@ -686,7 +686,7 @@ struct sgx_epc_page *sgx_alloc_va_page(void) ret = __epa(sgx_get_epc_virt_addr(epc_page)); if (ret) { WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret); - sgx_free_epc_page(epc_page); + sgx_encl_free_epc_page(epc_page); return ERR_PTR(-EFAULT); } @@ -735,3 +735,24 @@ bool sgx_va_page_full(struct sgx_va_page *va_page) return slot == SGX_VA_SLOT_COUNT; } + +/** + * sgx_encl_free_epc_page - free an EPC page assigned to an enclave + * @page: EPC page to be freed + * + * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and + * only upon success, it puts the page back to free page list. Otherwise, it + * gives a WARNING to indicate page is leaked. 
+ */ +void sgx_encl_free_epc_page(struct sgx_epc_page *page) +{ + int ret; + + WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED); + + ret = __eremove(sgx_get_epc_virt_addr(page)); + if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret)) + return; + + sgx_free_epc_page(page); +} diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index d8d30ccbef4c..6e74f85b6264 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -115,5 +115,6 @@ struct sgx_epc_page *sgx_alloc_va_page(void); unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset); bool sgx_va_page_full(struct sgx_va_page *va_page); +void sgx_encl_free_epc_page(struct sgx_epc_page *page); #endif /* _X86_ENCL_H */ diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h index 443188fe7e70..9b204843b78d 100644 --- a/arch/x86/kernel/cpu/sgx/encls.h +++ b/arch/x86/kernel/cpu/sgx/encls.h @@ -11,21 +11,6 @@ #include <asm/traps.h> #include "sgx.h" -enum sgx_encls_function { - ECREATE = 0x00, - EADD = 0x01, - EINIT = 0x02, - EREMOVE = 0x03, - EDGBRD = 0x04, - EDGBWR = 0x05, - EEXTEND = 0x06, - ELDU = 0x08, - EBLOCK = 0x09, - EPA = 0x0A, - EWB = 0x0B, - ETRACK = 0x0C, -}; - /** * ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr * @@ -55,6 +40,19 @@ enum sgx_encls_function { } while (0); \ } +/* + * encls_faulted() - Check if an ENCLS leaf faulted given an error code + * @ret: the return value of an ENCLS leaf function call + * + * Return: + * - true: ENCLS leaf faulted. + * - false: Otherwise. + */ +static inline bool encls_faulted(int ret) +{ + return ret & ENCLS_FAULT_FLAG; +} + /** * encls_failed() - Check if an ENCLS function failed * @ret: the return value of an ENCLS function call @@ -65,7 +63,7 @@ enum sgx_encls_function { */ static inline bool encls_failed(int ret) { - if (ret & ENCLS_FAULT_FLAG) + if (encls_faulted(ret)) return ENCLS_TRAPNR(ret) != X86_TRAP_PF; return !!ret; diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 90a5caf76939..83df20e3e633 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -2,6 +2,7 @@ /* Copyright(c) 2016-20 Intel Corporation. 
*/ #include <asm/mman.h> +#include <asm/sgx.h> #include <linux/mman.h> #include <linux/delay.h> #include <linux/file.h> @@ -47,7 +48,7 @@ static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page) encl->page_cnt--; if (va_page) { - sgx_free_epc_page(va_page->epc_page); + sgx_encl_free_epc_page(va_page->epc_page); list_del(&va_page->list); kfree(va_page); } @@ -117,7 +118,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) return 0; err_out: - sgx_free_epc_page(encl->secs.epc_page); + sgx_encl_free_epc_page(encl->secs.epc_page); encl->secs.epc_page = NULL; err_out_backing: @@ -365,7 +366,7 @@ err_out_unlock: mmap_read_unlock(current->mm); err_out_free: - sgx_free_epc_page(epc_page); + sgx_encl_free_epc_page(epc_page); kfree(encl_page); return ret; @@ -495,7 +496,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, void *token) { u64 mrsigner[4]; - int i, j, k; + int i, j; void *addr; int ret; @@ -544,8 +545,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, preempt_disable(); - for (k = 0; k < 4; k++) - wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + k, mrsigner[k]); + sgx_update_lepubkeyhash(mrsigner); ret = __einit(sigstruct, token, addr); @@ -568,7 +568,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, } } - if (ret & ENCLS_FAULT_FLAG) { + if (encls_faulted(ret)) { if (encls_failed(ret)) ENCLS_WARN(ret, "EINIT"); @@ -604,7 +604,6 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg) { struct sgx_sigstruct *sigstruct; struct sgx_enclave_init init_arg; - struct page *initp_page; void *token; int ret; @@ -615,11 +614,15 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg) if (copy_from_user(&init_arg, arg, sizeof(init_arg))) return -EFAULT; - initp_page = alloc_page(GFP_KERNEL); - if (!initp_page) + /* + * 'sigstruct' must be on a page boundary and 'token' on a 512 byte + * boundary. kmalloc() will give this alignment when allocating + * PAGE_SIZE bytes. + */ + sigstruct = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!sigstruct) return -ENOMEM; - sigstruct = kmap(initp_page); token = (void *)((unsigned long)sigstruct + PAGE_SIZE / 2); memset(token, 0, SGX_LAUNCH_TOKEN_SIZE); @@ -645,8 +648,7 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg) ret = sgx_encl_init(encl, sigstruct, token); out: - kunmap(initp_page); - __free_page(initp_page); + kfree(sigstruct); return ret; } @@ -665,24 +667,11 @@ out: static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg) { struct sgx_enclave_provision params; - struct file *file; if (copy_from_user(¶ms, arg, sizeof(params))) return -EFAULT; - file = fget(params.fd); - if (!file) - return -EINVAL; - - if (file->f_op != &sgx_provision_fops) { - fput(file); - return -EINVAL; - } - - encl->attributes_mask |= SGX_ATTR_PROVISIONKEY; - - fput(file); - return 0; + return sgx_set_attribute(&encl->attributes_mask, params.fd); } long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 8df81a3ed945..63d3de02bbcc 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -1,14 +1,17 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2016-20 Intel Corporation. 
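The sgx_ioc_enclave_init() hunk above replaces the alloc_page()/kmap() pair with one PAGE_SIZE kmalloc() that carries both EINIT inputs: the SIGSTRUCT at the page-aligned start and the launch token at the half-page mark. A sketch of the resulting layout (assuming 4K pages; it only restates what the hunk relies on):

/*
 * sigstruct = kmalloc(PAGE_SIZE, GFP_KERNEL);	- page-aligned for this size
 *
 *   offset 0            : struct sgx_sigstruct    (must be page aligned)
 *   offset PAGE_SIZE/2  : EINIT token, SGX_LAUNCH_TOKEN_SIZE (304) bytes,
 *                         zeroed; 2048 satisfies the 512-byte alignment
 */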
*/ +#include <linux/file.h> #include <linux/freezer.h> #include <linux/highmem.h> #include <linux/kthread.h> +#include <linux/miscdevice.h> #include <linux/pagemap.h> #include <linux/ratelimit.h> #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/slab.h> +#include <asm/sgx.h> #include "driver.h" #include "encl.h" #include "encls.h" @@ -23,42 +26,58 @@ static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq); * with sgx_reclaimer_lock acquired. */ static LIST_HEAD(sgx_active_page_list); - static DEFINE_SPINLOCK(sgx_reclaimer_lock); +/* The free page list lock protected variables prepend the lock. */ +static unsigned long sgx_nr_free_pages; + +/* Nodes with one or more EPC sections. */ +static nodemask_t sgx_numa_mask; + +/* + * Array with one list_head for each possible NUMA node. Each + * list contains all the sgx_epc_section's which are on that + * node. + */ +static struct sgx_numa_node *sgx_numa_nodes; + +static LIST_HEAD(sgx_dirty_page_list); + /* - * Reset dirty EPC pages to uninitialized state. Laundry can be left with SECS - * pages whose child pages blocked EREMOVE. + * Reset post-kexec EPC pages to the uninitialized state. The pages are removed + * from the input list, and made available for the page allocator. SECS pages + * prepending their children in the input list are left intact. */ -static void sgx_sanitize_section(struct sgx_epc_section *section) +static void __sgx_sanitize_pages(struct list_head *dirty_page_list) { struct sgx_epc_page *page; LIST_HEAD(dirty); int ret; - /* init_laundry_list is thread-local, no need for a lock: */ - while (!list_empty(§ion->init_laundry_list)) { + /* dirty_page_list is thread-local, no need for a lock: */ + while (!list_empty(dirty_page_list)) { if (kthread_should_stop()) return; - /* needed for access to ->page_list: */ - spin_lock(§ion->lock); - - page = list_first_entry(§ion->init_laundry_list, - struct sgx_epc_page, list); + page = list_first_entry(dirty_page_list, struct sgx_epc_page, list); ret = __eremove(sgx_get_epc_virt_addr(page)); - if (!ret) - list_move(&page->list, §ion->page_list); - else + if (!ret) { + /* + * page is now sanitized. Make it available via the SGX + * page allocator: + */ + list_del(&page->list); + sgx_free_epc_page(page); + } else { + /* The page is not yet clean - move to the dirty list. */ list_move_tail(&page->list, &dirty); - - spin_unlock(§ion->lock); + } cond_resched(); } - list_splice(&dirty, §ion->init_laundry_list); + list_splice(&dirty, dirty_page_list); } static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page) @@ -195,10 +214,10 @@ static const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl) /* * Swap page to the regular memory transformed to the blocked state by using - * EBLOCK, which means that it can no loger be referenced (no new TLB entries). + * EBLOCK, which means that it can no longer be referenced (no new TLB entries). * * The first trial just tries to write the page assuming that some other thread - * has reset the count for threads inside the enlave by using ETRACK, and + * has reset the count for threads inside the enclave by using ETRACK, and * previous thread count has been zeroed out. The second trial calls ETRACK * before EWB. If that fails we kick all the HW threads out, and then do EWB, * which should be guaranteed the succeed. 
@@ -278,7 +297,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, sgx_encl_ewb(encl->secs.epc_page, &secs_backing); - sgx_free_epc_page(encl->secs.epc_page); + sgx_encl_free_epc_page(encl->secs.epc_page); encl->secs.epc_page = NULL; sgx_encl_put_backing(&secs_backing, true); @@ -308,6 +327,7 @@ static void sgx_reclaim_pages(void) struct sgx_epc_section *section; struct sgx_encl_page *encl_page; struct sgx_epc_page *epc_page; + struct sgx_numa_node *node; pgoff_t page_index; int cnt = 0; int ret; @@ -379,50 +399,33 @@ skip: epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; section = &sgx_epc_sections[epc_page->section]; - spin_lock(§ion->lock); - list_add_tail(&epc_page->list, §ion->page_list); - section->free_cnt++; - spin_unlock(§ion->lock); - } -} - -static unsigned long sgx_nr_free_pages(void) -{ - unsigned long cnt = 0; - int i; - - for (i = 0; i < sgx_nr_epc_sections; i++) - cnt += sgx_epc_sections[i].free_cnt; + node = section->node; - return cnt; + spin_lock(&node->lock); + list_add_tail(&epc_page->list, &node->free_page_list); + sgx_nr_free_pages++; + spin_unlock(&node->lock); + } } static bool sgx_should_reclaim(unsigned long watermark) { - return sgx_nr_free_pages() < watermark && - !list_empty(&sgx_active_page_list); + return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list); } static int ksgxd(void *p) { - int i; - set_freezable(); /* * Sanitize pages in order to recover from kexec(). The 2nd pass is * required for SECS pages, whose child pages blocked EREMOVE. */ - for (i = 0; i < sgx_nr_epc_sections; i++) - sgx_sanitize_section(&sgx_epc_sections[i]); - - for (i = 0; i < sgx_nr_epc_sections; i++) { - sgx_sanitize_section(&sgx_epc_sections[i]); + __sgx_sanitize_pages(&sgx_dirty_page_list); + __sgx_sanitize_pages(&sgx_dirty_page_list); - /* Should never happen. */ - if (!list_empty(&sgx_epc_sections[i].init_laundry_list)) - WARN(1, "EPC section %d has unsanitized pages.\n", i); - } + /* sanity check: */ + WARN_ON(!list_empty(&sgx_dirty_page_list)); while (!kthread_should_stop()) { if (try_to_freeze()) @@ -454,45 +457,56 @@ static bool __init sgx_page_reclaimer_init(void) return true; } -static struct sgx_epc_page *__sgx_alloc_epc_page_from_section(struct sgx_epc_section *section) +static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid) { - struct sgx_epc_page *page; + struct sgx_numa_node *node = &sgx_numa_nodes[nid]; + struct sgx_epc_page *page = NULL; - spin_lock(§ion->lock); + spin_lock(&node->lock); - if (list_empty(§ion->page_list)) { - spin_unlock(§ion->lock); + if (list_empty(&node->free_page_list)) { + spin_unlock(&node->lock); return NULL; } - page = list_first_entry(§ion->page_list, struct sgx_epc_page, list); + page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list); list_del_init(&page->list); - section->free_cnt--; + sgx_nr_free_pages--; + + spin_unlock(&node->lock); - spin_unlock(§ion->lock); return page; } /** * __sgx_alloc_epc_page() - Allocate an EPC page * - * Iterate through EPC sections and borrow a free EPC page to the caller. When a - * page is no longer needed it must be released with sgx_free_epc_page(). + * Iterate through NUMA nodes and reserve ia free EPC page to the caller. Start + * from the NUMA node, where the caller is executing. * * Return: - * an EPC page, - * -errno on error + * - an EPC page: A borrowed EPC pages were available. + * - NULL: Out of EPC pages. 
*/ struct sgx_epc_page *__sgx_alloc_epc_page(void) { - struct sgx_epc_section *section; struct sgx_epc_page *page; - int i; + int nid_of_current = numa_node_id(); + int nid = nid_of_current; - for (i = 0; i < sgx_nr_epc_sections; i++) { - section = &sgx_epc_sections[i]; + if (node_isset(nid_of_current, sgx_numa_mask)) { + page = __sgx_alloc_epc_page_from_node(nid_of_current); + if (page) + return page; + } + + /* Fall back to the non-local NUMA nodes: */ + while (true) { + nid = next_node_in(nid, sgx_numa_mask); + if (nid == nid_of_current) + break; - page = __sgx_alloc_epc_page_from_section(section); + page = __sgx_alloc_epc_page_from_node(nid); if (page) return page; } @@ -598,23 +612,22 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim) * sgx_free_epc_page() - Free an EPC page * @page: an EPC page * - * Call EREMOVE for an EPC page and insert it back to the list of free pages. + * Put the EPC page back to the list of free pages. It's the caller's + * responsibility to make sure that the page is in uninitialized state. In other + * words, do EREMOVE, EWB or whatever operation is necessary before calling + * this function. */ void sgx_free_epc_page(struct sgx_epc_page *page) { struct sgx_epc_section *section = &sgx_epc_sections[page->section]; - int ret; + struct sgx_numa_node *node = section->node; - WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED); + spin_lock(&node->lock); - ret = __eremove(sgx_get_epc_virt_addr(page)); - if (WARN_ONCE(ret, "EREMOVE returned %d (0x%x)", ret, ret)) - return; + list_add_tail(&page->list, &node->free_page_list); + sgx_nr_free_pages++; - spin_lock(§ion->lock); - list_add_tail(&page->list, §ion->page_list); - section->free_cnt++; - spin_unlock(§ion->lock); + spin_unlock(&node->lock); } static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, @@ -635,18 +648,14 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, } section->phys_addr = phys_addr; - spin_lock_init(§ion->lock); - INIT_LIST_HEAD(§ion->page_list); - INIT_LIST_HEAD(§ion->init_laundry_list); for (i = 0; i < nr_pages; i++) { section->pages[i].section = index; section->pages[i].flags = 0; section->pages[i].owner = NULL; - list_add_tail(§ion->pages[i].list, §ion->init_laundry_list); + list_add_tail(§ion->pages[i].list, &sgx_dirty_page_list); } - section->free_cnt = nr_pages; return true; } @@ -665,8 +674,13 @@ static bool __init sgx_page_cache_init(void) { u32 eax, ebx, ecx, edx, type; u64 pa, size; + int nid; int i; + sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL); + if (!sgx_numa_nodes) + return false; + for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) { cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx); @@ -689,6 +703,21 @@ static bool __init sgx_page_cache_init(void) break; } + nid = numa_map_to_online_node(phys_to_target_node(pa)); + if (nid == NUMA_NO_NODE) { + /* The physical address is already printed above. */ + pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n"); + nid = 0; + } + + if (!node_isset(nid, sgx_numa_mask)) { + spin_lock_init(&sgx_numa_nodes[nid].lock); + INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list); + node_set(nid, sgx_numa_mask); + } + + sgx_epc_sections[i].node = &sgx_numa_nodes[nid]; + sgx_nr_epc_sections++; } @@ -700,6 +729,67 @@ static bool __init sgx_page_cache_init(void) return true; } +/* + * Update the SGX_LEPUBKEYHASH MSRs to the values specified by caller. 
+ * Bare-metal driver requires to update them to hash of enclave's signer + * before EINIT. KVM needs to update them to guest's virtual MSR values + * before doing EINIT from guest. + */ +void sgx_update_lepubkeyhash(u64 *lepubkeyhash) +{ + int i; + + WARN_ON_ONCE(preemptible()); + + for (i = 0; i < 4; i++) + wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]); +} + +const struct file_operations sgx_provision_fops = { + .owner = THIS_MODULE, +}; + +static struct miscdevice sgx_dev_provision = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sgx_provision", + .nodename = "sgx_provision", + .fops = &sgx_provision_fops, +}; + +/** + * sgx_set_attribute() - Update allowed attributes given file descriptor + * @allowed_attributes: Pointer to allowed enclave attributes + * @attribute_fd: File descriptor for specific attribute + * + * Append enclave attribute indicated by file descriptor to allowed + * attributes. Currently only SGX_ATTR_PROVISIONKEY indicated by + * /dev/sgx_provision is supported. + * + * Return: + * -0: SGX_ATTR_PROVISIONKEY is appended to allowed_attributes + * -EINVAL: Invalid, or not supported file descriptor + */ +int sgx_set_attribute(unsigned long *allowed_attributes, + unsigned int attribute_fd) +{ + struct file *file; + + file = fget(attribute_fd); + if (!file) + return -EINVAL; + + if (file->f_op != &sgx_provision_fops) { + fput(file); + return -EINVAL; + } + + *allowed_attributes |= SGX_ATTR_PROVISIONKEY; + + fput(file); + return 0; +} +EXPORT_SYMBOL_GPL(sgx_set_attribute); + static int __init sgx_init(void) { int ret; @@ -716,12 +806,28 @@ static int __init sgx_init(void) goto err_page_cache; } - ret = sgx_drv_init(); + ret = misc_register(&sgx_dev_provision); if (ret) goto err_kthread; + /* + * Always try to initialize the native *and* KVM drivers. + * The KVM driver is less picky than the native one and + * can function if the native one is not supported on the + * current system or fails to initialize. + * + * Error out only if both fail to initialize. + */ + ret = sgx_drv_init(); + + if (sgx_vepc_init() && ret) + goto err_provision; + return 0; +err_provision: + misc_deregister(&sgx_dev_provision); + err_kthread: kthread_stop(ksgxd_tsk); diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 5fa42d143feb..4628acec0009 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -8,11 +8,15 @@ #include <linux/rwsem.h> #include <linux/types.h> #include <asm/asm.h> -#include "arch.h" +#include <asm/sgx.h> #undef pr_fmt #define pr_fmt(fmt) "sgx: " fmt +#define EREMOVE_ERROR_MESSAGE \ + "EREMOVE returned %d (0x%x) and an EPC page was leaked. SGX may become unusable. " \ + "Refer to Documentation/x86/sgx.rst for more information." + #define SGX_MAX_EPC_SECTIONS 8 #define SGX_EEXTEND_BLOCK_SIZE 256 #define SGX_NR_TO_SCAN 16 @@ -30,28 +34,25 @@ struct sgx_epc_page { }; /* + * Contains the tracking data for NUMA nodes having EPC pages. Most importantly, + * the free page list local to the node is stored here. + */ +struct sgx_numa_node { + struct list_head free_page_list; + spinlock_t lock; +}; + +/* * The firmware can define multiple chunks of EPC to the different areas of the * physical memory e.g. for memory areas of the each node. This structure is * used to store EPC pages for one EPC section and virtual memory area where * the pages have been mapped. - * - * 'lock' must be held before accessing 'page_list' or 'free_cnt'. 
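sgx_set_attribute() above turns an open /dev/sgx_provision file descriptor into a capability: userspace proves it may request the provisioning key by passing that fd through the provision ioctl on the enclave. A hedged userspace sketch (the ioctl and struct names follow the SGX UAPI and are assumptions here, not quoted from this patch):

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <asm/sgx.h>	/* struct sgx_enclave_provision, SGX_IOC_ENCLAVE_PROVISION */

/* enclave_fd: an already-open /dev/sgx_enclave fd for the enclave being built. */
static int allow_provision_key(int enclave_fd)
{
	struct sgx_enclave_provision params = {};
	int ret, provision_fd;

	provision_fd = open("/dev/sgx_provision", O_RDWR);
	if (provision_fd < 0)
		return -1;

	params.fd = provision_fd;
	ret = ioctl(enclave_fd, SGX_IOC_ENCLAVE_PROVISION, &params);
	close(provision_fd);	/* the driver only fget()/fput()s the fd */
	return ret;
}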
*/ struct sgx_epc_section { unsigned long phys_addr; void *virt_addr; struct sgx_epc_page *pages; - - spinlock_t lock; - struct list_head page_list; - unsigned long free_cnt; - - /* - * Pages which need EREMOVE run on them before they can be - * used. Only safe to be accessed in ksgxd and init code. - * Not protected by locks. - */ - struct list_head init_laundry_list; + struct sgx_numa_node *node; }; extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS]; @@ -83,4 +84,15 @@ void sgx_mark_page_reclaimable(struct sgx_epc_page *page); int sgx_unmark_page_reclaimable(struct sgx_epc_page *page); struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim); +#ifdef CONFIG_X86_SGX_KVM +int __init sgx_vepc_init(void); +#else +static inline int __init sgx_vepc_init(void) +{ + return -ENODEV; +} +#endif + +void sgx_update_lepubkeyhash(u64 *lepubkeyhash); + #endif /* _X86_SGX_H */ diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c new file mode 100644 index 000000000000..6ad165a5c0cc --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -0,0 +1,376 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Device driver to expose SGX enclave memory to KVM guests. + * + * Copyright(c) 2021 Intel Corporation. + */ + +#include <linux/miscdevice.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/sched/mm.h> +#include <linux/sched/signal.h> +#include <linux/slab.h> +#include <linux/xarray.h> +#include <asm/sgx.h> +#include <uapi/asm/sgx.h> + +#include "encls.h" +#include "sgx.h" + +struct sgx_vepc { + struct xarray page_array; + struct mutex lock; +}; + +/* + * Temporary SECS pages that cannot be EREMOVE'd due to having child in other + * virtual EPC instances, and the lock to protect it. + */ +static struct mutex zombie_secs_pages_lock; +static struct list_head zombie_secs_pages; + +static int __sgx_vepc_fault(struct sgx_vepc *vepc, + struct vm_area_struct *vma, unsigned long addr) +{ + struct sgx_epc_page *epc_page; + unsigned long index, pfn; + int ret; + + WARN_ON(!mutex_is_locked(&vepc->lock)); + + /* Calculate index of EPC page in virtual EPC's page_array */ + index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start); + + epc_page = xa_load(&vepc->page_array, index); + if (epc_page) + return 0; + + epc_page = sgx_alloc_epc_page(vepc, false); + if (IS_ERR(epc_page)) + return PTR_ERR(epc_page); + + ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL)); + if (ret) + goto err_free; + + pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page)); + + ret = vmf_insert_pfn(vma, addr, pfn); + if (ret != VM_FAULT_NOPAGE) { + ret = -EFAULT; + goto err_delete; + } + + return 0; + +err_delete: + xa_erase(&vepc->page_array, index); +err_free: + sgx_free_epc_page(epc_page); + return ret; +} + +static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct sgx_vepc *vepc = vma->vm_private_data; + int ret; + + mutex_lock(&vepc->lock); + ret = __sgx_vepc_fault(vepc, vma, vmf->address); + mutex_unlock(&vepc->lock); + + if (!ret) + return VM_FAULT_NOPAGE; + + if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) { + mmap_read_unlock(vma->vm_mm); + return VM_FAULT_RETRY; + } + + return VM_FAULT_SIGBUS; +} + +static const struct vm_operations_struct sgx_vepc_vm_ops = { + .fault = sgx_vepc_fault, +}; + +static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct sgx_vepc *vepc = file->private_data; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + vma->vm_ops = &sgx_vepc_vm_ops; + /* 
Don't copy VMA in fork() */ + vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY; + vma->vm_private_data = vepc; + + return 0; +} + +static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) +{ + int ret; + + /* + * Take a previously guest-owned EPC page and return it to the + * general EPC page pool. + * + * Guests can not be trusted to have left this page in a good + * state, so run EREMOVE on the page unconditionally. In the + * case that a guest properly EREMOVE'd this page, a superfluous + * EREMOVE is harmless. + */ + ret = __eremove(sgx_get_epc_virt_addr(epc_page)); + if (ret) { + /* + * Only SGX_CHILD_PRESENT is expected, which is because of + * EREMOVE'ing an SECS still with child, in which case it can + * be handled by EREMOVE'ing the SECS again after all pages in + * virtual EPC have been EREMOVE'd. See comments in below in + * sgx_vepc_release(). + * + * The user of virtual EPC (KVM) needs to guarantee there's no + * logical processor is still running in the enclave in guest, + * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be + * handled here. + */ + WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE, + ret, ret); + return ret; + } + + sgx_free_epc_page(epc_page); + + return 0; +} + +static int sgx_vepc_release(struct inode *inode, struct file *file) +{ + struct sgx_vepc *vepc = file->private_data; + struct sgx_epc_page *epc_page, *tmp, *entry; + unsigned long index; + + LIST_HEAD(secs_pages); + + xa_for_each(&vepc->page_array, index, entry) { + /* + * Remove all normal, child pages. sgx_vepc_free_page() + * will fail if EREMOVE fails, but this is OK and expected on + * SECS pages. Those can only be EREMOVE'd *after* all their + * child pages. Retries below will clean them up. + */ + if (sgx_vepc_free_page(entry)) + continue; + + xa_erase(&vepc->page_array, index); + } + + /* + * Retry EREMOVE'ing pages. This will clean up any SECS pages that + * only had children in this 'epc' area. + */ + xa_for_each(&vepc->page_array, index, entry) { + epc_page = entry; + /* + * An EREMOVE failure here means that the SECS page still + * has children. But, since all children in this 'sgx_vepc' + * have been removed, the SECS page must have a child on + * another instance. + */ + if (sgx_vepc_free_page(epc_page)) + list_add_tail(&epc_page->list, &secs_pages); + + xa_erase(&vepc->page_array, index); + } + + /* + * SECS pages are "pinned" by child pages, and "unpinned" once all + * children have been EREMOVE'd. A child page in this instance + * may have pinned an SECS page encountered in an earlier release(), + * creating a zombie. Since some children were EREMOVE'd above, + * try to EREMOVE all zombies in the hopes that one was unpinned. + */ + mutex_lock(&zombie_secs_pages_lock); + list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) { + /* + * Speculatively remove the page from the list of zombies, + * if the page is successfully EREMOVE'd it will be added to + * the list of free pages. If EREMOVE fails, throw the page + * on the local list, which will be spliced on at the end. 
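From a VMM's point of view the new device is deliberately simple: open /dev/sgx_vepc, mmap() a MAP_SHARED region, and let sgx_vepc_fault() back it with EPC pages on demand. A hedged userspace sketch (size handling and error reporting are placeholders):

#include <fcntl.h>
#include <sys/mman.h>

/* Back 'size' bytes of guest EPC with one virtual EPC instance. */
static void *map_guest_epc(size_t size)
{
	void *epc;
	int fd = open("/dev/sgx_vepc", O_RDWR);

	if (fd < 0)
		return NULL;

	/* Must be MAP_SHARED: sgx_vepc_mmap() rejects private mappings. */
	epc = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	return epc == MAP_FAILED ? NULL : epc;
}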
+ */ + list_del(&epc_page->list); + + if (sgx_vepc_free_page(epc_page)) + list_add_tail(&epc_page->list, &secs_pages); + } + + if (!list_empty(&secs_pages)) + list_splice_tail(&secs_pages, &zombie_secs_pages); + mutex_unlock(&zombie_secs_pages_lock); + + kfree(vepc); + + return 0; +} + +static int sgx_vepc_open(struct inode *inode, struct file *file) +{ + struct sgx_vepc *vepc; + + vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL); + if (!vepc) + return -ENOMEM; + mutex_init(&vepc->lock); + xa_init(&vepc->page_array); + + file->private_data = vepc; + + return 0; +} + +static const struct file_operations sgx_vepc_fops = { + .owner = THIS_MODULE, + .open = sgx_vepc_open, + .release = sgx_vepc_release, + .mmap = sgx_vepc_mmap, +}; + +static struct miscdevice sgx_vepc_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sgx_vepc", + .nodename = "sgx_vepc", + .fops = &sgx_vepc_fops, +}; + +int __init sgx_vepc_init(void) +{ + /* SGX virtualization requires KVM to work */ + if (!cpu_feature_enabled(X86_FEATURE_VMX)) + return -ENODEV; + + INIT_LIST_HEAD(&zombie_secs_pages); + mutex_init(&zombie_secs_pages_lock); + + return misc_register(&sgx_vepc_dev); +} + +/** + * sgx_virt_ecreate() - Run ECREATE on behalf of guest + * @pageinfo: Pointer to PAGEINFO structure + * @secs: Userspace pointer to SECS page + * @trapnr: trap number injected to guest in case of ECREATE error + * + * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose + * of enforcing policies of guest's enclaves, and return the trap number + * which should be injected to guest in case of any ECREATE error. + * + * Return: + * - 0: ECREATE was successful. + * - <0: on error. + */ +int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs, + int *trapnr) +{ + int ret; + + /* + * @secs is an untrusted, userspace-provided address. It comes from + * KVM and is assumed to be a valid pointer which points somewhere in + * userspace. This can fault and call SGX or other fault handlers when + * userspace mapping @secs doesn't exist. + * + * Add a WARN() to make sure @secs is already valid userspace pointer + * from caller (KVM), who should already have handled invalid pointer + * case (for instance, made by malicious guest). All other checks, + * such as alignment of @secs, are deferred to ENCLS itself. + */ + if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE))) + return -EINVAL; + + __uaccess_begin(); + ret = __ecreate(pageinfo, (void *)secs); + __uaccess_end(); + + if (encls_faulted(ret)) { + *trapnr = ENCLS_TRAPNR(ret); + return -EFAULT; + } + + /* ECREATE doesn't return an error code, it faults or succeeds. */ + WARN_ON_ONCE(ret); + return 0; +} +EXPORT_SYMBOL_GPL(sgx_virt_ecreate); + +static int __sgx_virt_einit(void __user *sigstruct, void __user *token, + void __user *secs) +{ + int ret; + + /* + * Make sure all userspace pointers from caller (KVM) are valid. + * All other checks deferred to ENCLS itself. Also see comment + * for @secs in sgx_virt_ecreate(). 
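sgx_virt_ecreate() above is exported so that KVM, after trapping a guest's ECREATE, can run the leaf on the guest's behalf and learn which exception, if any, to reflect back. A hypothetical caller sketch; kvm_inject_trap() and the argument plumbing are placeholders, only sgx_virt_ecreate() and its return convention come from this patch:

/* Hypothetical KVM-side handler for a trapped ECREATE. */
static int handle_guest_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs)
{
	int trapnr;
	int ret = sgx_virt_ecreate(pageinfo, secs, &trapnr);

	if (ret == -EFAULT)			/* ENCLS itself faulted */
		return kvm_inject_trap(trapnr);	/* placeholder helper */

	return ret;	/* 0: ECREATE ran successfully on behalf of the guest */
}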
+ */ +#define SGX_EINITTOKEN_SIZE 304 + if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) || + !access_ok(token, SGX_EINITTOKEN_SIZE) || + !access_ok(secs, PAGE_SIZE))) + return -EINVAL; + + __uaccess_begin(); + ret = __einit((void *)sigstruct, (void *)token, (void *)secs); + __uaccess_end(); + + return ret; +} + +/** + * sgx_virt_einit() - Run EINIT on behalf of guest + * @sigstruct: Userspace pointer to SIGSTRUCT structure + * @token: Userspace pointer to EINITTOKEN structure + * @secs: Userspace pointer to SECS page + * @lepubkeyhash: Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values + * @trapnr: trap number injected to guest in case of EINIT error + * + * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available + * in host, SGX driver may rewrite the hardware values at wish, therefore KVM + * needs to update hardware values to guest's virtual MSR values in order to + * ensure EINIT is executed with expected hardware values. + * + * Return: + * - 0: EINIT was successful. + * - <0: on error. + */ +int sgx_virt_einit(void __user *sigstruct, void __user *token, + void __user *secs, u64 *lepubkeyhash, int *trapnr) +{ + int ret; + + if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) { + ret = __sgx_virt_einit(sigstruct, token, secs); + } else { + preempt_disable(); + + sgx_update_lepubkeyhash(lepubkeyhash); + + ret = __sgx_virt_einit(sigstruct, token, secs); + preempt_enable(); + } + + /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */ + if (ret == -EINVAL) + return ret; + + if (encls_faulted(ret)) { + *trapnr = ENCLS_TRAPNR(ret); + return -EFAULT; + } + + return ret; +} +EXPORT_SYMBOL_GPL(sgx_virt_einit); diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 8678864ce712..132a2de44d2f 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -30,7 +30,7 @@ EXPORT_SYMBOL(__max_die_per_package); #ifdef CONFIG_SMP /* - * Check if given CPUID extended toplogy "leaf" is implemented + * Check if given CPUID extended topology "leaf" is implemented */ static int check_extended_topology_leaf(int leaf) { @@ -44,7 +44,7 @@ static int check_extended_topology_leaf(int leaf) return 0; } /* - * Return best CPUID Extended Toplogy Leaf supported + * Return best CPUID Extended Topology Leaf supported */ static int detect_extended_topology_leaf(struct cpuinfo_x86 *c) { diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index c6ede3b3d302..c04b933f48d3 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -27,6 +27,7 @@ #include <linux/clocksource.h> #include <linux/cpu.h> #include <linux/reboot.h> +#include <linux/static_call.h> #include <asm/div64.h> #include <asm/x86_init.h> #include <asm/hypervisor.h> @@ -336,11 +337,11 @@ static void __init vmware_paravirt_ops_setup(void) vmware_cyc2ns_setup(); if (vmw_sched_clock) - pv_ops.time.sched_clock = vmware_sched_clock; + paravirt_set_sched_clock(vmware_sched_clock); if (vmware_is_stealclock_available()) { has_steal_clock = true; - pv_ops.time.steal_clock = vmware_steal_clock; + static_call_update(pv_steal_clock, vmware_steal_clock); /* We use reboot notifier only to disable steal clock */ register_reboot_notifier(&vmware_pv_reboot_nb); @@ -378,6 +379,8 @@ static void __init vmware_set_capabilities(void) { setup_force_cpu_cap(X86_FEATURE_CONSTANT_TSC); setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); + if (vmware_tsc_khz) + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); if (vmware_hypercall_mode == 
CPUID_VMWARE_FEATURES_ECX_VMCALL) setup_force_cpu_cap(X86_FEATURE_VMCALL); else if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMMCALL) diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index a8f3af257e26..54ce999ed321 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -323,8 +323,8 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem, cmem->nr_ranges = 1; /* Exclude elf header region */ - start = image->arch.elf_load_addr; - end = start + image->arch.elf_headers_sz - 1; + start = image->elf_load_addr; + end = start + image->elf_headers_sz - 1; return crash_exclude_mem_range(cmem, start, end); } @@ -337,7 +337,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) struct crash_memmap_data cmd; struct crash_mem *cmem; - cmem = vzalloc(sizeof(struct crash_mem)); + cmem = vzalloc(struct_size(cmem, ranges, 1)); if (!cmem) return -ENOMEM; @@ -407,20 +407,20 @@ int crash_load_segments(struct kimage *image) if (ret) return ret; - image->arch.elf_headers = kbuf.buffer; - image->arch.elf_headers_sz = kbuf.bufsz; + image->elf_headers = kbuf.buffer; + image->elf_headers_sz = kbuf.bufsz; kbuf.memsz = kbuf.bufsz; kbuf.buf_align = ELF_CORE_HEADER_ALIGN; kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; ret = kexec_add_buffer(&kbuf); if (ret) { - vfree((void *)image->arch.elf_headers); + vfree((void *)image->elf_headers); return ret; } - image->arch.elf_load_addr = kbuf.mem; + image->elf_load_addr = kbuf.mem; pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - image->arch.elf_load_addr, kbuf.bufsz, kbuf.bufsz); + image->elf_load_addr, kbuf.bufsz, kbuf.bufsz); return ret; } diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index 759d392cbe9f..d1d49e3d536b 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c @@ -100,9 +100,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = { .ss = __KERNEL_DS, .ds = __USER_DS, .fs = __KERNEL_PERCPU, -#ifndef CONFIG_X86_32_LAZY_GS - .gs = __KERNEL_STACK_CANARY, -#endif + .gs = 0, .__cr3 = __pa_nodebug(swapper_pg_dir), }, diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 22aad412f965..bc0657f0deed 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -31,8 +31,8 @@ * - inform the user about the firmware's notion of memory layout * via /sys/firmware/memmap * - * - the hibernation code uses it to generate a kernel-independent MD5 - * fingerprint of the physical memory layout of a system. + * - the hibernation code uses it to generate a kernel-independent CRC32 + * checksum of the physical memory layout of a system. * * - 'e820_table_kexec': a slightly modified (by the kernel) firmware version * passed to us by the bootloader - the major difference between @@ -793,7 +793,7 @@ core_initcall(e820__register_nvs_regions); #endif /* - * Allocate the requested number of bytes with the requsted alignment + * Allocate the requested number of bytes with the requested alignment * and return (the physical address) to the caller. Also register this * range in the 'kexec' E820 table as a reserved range. 
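The crash_setup_memmap_entries() change above sizes the allocation with struct_size() rather than a bare sizeof(struct crash_mem), so the one range the function goes on to use is explicitly accounted for. Assuming the usual semantics of the helper from <linux/overflow.h>:

/*
 * struct_size(cmem, ranges, 1)
 *	== sizeof(*cmem) + 1 * sizeof(cmem->ranges[0]),
 * saturating at SIZE_MAX if the size calculation would overflow.
 */
cmem = vzalloc(struct_size(cmem, ranges, 1));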
* diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index a4b5af03dcc1..6edd1e2ee8af 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -551,6 +551,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_EHL_IDS(&gen11_early_ops), INTEL_TGL_12_IDS(&gen11_early_ops), INTEL_RKL_IDS(&gen11_early_ops), + INTEL_ADLS_IDS(&gen11_early_ops), }; struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 683749b80ae2..a85c64000218 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -253,7 +253,7 @@ static bool xfeature_enabled(enum xfeature xfeature) static void __init setup_xstate_features(void) { u32 eax, ebx, ecx, edx, i; - /* start at the beginnning of the "extended state" */ + /* start at the beginning of the "extended state" */ unsigned int last_good_offset = offsetof(struct xregs_state, extended_state_area); /* diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 7edbd5ee5ed4..1b3ce3b4a2a2 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -66,7 +66,7 @@ int ftrace_arch_code_modify_post_process(void) static const char *ftrace_nop_replace(void) { - return ideal_nops[NOP_ATOMIC5]; + return x86_nops[5]; } static const char *ftrace_call_replace(unsigned long ip, unsigned long addr) @@ -377,7 +377,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) ip = trampoline + (jmp_offset - start_offset); if (WARN_ON(*(char *)ip != 0x75)) goto fail; - ret = copy_from_kernel_nofault(ip, ideal_nops[2], 2); + ret = copy_from_kernel_nofault(ip, x86_nops[2], 2); if (ret < 0) goto fail; } diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 5e9beb77cafd..18be44163a50 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -104,7 +104,7 @@ static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr) static bool __head check_la57_support(unsigned long physaddr) { /* - * 5-level paging is detected and enabled at kernel decomression + * 5-level paging is detected and enabled at kernel decompression * stage. Only check if it has been enabled there. */ if (!(native_read_cr4() & X86_CR4_LA57)) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 7ed84c282233..67f590425d90 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -318,8 +318,8 @@ SYM_FUNC_START(startup_32_smp) movl $(__KERNEL_PERCPU), %eax movl %eax,%fs # set this cpu's percpu - movl $(__KERNEL_STACK_CANARY),%eax - movl %eax,%gs + xorl %eax,%eax + movl %eax,%gs # clear possible garbage in %gs xorl %eax,%eax # Clear LDT lldt %ax @@ -339,20 +339,6 @@ SYM_FUNC_END(startup_32_smp) */ __INIT setup_once: -#ifdef CONFIG_STACKPROTECTOR - /* - * Configure the stack canary. The linker can't handle this by - * relocation. Manually set base address in stack canary - * segment descriptor. - */ - movl $gdt_page,%eax - movl $stack_canary,%ecx - movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) - shrl $16, %ecx - movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) - movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax) -#endif - andl $0,setup_once_ref /* Once is enough, thanks */ ret diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index ee1a283f8e96..d552f177eca0 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -245,7 +245,7 @@ static const __initconst struct idt_data ist_idts[] = { * after that. 
* * Note, that X86_64 cannot install the real #PF handler in - * idt_setup_early_traps() because the memory intialization needs the #PF + * idt_setup_early_traps() because the memory initialization needs the #PF * handler from the early_idt_handler_array to initialize the early page * tables. */ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 58aa712973ac..e28f6a5d14f1 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -338,7 +338,7 @@ void fixup_irqs(void) irq_migrate_all_off_this_cpu(); /* - * We can remove mdelay() and then send spuriuous interrupts to + * We can remove mdelay() and then send spurious interrupts to * new cpu targets for all the irqs that were handled previously by * this cpu. While it works, I have seen spurious interrupt messages * (nothing wrong but still...). diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 5ba8477c2cb7..6a2eb62c85e6 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -28,10 +28,8 @@ static void bug_at(const void *ip, int line) } static const void * -__jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type, int init) +__jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type) { - const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; - const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; const void *expect, *code; const void *addr, *dest; int line; @@ -41,10 +39,8 @@ __jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type, code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest); - if (init) { - expect = default_nop; line = __LINE__; - } else if (type == JUMP_LABEL_JMP) { - expect = ideal_nop; line = __LINE__; + if (type == JUMP_LABEL_JMP) { + expect = x86_nops[5]; line = __LINE__; } else { expect = code; line = __LINE__; } @@ -53,7 +49,7 @@ __jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type, bug_at(addr, line); if (type == JUMP_LABEL_NOP) - code = ideal_nop; + code = x86_nops[5]; return code; } @@ -62,7 +58,7 @@ static inline void __jump_label_transform(struct jump_entry *entry, enum jump_label_type type, int init) { - const void *opcode = __jump_label_set_jump_code(entry, type, init); + const void *opcode = __jump_label_set_jump_code(entry, type); /* * As long as only a single processor is running and the code is still @@ -113,7 +109,7 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry, } mutex_lock(&text_mutex); - opcode = __jump_label_set_jump_code(entry, type, 0); + opcode = __jump_label_set_jump_code(entry, type); text_poke_queue((void *)jump_entry_code(entry), opcode, JUMP_LABEL_NOP_SIZE, NULL); mutex_unlock(&text_mutex); @@ -136,22 +132,6 @@ static enum { __init_or_module void arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type) { - /* - * This function is called at boot up and when modules are - * first loaded. Check if the default nop, the one that is - * inserted at compile time, is the ideal nop. If it is, then - * we do not need to update the nop, and we can leave it as is. - * If it is not, then we need to update the nop to the ideal nop. 
- */ - if (jlstate == JL_STATE_START) { - const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; - const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; - - if (memcmp(ideal_nop, default_nop, 5) != 0) - jlstate = JL_STATE_UPDATE; - else - jlstate = JL_STATE_NO_UPDATE; - } if (jlstate == JL_STATE_UPDATE) jump_label_transform(entry, type, 1); } diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index ce831f9448e7..170d0fd68b1f 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -75,7 +75,7 @@ static int setup_cmdline(struct kimage *image, struct boot_params *params, if (image->type == KEXEC_TYPE_CRASH) { len = sprintf(cmdline_ptr, - "elfcorehdr=0x%lx ", image->arch.elf_load_addr); + "elfcorehdr=0x%lx ", image->elf_load_addr); } memcpy(cmdline_ptr + len, cmdline, cmdline_len); cmdline_len += len; diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index ff7878df96b4..3a43a2dee658 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -17,7 +17,7 @@ * Updated by: Tom Rini <trini@kernel.crashing.org> * Updated by: Jason Wessel <jason.wessel@windriver.com> * Modified for 386 by Jim Kingdon, Cygnus Support. - * Origianl kgdb, compatibility with 2.1.xx kernel by + * Original kgdb, compatibility with 2.1.xx kernel by * David Grothe <dave@gcom.com> * Integrated into 2.2.5 kernel by Tigran Aivazian <tigran@sco.com> * X86_64 changes from Andi Kleen's patch merged by Jim Houston @@ -642,7 +642,7 @@ void kgdb_arch_late(void) struct perf_event **pevent; /* - * Pre-allocate the hw breakpoint structions in the non-atomic + * Pre-allocate the hw breakpoint instructions in the non-atomic * portion of kgdb because this operation requires mutexs to * complete. */ diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index df776cdca327..d3d65545cb8b 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -139,6 +139,8 @@ NOKPROBE_SYMBOL(synthesize_relcall); int can_boost(struct insn *insn, void *addr) { kprobe_opcode_t opcode; + insn_byte_t prefix; + int i; if (search_exception_tables((unsigned long)addr)) return 0; /* Page fault may occur on this address. */ @@ -151,35 +153,39 @@ int can_boost(struct insn *insn, void *addr) if (insn->opcode.nbytes != 1) return 0; - /* Can't boost Address-size override prefix */ - if (unlikely(inat_is_address_size_prefix(insn->attr))) - return 0; + for_each_insn_prefix(insn, i, prefix) { + insn_attr_t attr; + + attr = inat_get_opcode_attribute(prefix); + /* Can't boost Address-size override prefix and CS override prefix */ + if (prefix == 0x2e || inat_is_address_size_prefix(attr)) + return 0; + } opcode = insn->opcode.bytes[0]; - switch (opcode & 0xf0) { - case 0x60: - /* can't boost "bound" */ - return (opcode != 0x62); - case 0x70: - return 0; /* can't boost conditional jump */ - case 0x90: - return opcode != 0x9a; /* can't boost call far */ - case 0xc0: - /* can't boost software-interruptions */ - return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf; - case 0xd0: - /* can boost AA* and XLAT */ - return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7); - case 0xe0: - /* can boost in/out and absolute jmps */ - return ((opcode & 0x04) || opcode == 0xea); - case 0xf0: - /* clear and set flags are boostable */ - return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe)); + switch (opcode) { + case 0x62: /* bound */ + case 0x70 ... 0x7f: /* Conditional jumps */ + case 0x9a: /* Call far */ + case 0xc0 ... 
0xc1: /* Grp2 */ + case 0xcc ... 0xce: /* software exceptions */ + case 0xd0 ... 0xd3: /* Grp2 */ + case 0xd6: /* (UD) */ + case 0xd8 ... 0xdf: /* ESC */ + case 0xe0 ... 0xe3: /* LOOP*, JCXZ */ + case 0xe8 ... 0xe9: /* near Call, JMP */ + case 0xeb: /* Short JMP */ + case 0xf0 ... 0xf4: /* LOCK/REP, HLT */ + case 0xf6 ... 0xf7: /* Grp3 */ + case 0xfe: /* Grp4 */ + /* ... are not boostable */ + return 0; + case 0xff: /* Grp5 */ + /* Only indirect jmp is boostable */ + return X86_MODRM_REG(insn->modrm.bytes[0]) == 4; default: - /* CS override prefix and call are not boostable */ - return (opcode != 0x2e && opcode != 0x9a); + return 1; } } @@ -229,7 +235,7 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) return 0UL; if (faddr) - memcpy(buf, ideal_nops[NOP_ATOMIC5], 5); + memcpy(buf, x86_nops[5], 5); else buf[0] = kp->opcode; return (unsigned long)buf; @@ -265,6 +271,8 @@ static int can_probe(unsigned long paddr) /* Decode instructions */ addr = paddr - offset; while (addr < paddr) { + int ret; + /* * Check if the instruction has been modified by another * kprobe, in which case we replace the breakpoint by the @@ -276,8 +284,10 @@ static int can_probe(unsigned long paddr) __addr = recover_probed_instruction(buf, addr); if (!__addr) return 0; - kernel_insn_init(&insn, (void *)__addr, MAX_INSN_SIZE); - insn_get_length(&insn); + + ret = insn_decode_kernel(&insn, (void *)__addr); + if (ret < 0) + return 0; /* * Another debugging subsystem might insert this breakpoint. @@ -301,8 +311,8 @@ static int can_probe(unsigned long paddr) int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn) { kprobe_opcode_t buf[MAX_INSN_SIZE]; - unsigned long recovered_insn = - recover_probed_instruction(buf, (unsigned long)src); + unsigned long recovered_insn = recover_probed_instruction(buf, (unsigned long)src); + int ret; if (!recovered_insn || !insn) return 0; @@ -312,8 +322,9 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn) MAX_INSN_SIZE)) return 0; - kernel_insn_init(insn, dest, MAX_INSN_SIZE); - insn_get_length(insn); + ret = insn_decode_kernel(insn, dest); + if (ret < 0) + return 0; /* We can not probe force emulate prefixed instruction */ if (insn_has_emulate_prefix(insn)) @@ -357,13 +368,14 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn) return insn->length; } -/* Prepare reljump right after instruction to boost */ -static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p, - struct insn *insn) +/* Prepare reljump or int3 right after instruction */ +static int prepare_singlestep(kprobe_opcode_t *buf, struct kprobe *p, + struct insn *insn) { int len = insn->length; - if (can_boost(insn, p->addr) && + if (!IS_ENABLED(CONFIG_PREEMPTION) && + !p->post_handler && can_boost(insn, p->addr) && MAX_INSN_SIZE - len >= JMP32_INSN_SIZE) { /* * These instructions can be executed directly if it @@ -374,7 +386,12 @@ static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p, len += JMP32_INSN_SIZE; p->ainsn.boostable = 1; } else { - p->ainsn.boostable = 0; + /* Otherwise, put an int3 for trapping singlestep */ + if (MAX_INSN_SIZE - len < INT3_INSN_SIZE) + return -ENOSPC; + + buf[len] = INT3_INSN_OPCODE; + len += INT3_INSN_SIZE; } return len; @@ -411,86 +428,290 @@ void free_insn_page(void *page) module_memfree(page); } -static void set_resume_flags(struct kprobe *p, struct insn *insn) +/* Kprobe x86 instruction emulation - only regs->ip or IF flag modifiers */ + +static void kprobe_emulate_ifmodifiers(struct kprobe *p, struct 
pt_regs *regs) +{ + switch (p->ainsn.opcode) { + case 0xfa: /* cli */ + regs->flags &= ~(X86_EFLAGS_IF); + break; + case 0xfb: /* sti */ + regs->flags |= X86_EFLAGS_IF; + break; + case 0x9c: /* pushf */ + int3_emulate_push(regs, regs->flags); + break; + case 0x9d: /* popf */ + regs->flags = int3_emulate_pop(regs); + break; + } + regs->ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size; +} +NOKPROBE_SYMBOL(kprobe_emulate_ifmodifiers); + +static void kprobe_emulate_ret(struct kprobe *p, struct pt_regs *regs) +{ + int3_emulate_ret(regs); +} +NOKPROBE_SYMBOL(kprobe_emulate_ret); + +static void kprobe_emulate_call(struct kprobe *p, struct pt_regs *regs) +{ + unsigned long func = regs->ip - INT3_INSN_SIZE + p->ainsn.size; + + func += p->ainsn.rel32; + int3_emulate_call(regs, func); +} +NOKPROBE_SYMBOL(kprobe_emulate_call); + +static nokprobe_inline +void __kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs, bool cond) +{ + unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size; + + if (cond) + ip += p->ainsn.rel32; + int3_emulate_jmp(regs, ip); +} + +static void kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs) +{ + __kprobe_emulate_jmp(p, regs, true); +} +NOKPROBE_SYMBOL(kprobe_emulate_jmp); + +static const unsigned long jcc_mask[6] = { + [0] = X86_EFLAGS_OF, + [1] = X86_EFLAGS_CF, + [2] = X86_EFLAGS_ZF, + [3] = X86_EFLAGS_CF | X86_EFLAGS_ZF, + [4] = X86_EFLAGS_SF, + [5] = X86_EFLAGS_PF, +}; + +static void kprobe_emulate_jcc(struct kprobe *p, struct pt_regs *regs) +{ + bool invert = p->ainsn.jcc.type & 1; + bool match; + + if (p->ainsn.jcc.type < 0xc) { + match = regs->flags & jcc_mask[p->ainsn.jcc.type >> 1]; + } else { + match = ((regs->flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^ + ((regs->flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT); + if (p->ainsn.jcc.type >= 0xe) + match = match && (regs->flags & X86_EFLAGS_ZF); + } + __kprobe_emulate_jmp(p, regs, (match && !invert) || (!match && invert)); +} +NOKPROBE_SYMBOL(kprobe_emulate_jcc); + +static void kprobe_emulate_loop(struct kprobe *p, struct pt_regs *regs) +{ + bool match; + + if (p->ainsn.loop.type != 3) { /* LOOP* */ + if (p->ainsn.loop.asize == 32) + match = ((*(u32 *)®s->cx)--) != 0; +#ifdef CONFIG_X86_64 + else if (p->ainsn.loop.asize == 64) + match = ((*(u64 *)®s->cx)--) != 0; +#endif + else + match = ((*(u16 *)®s->cx)--) != 0; + } else { /* JCXZ */ + if (p->ainsn.loop.asize == 32) + match = *(u32 *)(®s->cx) == 0; +#ifdef CONFIG_X86_64 + else if (p->ainsn.loop.asize == 64) + match = *(u64 *)(®s->cx) == 0; +#endif + else + match = *(u16 *)(®s->cx) == 0; + } + + if (p->ainsn.loop.type == 0) /* LOOPNE */ + match = match && !(regs->flags & X86_EFLAGS_ZF); + else if (p->ainsn.loop.type == 1) /* LOOPE */ + match = match && (regs->flags & X86_EFLAGS_ZF); + + __kprobe_emulate_jmp(p, regs, match); +} +NOKPROBE_SYMBOL(kprobe_emulate_loop); + +static const int addrmode_regoffs[] = { + offsetof(struct pt_regs, ax), + offsetof(struct pt_regs, cx), + offsetof(struct pt_regs, dx), + offsetof(struct pt_regs, bx), + offsetof(struct pt_regs, sp), + offsetof(struct pt_regs, bp), + offsetof(struct pt_regs, si), + offsetof(struct pt_regs, di), +#ifdef CONFIG_X86_64 + offsetof(struct pt_regs, r8), + offsetof(struct pt_regs, r9), + offsetof(struct pt_regs, r10), + offsetof(struct pt_regs, r11), + offsetof(struct pt_regs, r12), + offsetof(struct pt_regs, r13), + offsetof(struct pt_regs, r14), + offsetof(struct pt_regs, r15), +#endif +}; + +static void kprobe_emulate_call_indirect(struct kprobe *p, struct pt_regs *regs) +{ + unsigned long 
offs = addrmode_regoffs[p->ainsn.indirect.reg]; + + int3_emulate_call(regs, regs_get_register(regs, offs)); +} +NOKPROBE_SYMBOL(kprobe_emulate_call_indirect); + +static void kprobe_emulate_jmp_indirect(struct kprobe *p, struct pt_regs *regs) +{ + unsigned long offs = addrmode_regoffs[p->ainsn.indirect.reg]; + + int3_emulate_jmp(regs, regs_get_register(regs, offs)); +} +NOKPROBE_SYMBOL(kprobe_emulate_jmp_indirect); + +static int prepare_emulation(struct kprobe *p, struct insn *insn) { insn_byte_t opcode = insn->opcode.bytes[0]; switch (opcode) { case 0xfa: /* cli */ case 0xfb: /* sti */ + case 0x9c: /* pushfl */ case 0x9d: /* popf/popfd */ - /* Check whether the instruction modifies Interrupt Flag or not */ - p->ainsn.if_modifier = 1; - break; - case 0x9c: /* pushfl */ - p->ainsn.is_pushf = 1; + /* + * IF modifiers must be emulated since it will enable interrupt while + * int3 single stepping. + */ + p->ainsn.emulate_op = kprobe_emulate_ifmodifiers; + p->ainsn.opcode = opcode; break; - case 0xcf: /* iret */ - p->ainsn.if_modifier = 1; - fallthrough; case 0xc2: /* ret/lret */ case 0xc3: case 0xca: case 0xcb: - case 0xea: /* jmp absolute -- ip is correct */ - /* ip is already adjusted, no more changes required */ - p->ainsn.is_abs_ip = 1; - /* Without resume jump, this is boostable */ - p->ainsn.boostable = 1; + p->ainsn.emulate_op = kprobe_emulate_ret; break; - case 0xe8: /* call relative - Fix return addr */ - p->ainsn.is_call = 1; + case 0x9a: /* far call absolute -- segment is not supported */ + case 0xea: /* far jmp absolute -- segment is not supported */ + case 0xcc: /* int3 */ + case 0xcf: /* iret -- in-kernel IRET is not supported */ + return -EOPNOTSUPP; break; -#ifdef CONFIG_X86_32 - case 0x9a: /* call absolute -- same as call absolute, indirect */ - p->ainsn.is_call = 1; - p->ainsn.is_abs_ip = 1; + case 0xe8: /* near call relative */ + p->ainsn.emulate_op = kprobe_emulate_call; + if (insn->immediate.nbytes == 2) + p->ainsn.rel32 = *(s16 *)&insn->immediate.value; + else + p->ainsn.rel32 = *(s32 *)&insn->immediate.value; break; -#endif - case 0xff: + case 0xeb: /* short jump relative */ + case 0xe9: /* near jump relative */ + p->ainsn.emulate_op = kprobe_emulate_jmp; + if (insn->immediate.nbytes == 1) + p->ainsn.rel32 = *(s8 *)&insn->immediate.value; + else if (insn->immediate.nbytes == 2) + p->ainsn.rel32 = *(s16 *)&insn->immediate.value; + else + p->ainsn.rel32 = *(s32 *)&insn->immediate.value; + break; + case 0x70 ... 
0x7f: + /* 1 byte conditional jump */ + p->ainsn.emulate_op = kprobe_emulate_jcc; + p->ainsn.jcc.type = opcode & 0xf; + p->ainsn.rel32 = *(char *)insn->immediate.bytes; + break; + case 0x0f: opcode = insn->opcode.bytes[1]; + if ((opcode & 0xf0) == 0x80) { + /* 2 bytes Conditional Jump */ + p->ainsn.emulate_op = kprobe_emulate_jcc; + p->ainsn.jcc.type = opcode & 0xf; + if (insn->immediate.nbytes == 2) + p->ainsn.rel32 = *(s16 *)&insn->immediate.value; + else + p->ainsn.rel32 = *(s32 *)&insn->immediate.value; + } else if (opcode == 0x01 && + X86_MODRM_REG(insn->modrm.bytes[0]) == 0 && + X86_MODRM_MOD(insn->modrm.bytes[0]) == 3) { + /* VM extensions - not supported */ + return -EOPNOTSUPP; + } + break; + case 0xe0: /* Loop NZ */ + case 0xe1: /* Loop */ + case 0xe2: /* Loop */ + case 0xe3: /* J*CXZ */ + p->ainsn.emulate_op = kprobe_emulate_loop; + p->ainsn.loop.type = opcode & 0x3; + p->ainsn.loop.asize = insn->addr_bytes * 8; + p->ainsn.rel32 = *(s8 *)&insn->immediate.value; + break; + case 0xff: + /* + * Since the 0xff is an extended group opcode, the instruction + * is determined by the MOD/RM byte. + */ + opcode = insn->modrm.bytes[0]; if ((opcode & 0x30) == 0x10) { - /* - * call absolute, indirect - * Fix return addr; ip is correct. - * But this is not boostable - */ - p->ainsn.is_call = 1; - p->ainsn.is_abs_ip = 1; + if ((opcode & 0x8) == 0x8) + return -EOPNOTSUPP; /* far call */ + /* call absolute, indirect */ + p->ainsn.emulate_op = kprobe_emulate_call_indirect; + } else if ((opcode & 0x30) == 0x20) { + if ((opcode & 0x8) == 0x8) + return -EOPNOTSUPP; /* far jmp */ + /* jmp near absolute indirect */ + p->ainsn.emulate_op = kprobe_emulate_jmp_indirect; + } else break; - } else if (((opcode & 0x31) == 0x20) || - ((opcode & 0x31) == 0x21)) { - /* - * jmp near and far, absolute indirect - * ip is correct. - */ - p->ainsn.is_abs_ip = 1; - /* Without resume jump, this is boostable */ - p->ainsn.boostable = 1; - } + + if (insn->addr_bytes != sizeof(unsigned long)) + return -EOPNOTSUPP; /* Don't support differnt size */ + if (X86_MODRM_MOD(opcode) != 3) + return -EOPNOTSUPP; /* TODO: support memory addressing */ + + p->ainsn.indirect.reg = X86_MODRM_RM(opcode); +#ifdef CONFIG_X86_64 + if (X86_REX_B(insn->rex_prefix.value)) + p->ainsn.indirect.reg += 8; +#endif + break; + default: break; } + p->ainsn.size = insn->length; + + return 0; } static int arch_copy_kprobe(struct kprobe *p) { struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; - int len; + int ret, len; /* Copy an instruction with recovering if other optprobe modifies it.*/ len = __copy_instruction(buf, p->addr, p->ainsn.insn, &insn); if (!len) return -EINVAL; - /* - * __copy_instruction can modify the displacement of the instruction, - * but it doesn't affect boostable check. 
- */ - len = prepare_boost(buf, p, &insn); + /* Analyze the opcode and setup emulate functions */ + ret = prepare_emulation(p, &insn); + if (ret < 0) + return ret; - /* Analyze the opcode and set resume flags */ - set_resume_flags(p, &insn); + /* Add int3 for single-step or booster jmp */ + len = prepare_singlestep(buf, p, &insn); + if (len < 0) + return len; /* Also, displacement change doesn't affect the first byte */ p->opcode = buf[0]; @@ -583,29 +804,7 @@ set_current_kprobe(struct kprobe *p, struct pt_regs *regs, { __this_cpu_write(current_kprobe, p); kcb->kprobe_saved_flags = kcb->kprobe_old_flags - = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); - if (p->ainsn.if_modifier) - kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF; -} - -static nokprobe_inline void clear_btf(void) -{ - if (test_thread_flag(TIF_BLOCKSTEP)) { - unsigned long debugctl = get_debugctlmsr(); - - debugctl &= ~DEBUGCTLMSR_BTF; - update_debugctlmsr(debugctl); - } -} - -static nokprobe_inline void restore_btf(void) -{ - if (test_thread_flag(TIF_BLOCKSTEP)) { - unsigned long debugctl = get_debugctlmsr(); - - debugctl |= DEBUGCTLMSR_BTF; - update_debugctlmsr(debugctl); - } + = (regs->flags & X86_EFLAGS_IF); } void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) @@ -620,6 +819,22 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) } NOKPROBE_SYMBOL(arch_prepare_kretprobe); +static void kprobe_post_process(struct kprobe *cur, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + cur->post_handler(cur, regs, 0); + } + + /* Restore back the original saved kprobes variables and continue. */ + if (kcb->kprobe_status == KPROBE_REENTER) + restore_previous_kprobe(kcb); + else + reset_current_kprobe(); +} +NOKPROBE_SYMBOL(kprobe_post_process); + static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, int reenter) { @@ -627,7 +842,7 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, return; #if !defined(CONFIG_PREEMPTION) - if (p->ainsn.boostable && !p->post_handler) { + if (p->ainsn.boostable) { /* Boost up -- we can execute copied instructions directly */ if (!reenter) reset_current_kprobe(); @@ -646,19 +861,51 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, kcb->kprobe_status = KPROBE_REENTER; } else kcb->kprobe_status = KPROBE_HIT_SS; - /* Prepare real single stepping */ - clear_btf(); - regs->flags |= X86_EFLAGS_TF; + + if (p->ainsn.emulate_op) { + p->ainsn.emulate_op(p, regs); + kprobe_post_process(p, regs, kcb); + return; + } + + /* Disable interrupt, and set ip register on trampoline */ regs->flags &= ~X86_EFLAGS_IF; - /* single step inline if the instruction is an int3 */ - if (p->opcode == INT3_INSN_OPCODE) - regs->ip = (unsigned long)p->addr; - else - regs->ip = (unsigned long)p->ainsn.insn; + regs->ip = (unsigned long)p->ainsn.insn; } NOKPROBE_SYMBOL(setup_singlestep); /* + * Called after single-stepping. p->addr is the address of the + * instruction whose first byte has been replaced by the "int3" + * instruction. To avoid the SMP problems that can occur when we + * temporarily put back the original opcode to single-step, we + * single-stepped a copy of the instruction. The address of this + * copy is p->ainsn.insn. We also doesn't use trap, but "int3" again + * right after the copied instruction. 
+ * Different from the trap single-step, "int3" single-step can not + * handle the instruction which changes the ip register, e.g. jmp, + * call, conditional jmp, and the instructions which changes the IF + * flags because interrupt must be disabled around the single-stepping. + * Such instructions are software emulated, but others are single-stepped + * using "int3". + * + * When the 2nd "int3" handled, the regs->ip and regs->flags needs to + * be adjusted, so that we can resume execution on correct code. + */ +static void resume_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + unsigned long copy_ip = (unsigned long)p->ainsn.insn; + unsigned long orig_ip = (unsigned long)p->addr; + + /* Restore saved interrupt flag and ip register */ + regs->flags |= kcb->kprobe_saved_flags; + /* Note that regs->ip is executed int3 so must be a step back */ + regs->ip += (orig_ip - copy_ip) - INT3_INSN_SIZE; +} +NOKPROBE_SYMBOL(resume_singlestep); + +/* * We have reentered the kprobe_handler(), since another probe was hit while * within the handler. We save the original kprobes variables and just single * step on the instruction of the new probe without calling any user handlers. @@ -693,6 +940,12 @@ static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs, } NOKPROBE_SYMBOL(reenter_kprobe); +static nokprobe_inline int kprobe_is_ss(struct kprobe_ctlblk *kcb) +{ + return (kcb->kprobe_status == KPROBE_HIT_SS || + kcb->kprobe_status == KPROBE_REENTER); +} + /* * Interrupts are disabled on entry as trap3 is an interrupt gate and they * remain disabled throughout this function. @@ -737,7 +990,18 @@ int kprobe_int3_handler(struct pt_regs *regs) reset_current_kprobe(); return 1; } - } else if (*addr != INT3_INSN_OPCODE) { + } else if (kprobe_is_ss(kcb)) { + p = kprobe_running(); + if ((unsigned long)p->ainsn.insn < regs->ip && + (unsigned long)p->ainsn.insn + MAX_INSN_SIZE > regs->ip) { + /* Most provably this is the second int3 for singlestep */ + resume_singlestep(p, regs, kcb); + kprobe_post_process(p, regs, kcb); + return 1; + } + } + + if (*addr != INT3_INSN_OPCODE) { /* * The breakpoint instruction was removed right * after we hit it. Another cpu has removed @@ -810,91 +1074,6 @@ __used __visible void *trampoline_handler(struct pt_regs *regs) } NOKPROBE_SYMBOL(trampoline_handler); -/* - * Called after single-stepping. p->addr is the address of the - * instruction whose first byte has been replaced by the "int 3" - * instruction. To avoid the SMP problems that can occur when we - * temporarily put back the original opcode to single-step, we - * single-stepped a copy of the instruction. The address of this - * copy is p->ainsn.insn. - * - * This function prepares to return from the post-single-step - * interrupt. We have to fix up the stack as follows: - * - * 0) Except in the case of absolute or indirect jump or call instructions, - * the new ip is relative to the copied instruction. We need to make - * it relative to the original instruction. - * - * 1) If the single-stepped instruction was pushfl, then the TF and IF - * flags are set in the just-pushed flags, and may need to be cleared. - * - * 2) If the single-stepped instruction was a call, the return address - * that is atop the stack is the address following the copied instruction. - * We need to make it the address following the original instruction. 
- */ -static void resume_execution(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) -{ - unsigned long *tos = stack_addr(regs); - unsigned long copy_ip = (unsigned long)p->ainsn.insn; - unsigned long orig_ip = (unsigned long)p->addr; - - regs->flags &= ~X86_EFLAGS_TF; - - /* Fixup the contents of top of stack */ - if (p->ainsn.is_pushf) { - *tos &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF); - *tos |= kcb->kprobe_old_flags; - } else if (p->ainsn.is_call) { - *tos = orig_ip + (*tos - copy_ip); - } - - if (!p->ainsn.is_abs_ip) - regs->ip += orig_ip - copy_ip; - - restore_btf(); -} -NOKPROBE_SYMBOL(resume_execution); - -/* - * Interrupts are disabled on entry as trap1 is an interrupt gate and they - * remain disabled throughout this function. - */ -int kprobe_debug_handler(struct pt_regs *regs) -{ - struct kprobe *cur = kprobe_running(); - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - if (!cur) - return 0; - - resume_execution(cur, regs, kcb); - regs->flags |= kcb->kprobe_saved_flags; - - if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { - kcb->kprobe_status = KPROBE_HIT_SSDONE; - cur->post_handler(cur, regs, 0); - } - - /* Restore back the original saved kprobes variables and continue. */ - if (kcb->kprobe_status == KPROBE_REENTER) { - restore_previous_kprobe(kcb); - goto out; - } - reset_current_kprobe(); -out: - /* - * if somebody else is singlestepping across a probe point, flags - * will have TF set, in which case, continue the remaining processing - * of do_debug, as if this is not a probe hit. - */ - if (regs->flags & X86_EFLAGS_TF) - return 0; - - return 1; -} -NOKPROBE_SYMBOL(kprobe_debug_handler); - int kprobe_fault_handler(struct pt_regs *regs, int trapnr) { struct kprobe *cur = kprobe_running(); @@ -912,20 +1091,9 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) * normal page fault. */ regs->ip = (unsigned long)cur->addr; - /* - * Trap flag (TF) has been set here because this fault - * happened where the single stepping will be done. - * So clear it by resetting the current kprobe: - */ - regs->flags &= ~X86_EFLAGS_TF; - /* - * Since the single step (trap) has been cancelled, - * we need to restore BTF here. 
- */ - restore_btf(); /* - * If the TF flag was set before the kprobe hit, + * If the IF flag was set before the kprobe hit, * don't touch it: */ regs->flags |= kcb->kprobe_old_flags; diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 373e5fa3ce1f..596de2f6d3a5 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -12,7 +12,7 @@ #include "common.h" -/* Ftrace callback handler for kprobes -- called under preepmt disabed */ +/* Ftrace callback handler for kprobes -- called under preempt disabled */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct ftrace_regs *fregs) { diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 08eb23074f92..71425ebba98a 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -312,6 +312,8 @@ static int can_optimize(unsigned long paddr) addr = paddr - offset; while (addr < paddr - offset + size) { /* Decode until function end */ unsigned long recovered_insn; + int ret; + if (search_exception_tables(addr)) /* * Since some fixup code will jumps into this function, @@ -321,8 +323,11 @@ static int can_optimize(unsigned long paddr) recovered_insn = recover_probed_instruction(buf, addr); if (!recovered_insn) return 0; - kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); - insn_get_length(&insn); + + ret = insn_decode_kernel(&insn, (void *)recovered_insn); + if (ret < 0) + return 0; + /* * In the case of detecting unknown breakpoint, this could be * a padding INT3 between functions. Let's check that all the diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5e78e01ca3b4..172c947240b9 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -650,7 +650,7 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { has_steal_clock = 1; - pv_ops.time.steal_clock = kvm_steal_clock; + static_call_update(pv_steal_clock, kvm_steal_clock); } if (pv_tlb_flush_supported()) { @@ -836,28 +836,25 @@ static void kvm_kick_cpu(int cpu) static void kvm_wait(u8 *ptr, u8 val) { - unsigned long flags; - if (in_nmi()) return; - local_irq_save(flags); - - if (READ_ONCE(*ptr) != val) - goto out; - /* * halt until it's our turn and kicked. Note that we do safe halt * for irq enabled case to avoid hang when lock info is overwritten * in irq spinlock slowpath and no spurious interrupt occur to save us. 
*/ - if (arch_irqs_disabled_flags(flags)) - halt(); - else - safe_halt(); + if (irqs_disabled()) { + if (READ_ONCE(*ptr) == val) + halt(); + } else { + local_irq_disable(); -out: - local_irq_restore(flags); + if (READ_ONCE(*ptr) == val) + safe_halt(); + + local_irq_enable(); + } } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 1fc0962c89c0..d37ed4e1d033 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -106,7 +106,7 @@ static inline void kvm_sched_clock_init(bool stable) if (!stable) clear_sched_clock_stable(); kvm_sched_clock_offset = kvm_clock_read(); - pv_ops.time.sched_clock = kvm_sched_clock_read; + paravirt_set_sched_clock(kvm_sched_clock_read); pr_info("kvm-clock: using sched offset of %llu cycles", kvm_sched_clock_offset); diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index a29a44a98e5b..c078b0d3ab0e 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -260,7 +260,7 @@ static void set_idt(void *newidt, u16 limit) { struct desc_ptr curidt; - /* x86-64 supports unaliged loads & stores */ + /* x86-64 supports unaligned loads & stores */ curidt.size = limit; curidt.address = (unsigned long)newidt; @@ -402,8 +402,8 @@ void machine_kexec(struct kimage *image) #ifdef CONFIG_KEXEC_FILE void *arch_kexec_kernel_image_load(struct kimage *image) { - vfree(image->arch.elf_headers); - image->arch.elf_headers = NULL; + vfree(image->elf_headers); + image->elf_headers = NULL; if (!image->fops || !image->fops->load) return ERR_PTR(-ENOEXEC); diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 4f75d0cf6305..9e1ea99ad9df 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -32,3 +32,12 @@ bool pv_is_native_vcpu_is_preempted(void) return pv_ops.lock.vcpu_is_preempted.func == __raw_callee_save___native_vcpu_is_preempted; } + +void __init paravirt_set_cap(void) +{ + if (!pv_is_native_spin_unlock()) + setup_force_cpu_cap(X86_FEATURE_PVUNLOCK); + + if (!pv_is_native_vcpu_is_preempted()) + setup_force_cpu_cap(X86_FEATURE_VCPUPREEMPT); +} diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c60222ab8ab9..d0730264786b 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -14,6 +14,7 @@ #include <linux/highmem.h> #include <linux/kprobes.h> #include <linux/pgtable.h> +#include <linux/static_call.h> #include <asm/bug.h> #include <asm/paravirt.h> @@ -52,7 +53,10 @@ void __init default_banner(void) } /* Undefined instruction for dealing with missing ops pointers. 
*/ -static const unsigned char ud2a[] = { 0x0f, 0x0b }; +static void paravirt_BUG(void) +{ + BUG(); +} struct branch { unsigned char opcode; @@ -85,25 +89,6 @@ u64 notrace _paravirt_ident_64(u64 x) { return x; } - -static unsigned paravirt_patch_jmp(void *insn_buff, const void *target, - unsigned long addr, unsigned len) -{ - struct branch *b = insn_buff; - unsigned long delta = (unsigned long)target - (addr+5); - - if (len < 5) { -#ifdef CONFIG_RETPOLINE - WARN_ONCE(1, "Failing to patch indirect JMP in %ps\n", (void *)addr); -#endif - return len; /* call too long for patch site */ - } - - b->opcode = 0xe9; /* jmp */ - b->delta = delta; - - return 5; -} #endif DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key); @@ -114,8 +99,8 @@ void __init native_pv_lock_init(void) static_branch_disable(&virt_spin_lock_key); } -unsigned paravirt_patch_default(u8 type, void *insn_buff, - unsigned long addr, unsigned len) +unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, + unsigned int len) { /* * Neat trick to map patch type back to the call within the @@ -125,20 +110,10 @@ unsigned paravirt_patch_default(u8 type, void *insn_buff, unsigned ret; if (opfunc == NULL) - /* If there's no function, patch it with a ud2a (BUG) */ - ret = paravirt_patch_insns(insn_buff, len, ud2a, ud2a+sizeof(ud2a)); + /* If there's no function, patch it with paravirt_BUG() */ + ret = paravirt_patch_call(insn_buff, paravirt_BUG, addr, len); else if (opfunc == _paravirt_nop) ret = 0; - -#ifdef CONFIG_PARAVIRT_XXL - /* identity functions just return their single argument */ - else if (opfunc == _paravirt_ident_64) - ret = paravirt_patch_ident_64(insn_buff, len); - - else if (type == PARAVIRT_PATCH(cpu.iret)) - /* If operation requires a jmp, then jmp */ - ret = paravirt_patch_jmp(insn_buff, opfunc, addr, len); -#endif else /* Otherwise call the function. */ ret = paravirt_patch_call(insn_buff, opfunc, addr, len); @@ -146,19 +121,6 @@ unsigned paravirt_patch_default(u8 type, void *insn_buff, return ret; } -unsigned paravirt_patch_insns(void *insn_buff, unsigned len, - const char *start, const char *end) -{ - unsigned insn_len = end - start; - - /* Alternative instruction is too large for the patch site and we cannot continue: */ - BUG_ON(insn_len > len || start == NULL); - - memcpy(insn_buff, start, insn_len); - - return insn_len; -} - struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; @@ -167,6 +129,14 @@ static u64 native_steal_clock(int cpu) return 0; } +DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock); +DEFINE_STATIC_CALL(pv_sched_clock, native_sched_clock); + +void paravirt_set_sched_clock(u64 (*func)(void)) +{ + static_call_update(pv_sched_clock, func); +} + /* These are in entry.S */ extern void native_iret(void); @@ -269,13 +239,6 @@ struct pv_info pv_info = { #define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64) struct paravirt_patch_template pv_ops = { - /* Init ops. */ - .init.patch = native_patch, - - /* Time ops. */ - .time.sched_clock = native_sched_clock, - .time.steal_clock = native_steal_clock, - /* Cpu ops. 
*/ .cpu.io_delay = native_io_delay, @@ -308,8 +271,6 @@ struct paravirt_patch_template pv_ops = { .cpu.load_sp0 = native_load_sp0, - .cpu.iret = native_iret, - #ifdef CONFIG_X86_IOPL_IOPERM .cpu.invalidate_io_bitmap = native_tss_invalidate_io_bitmap, .cpu.update_io_bitmap = native_tss_update_io_bitmap, @@ -414,6 +375,8 @@ struct paravirt_patch_template pv_ops = { NOKPROBE_SYMBOL(native_get_debugreg); NOKPROBE_SYMBOL(native_set_debugreg); NOKPROBE_SYMBOL(native_load_idt); + +void (*paravirt_iret)(void) = native_iret; #endif EXPORT_SYMBOL(pv_ops); diff --git a/arch/x86/kernel/paravirt_patch.c b/arch/x86/kernel/paravirt_patch.c deleted file mode 100644 index abd27ec67397..000000000000 --- a/arch/x86/kernel/paravirt_patch.c +++ /dev/null @@ -1,99 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/stringify.h> - -#include <asm/paravirt.h> -#include <asm/asm-offsets.h> - -#define PSTART(d, m) \ - patch_data_##d.m - -#define PEND(d, m) \ - (PSTART(d, m) + sizeof(patch_data_##d.m)) - -#define PATCH(d, m, insn_buff, len) \ - paravirt_patch_insns(insn_buff, len, PSTART(d, m), PEND(d, m)) - -#define PATCH_CASE(ops, m, data, insn_buff, len) \ - case PARAVIRT_PATCH(ops.m): \ - return PATCH(data, ops##_##m, insn_buff, len) - -#ifdef CONFIG_PARAVIRT_XXL -struct patch_xxl { - const unsigned char irq_irq_disable[1]; - const unsigned char irq_irq_enable[1]; - const unsigned char irq_save_fl[2]; - const unsigned char mmu_read_cr2[3]; - const unsigned char mmu_read_cr3[3]; - const unsigned char mmu_write_cr3[3]; - const unsigned char cpu_wbinvd[2]; - const unsigned char mov64[3]; -}; - -static const struct patch_xxl patch_data_xxl = { - .irq_irq_disable = { 0xfa }, // cli - .irq_irq_enable = { 0xfb }, // sti - .irq_save_fl = { 0x9c, 0x58 }, // pushf; pop %[re]ax - .mmu_read_cr2 = { 0x0f, 0x20, 0xd0 }, // mov %cr2, %[re]ax - .mmu_read_cr3 = { 0x0f, 0x20, 0xd8 }, // mov %cr3, %[re]ax - .mmu_write_cr3 = { 0x0f, 0x22, 0xdf }, // mov %rdi, %cr3 - .cpu_wbinvd = { 0x0f, 0x09 }, // wbinvd - .mov64 = { 0x48, 0x89, 0xf8 }, // mov %rdi, %rax -}; - -unsigned int paravirt_patch_ident_64(void *insn_buff, unsigned int len) -{ - return PATCH(xxl, mov64, insn_buff, len); -} -# endif /* CONFIG_PARAVIRT_XXL */ - -#ifdef CONFIG_PARAVIRT_SPINLOCKS -struct patch_lock { - unsigned char queued_spin_unlock[3]; - unsigned char vcpu_is_preempted[2]; -}; - -static const struct patch_lock patch_data_lock = { - .vcpu_is_preempted = { 0x31, 0xc0 }, // xor %eax, %eax - -# ifdef CONFIG_X86_64 - .queued_spin_unlock = { 0xc6, 0x07, 0x00 }, // movb $0, (%rdi) -# else - .queued_spin_unlock = { 0xc6, 0x00, 0x00 }, // movb $0, (%eax) -# endif -}; -#endif /* CONFIG_PARAVIRT_SPINLOCKS */ - -unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr, - unsigned int len) -{ - switch (type) { - -#ifdef CONFIG_PARAVIRT_XXL - PATCH_CASE(irq, save_fl, xxl, insn_buff, len); - PATCH_CASE(irq, irq_enable, xxl, insn_buff, len); - PATCH_CASE(irq, irq_disable, xxl, insn_buff, len); - - PATCH_CASE(mmu, read_cr2, xxl, insn_buff, len); - PATCH_CASE(mmu, read_cr3, xxl, insn_buff, len); - PATCH_CASE(mmu, write_cr3, xxl, insn_buff, len); - - PATCH_CASE(cpu, wbinvd, xxl, insn_buff, len); -#endif - -#ifdef CONFIG_PARAVIRT_SPINLOCKS - case PARAVIRT_PATCH(lock.queued_spin_unlock): - if (pv_is_native_spin_unlock()) - return PATCH(lock, queued_spin_unlock, insn_buff, len); - break; - - case PARAVIRT_PATCH(lock.vcpu_is_preempted): - if (pv_is_native_vcpu_is_preempted()) - return PATCH(lock, vcpu_is_preempted, insn_buff, len); - break; -#endif - 
default: - break; - } - - return paravirt_patch_default(type, insn_buff, addr, len); -} diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 9c214d7085a4..43cbfc84153a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -63,14 +63,9 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = { */ .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, - /* - * .sp1 is cpu_current_top_of_stack. The init task never - * runs user code, but cpu_current_top_of_stack should still - * be well defined before the first context switch. - */ +#ifdef CONFIG_X86_32 .sp1 = TOP_OF_INIT_STACK, -#ifdef CONFIG_X86_32 .ss0 = __KERNEL_DS, .ss1 = __KERNEL_CS, #endif @@ -451,7 +446,7 @@ void speculative_store_bypass_ht_init(void) * First HT sibling to come up on the core. Link shared state of * the first HT sibling to itself. The siblings on the same core * which come up later will see the shared state pointer and link - * themself to the state of this CPU. + * themselves to the state of this CPU. */ st->shared_state = st; } diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 11065dc03f5b..eda37df016f0 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -89,7 +89,7 @@ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) /* * Assumption here is that last_value, a global accumulator, always goes * forward. If we are less than that, we should not be much smaller. - * We assume there is an error marging we're inside, and then the correction + * We assume there is an error margin we're inside, and then the correction * does not sacrifice accuracy. * * For reads: global may have changed between test and return, diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index 94b33885f8d2..f469153eca8a 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -107,7 +107,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) * - Write protect disabled * - No task switch * - Don't do FP software emulation. - * - Proctected mode enabled + * - Protected mode enabled */ movl %cr0, %eax andl $~(X86_CR0_PG | X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %eax diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index a4d9a261425b..c53271aebb64 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -121,7 +121,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) * - Write protect disabled * - No task switch * - Don't do FP software emulation. - * - Proctected mode enabled + * - Protected mode enabled */ movq %cr0, %rax andq $~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %rax diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d883176ef2ce..72920af0b3c0 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -65,7 +65,7 @@ RESERVE_BRK(dmi_alloc, 65536); /* * Range of the BSS area. The size of the BSS area is determined - * at link time, with RESERVE_BRK*() facility reserving additional + * at link time, with RESERVE_BRK() facility reserving additional * chunks. */ unsigned long _brk_start = (unsigned long)__brk_base; @@ -633,11 +633,16 @@ static void __init trim_snb_memory(void) printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n"); /* - * Reserve all memory below the 1 MB mark that has not - * already been reserved. 
+ * SandyBridge integrated graphics devices have a bug that prevents + * them from accessing certain memory ranges, namely anything below + * 1M and in the pages listed in bad_pages[] above. + * + * To avoid these pages being ever accessed by SNB gfx devices + * reserve all memory below the 1 MB mark and bad_pages that have + * not already been reserved at boot time. */ memblock_reserve(0, 1<<20); - + for (i = 0; i < ARRAY_SIZE(bad_pages); i++) { if (memblock_reserve(bad_pages[i], PAGE_SIZE)) printk(KERN_WARNING "failed to reserve 0x%08lx\n", @@ -645,18 +650,6 @@ static void __init trim_snb_memory(void) } } -/* - * Here we put platform-specific memory range workarounds, i.e. - * memory known to be corrupt or otherwise in need to be reserved on - * specific platforms. - * - * If this gets used more widely it could use a real dispatch mechanism. - */ -static void __init trim_platform_memory_ranges(void) -{ - trim_snb_memory(); -} - static void __init trim_bios_range(void) { /* @@ -725,11 +718,41 @@ static int __init parse_reservelow(char *p) early_param("reservelow", parse_reservelow); -static void __init trim_low_memory_range(void) +static void __init early_reserve_memory(void) { + /* + * Reserve the memory occupied by the kernel between _text and + * __end_of_kernel_reserve symbols. Any kernel sections after the + * __end_of_kernel_reserve symbol must be explicitly reserved with a + * separate memblock_reserve() or they will be discarded. + */ + memblock_reserve(__pa_symbol(_text), + (unsigned long)__end_of_kernel_reserve - (unsigned long)_text); + + /* + * The first 4Kb of memory is a BIOS owned area, but generally it is + * not listed as such in the E820 table. + * + * Reserve the first memory page and typically some additional + * memory (64KiB by default) since some BIOSes are known to corrupt + * low memory. See the Kconfig help text for X86_RESERVE_LOW. + * + * In addition, make sure page 0 is always reserved because on + * systems with L1TF its contents can be leaked to user processes. + */ memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE)); + + early_reserve_initrd(); + + if (efi_enabled(EFI_BOOT)) + efi_memblock_x86_reserve_range(); + + memblock_x86_reserve_range_setup_data(); + + reserve_ibft_region(); + reserve_bios_regions(); } - + /* * Dump out kernel offset information on panic. */ @@ -764,29 +787,6 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) void __init setup_arch(char **cmdline_p) { - /* - * Reserve the memory occupied by the kernel between _text and - * __end_of_kernel_reserve symbols. Any kernel sections after the - * __end_of_kernel_reserve symbol must be explicitly reserved with a - * separate memblock_reserve() or they will be discarded. - */ - memblock_reserve(__pa_symbol(_text), - (unsigned long)__end_of_kernel_reserve - (unsigned long)_text); - - /* - * Make sure page 0 is always reserved because on systems with - * L1TF its contents can be leaked to user processes. - */ - memblock_reserve(0, PAGE_SIZE); - - early_reserve_initrd(); - - /* - * At this point everything still needed from the boot loader - * or BIOS or kernel text should be early reserved or marked not - * RAM in e820. All other memory is free game. 
- */ - #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); @@ -822,7 +822,6 @@ void __init setup_arch(char **cmdline_p) idt_setup_early_traps(); early_cpu_init(); - arch_init_ideal_nops(); jump_label_init(); static_call_init(); early_ioremap_init(); @@ -910,8 +909,18 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); - if (efi_enabled(EFI_BOOT)) - efi_memblock_x86_reserve_range(); + /* + * Do some memory reservations *before* memory is added to + * memblock, so memblock allocations won't overwrite it. + * Do it after early param, so we could get (unlikely) panic from + * serial. + * + * After this point everything still needed from the boot loader or + * firmware or kernel text should be early reserved or marked not + * RAM in e820. All other memory is free game. + */ + early_reserve_memory(); + #ifdef CONFIG_MEMORY_HOTPLUG /* * Memory used by the kernel cannot be hot-removed because Linux @@ -938,9 +947,6 @@ void __init setup_arch(char **cmdline_p) x86_report_nx(); - /* after early param, so could get panic from serial */ - memblock_x86_reserve_range_setup_data(); - if (acpi_mps_check()) { #ifdef CONFIG_X86_LOCAL_APIC disable_apic = 1; @@ -1032,14 +1038,12 @@ void __init setup_arch(char **cmdline_p) */ find_smp_config(); - reserve_ibft_region(); - early_alloc_pgt_buf(); /* * Need to conclude brk, before e820__memblock_setup() - * it could use memblock_find_in_range, could overlap with - * brk area. + * it could use memblock_find_in_range, could overlap with + * brk area. */ reserve_brk(); @@ -1054,8 +1058,6 @@ void __init setup_arch(char **cmdline_p) */ sev_setup_arch(); - reserve_bios_regions(); - efi_fake_memmap(); efi_find_mirror(); efi_esrt_init(); @@ -1081,8 +1083,12 @@ void __init setup_arch(char **cmdline_p) reserve_real_mode(); - trim_platform_memory_ranges(); - trim_low_memory_range(); + /* + * Reserving memory causing GPU hangs on Sandy Bridge integrated + * graphics devices should be done after we allocated memory under + * 1M for the real mode trampoline. + */ + trim_snb_memory(); init_mem_mapping(); @@ -1129,6 +1135,8 @@ void __init setup_arch(char **cmdline_p) reserve_initrd(); acpi_table_upgrade(); + /* Look for ACPI tables and reserve memory occupied by them. */ + acpi_boot_table_init(); vsmp_init(); @@ -1136,11 +1144,6 @@ void __init setup_arch(char **cmdline_p) early_platform_quirks(); - /* - * Parse the ACPI tables for possible boot-time SMP configuration. - */ - acpi_boot_table_init(); - early_acpi_boot_init(); initmem_init(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index fd945ce78554..0941d2f44f2a 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -224,7 +224,6 @@ void __init setup_per_cpu_areas(void) per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); - setup_stack_canary_segment(cpu); /* * Copy data used in early init routines from the * initial arrays to the per cpu data areas. 
These diff --git a/arch/x86/kernel/sev-es-shared.c b/arch/x86/kernel/sev-es-shared.c index cdc04d091242..0aa9f13efd57 100644 --- a/arch/x86/kernel/sev-es-shared.c +++ b/arch/x86/kernel/sev-es-shared.c @@ -24,7 +24,7 @@ static bool __init sev_es_check_cpu_features(void) return true; } -static void sev_es_terminate(unsigned int reason) +static void __noreturn sev_es_terminate(unsigned int reason) { u64 val = GHCB_SEV_TERMINATE; @@ -186,7 +186,6 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) * make it accessible to the hypervisor. * * In particular, check for: - * - Hypervisor CPUID bit * - Availability of CPUID leaf 0x8000001f * - SEV CPUID bit. * @@ -194,10 +193,7 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) * can't be checked here. */ - if ((fn == 1 && !(regs->cx & BIT(31)))) - /* Hypervisor bit */ - goto fail; - else if (fn == 0x80000000 && (regs->ax < 0x8000001f)) + if (fn == 0x80000000 && (regs->ax < 0x8000001f)) /* SEV leaf check */ goto fail; else if ((fn == 0x8000001f && !(regs->ax & BIT(1)))) @@ -210,12 +206,8 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) return; fail: - sev_es_wr_ghcb_msr(GHCB_SEV_TERMINATE); - VMGEXIT(); - - /* Shouldn't get here - if we do halt the machine */ - while (true) - asm volatile("hlt\n"); + /* Terminate the guest */ + sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); } static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c index 04a780abb512..73873b007838 100644 --- a/arch/x86/kernel/sev-es.c +++ b/arch/x86/kernel/sev-es.c @@ -137,29 +137,41 @@ static __always_inline bool on_vc_stack(struct pt_regs *regs) } /* - * This function handles the case when an NMI is raised in the #VC exception - * handler entry code. In this case, the IST entry for #VC must be adjusted, so - * that any subsequent #VC exception will not overwrite the stack contents of the - * interrupted #VC handler. + * This function handles the case when an NMI is raised in the #VC + * exception handler entry code, before the #VC handler has switched off + * its IST stack. In this case, the IST entry for #VC must be adjusted, + * so that any nested #VC exception will not overwrite the stack + * contents of the interrupted #VC handler. * * The IST entry is adjusted unconditionally so that it can be also be - * unconditionally adjusted back in sev_es_ist_exit(). Otherwise a nested - * sev_es_ist_exit() call may adjust back the IST entry too early. + * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a + * nested sev_es_ist_exit() call may adjust back the IST entry too + * early. + * + * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run + * on the NMI IST stack, as they are only called from NMI handling code + * right now. */ void noinstr __sev_es_ist_enter(struct pt_regs *regs) { unsigned long old_ist, new_ist; /* Read old IST entry */ - old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); + new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); - /* Make room on the IST stack */ + /* + * If NMI happened while on the #VC IST stack, set the new IST + * value below regs->sp, so that the interrupted stack frame is + * not overwritten by subsequent #VC exceptions. 
+ */ if (on_vc_stack(regs)) - new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist); - else - new_ist = old_ist - sizeof(old_ist); + new_ist = regs->sp; - /* Store old IST entry */ + /* + * Reserve additional 8 bytes and store old IST value so this + * adjustment can be unrolled in __sev_es_ist_exit(). + */ + new_ist -= sizeof(old_ist); *(unsigned long *)new_ist = old_ist; /* Set new IST entry */ @@ -251,39 +263,54 @@ static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt, return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); } -static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) +static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt) { char buffer[MAX_INSN_SIZE]; - enum es_result ret; int res; - if (user_mode(ctxt->regs)) { - res = insn_fetch_from_user_inatomic(ctxt->regs, buffer); - if (!res) { - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; - ctxt->fi.cr2 = ctxt->regs->ip; - return ES_EXCEPTION; - } + res = insn_fetch_from_user_inatomic(ctxt->regs, buffer); + if (!res) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; + ctxt->fi.cr2 = ctxt->regs->ip; + return ES_EXCEPTION; + } - if (!insn_decode(&ctxt->insn, ctxt->regs, buffer, res)) - return ES_DECODE_FAILED; - } else { - res = vc_fetch_insn_kernel(ctxt, buffer); - if (res) { - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = X86_PF_INSTR; - ctxt->fi.cr2 = ctxt->regs->ip; - return ES_EXCEPTION; - } + if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, res)) + return ES_DECODE_FAILED; - insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE - res, 1); - insn_get_length(&ctxt->insn); + if (ctxt->insn.immediate.got) + return ES_OK; + else + return ES_DECODE_FAILED; +} + +static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt) +{ + char buffer[MAX_INSN_SIZE]; + int res, ret; + + res = vc_fetch_insn_kernel(ctxt, buffer); + if (res) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_INSTR; + ctxt->fi.cr2 = ctxt->regs->ip; + return ES_EXCEPTION; } - ret = ctxt->insn.immediate.got ? ES_OK : ES_DECODE_FAILED; + ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64); + if (ret < 0) + return ES_DECODE_FAILED; + else + return ES_OK; +} - return ret; +static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) +{ + if (user_mode(ctxt->regs)) + return __vc_decode_user_insn(ctxt); + else + return __vc_decode_kern_insn(ctxt); } static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index ea794a083c44..a06cb107c0e8 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -492,7 +492,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, * SS descriptor, but we do need SS to be valid. It's possible * that the old SS is entirely bogus -- this can happen if the * signal we're trying to deliver is #GP or #SS caused by a bad - * SS value. We also have a compatbility issue here: DOSEMU + * SS value. We also have a compatibility issue here: DOSEMU * relies on the contents of the SS register indicating the * SS value at the time of the signal, even though that code in * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU @@ -766,30 +766,8 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) { - /* - * This function is fundamentally broken as currently - * implemented. 
- * - * The idea is that we want to trigger a call to the - * restart_block() syscall and that we want in_ia32_syscall(), - * in_x32_syscall(), etc. to match whatever they were in the - * syscall being restarted. We assume that the syscall - * instruction at (regs->ip - 2) matches whatever syscall - * instruction we used to enter in the first place. - * - * The problem is that we can get here when ptrace pokes - * syscall-like values into regs even if we're not in a syscall - * at all. - * - * For now, we maintain historical behavior and guess based on - * stored state. We could do better by saving the actual - * syscall arch in restart_block or (with caveats on x32) by - * checking if regs->ip points to 'int $0x80'. The current - * behavior is incorrect if a tracer has a different bitness - * than the tracee. - */ #ifdef CONFIG_IA32_EMULATION - if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED)) + if (current->restart_block.arch_data & TS_COMPAT) return __NR_ia32_restart_syscall; #endif #ifdef CONFIG_X86_X32_ABI diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index a5330ff498f0..0e5d0a7e203b 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -29,7 +29,7 @@ static inline void signal_compat_build_tests(void) BUILD_BUG_ON(NSIGFPE != 15); BUILD_BUG_ON(NSIGSEGV != 9); BUILD_BUG_ON(NSIGBUS != 5); - BUILD_BUG_ON(NSIGTRAP != 5); + BUILD_BUG_ON(NSIGTRAP != 6); BUILD_BUG_ON(NSIGCHLD != 6); BUILD_BUG_ON(NSIGSYS != 2); @@ -138,6 +138,9 @@ static inline void signal_compat_build_tests(void) BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x20); BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pkey) != 0x14); + BUILD_BUG_ON(offsetof(siginfo_t, si_perf) != 0x18); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf) != 0x10); + CHECK_CSI_OFFSET(_sigpoll); CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int)); CHECK_SI_SIZE (_sigpoll, 4*sizeof(int)); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index eff4ce3b10da..06db901fabe8 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -67,7 +67,7 @@ * 5AP. symmetric IO mode (normal Linux operation) not affected. * 'noapic' mode has vector 0xf filled out properly. * 6AP. 'noapic' mode might be affected - fixed in later steppings - * 7AP. We do not assume writes to the LVT deassering IRQs + * 7AP. We do not assume writes to the LVT deasserting IRQs * 8AP. We do not enable low power mode (deep sleep) during MP bootup * 9AP. We do not use mixed mode * @@ -204,7 +204,7 @@ static void native_stop_other_cpus(int wait) } /* * Don't wait longer than 10 ms if the caller didn't - * reqeust it. If wait is true, the machine hangs here if + * request it. If wait is true, the machine hangs here if * one or more CPUs do not reach shutdown state. */ timeout = USEC_PER_MSEC * 10; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 02813a7f3a7c..7ffb0cf3f997 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -458,29 +458,52 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } +static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + if (c->phys_proc_id == o->phys_proc_id && + c->cpu_die_id == o->cpu_die_id) + return true; + return false; +} + /* - * Define snc_cpu[] for SNC (Sub-NUMA Cluster) CPUs. + * Unlike the other levels, we do not enforce keeping a + * multicore group inside a NUMA node. If this happens, we will + * discard the MC level of the topology later. 
+ */ +static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + if (c->phys_proc_id == o->phys_proc_id) + return true; + return false; +} + +/* + * Define intel_cod_cpu[] for Intel COD (Cluster-on-Die) CPUs. * - * These are Intel CPUs that enumerate an LLC that is shared by - * multiple NUMA nodes. The LLC on these systems is shared for - * off-package data access but private to the NUMA node (half - * of the package) for on-package access. + * Any Intel CPU that has multiple nodes per package and does not + * match intel_cod_cpu[] has the SNC (Sub-NUMA Cluster) topology. * - * CPUID (the source of the information about the LLC) can only - * enumerate the cache as being shared *or* unshared, but not - * this particular configuration. The CPU in this case enumerates - * the cache to be shared across the entire package (spanning both - * NUMA nodes). + * When in SNC mode, these CPUs enumerate an LLC that is shared + * by multiple NUMA nodes. The LLC is shared for off-package data + * access but private to the NUMA node (half of the package) for + * on-package access. CPUID (the source of the information about + * the LLC) can only enumerate the cache as shared or unshared, + * but not this particular configuration. */ -static const struct x86_cpu_id snc_cpu[] = { - X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, NULL), +static const struct x86_cpu_id intel_cod_cpu[] = { + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, 0), /* COD */ + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, 0), /* COD */ + X86_MATCH_INTEL_FAM6_MODEL(ANY, 1), /* SNC */ {} }; static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { + const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu); int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + bool intel_snc = id && id->driver_data; /* Do not match if we do not have a valid APICID for cpu: */ if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID) @@ -495,32 +518,12 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) * means 'c' does not share the LLC of 'o'. This will be * reflected to userspace. */ - if (!topology_same_node(c, o) && x86_match_cpu(snc_cpu)) + if (match_pkg(c, o) && !topology_same_node(c, o) && intel_snc) return false; return topology_sane(c, o, "llc"); } -/* - * Unlike the other levels, we do not enforce keeping a - * multicore group inside a NUMA node. If this happens, we will - * discard the MC level of the topology later. 
- */ -static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) -{ - if (c->phys_proc_id == o->phys_proc_id) - return true; - return false; -} - -static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) -{ - if ((c->phys_proc_id == o->phys_proc_id) && - (c->cpu_die_id == o->cpu_die_id)) - return true; - return false; -} - #if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) static inline int x86_sched_itmt_flags(void) @@ -592,14 +595,23 @@ void set_cpu_sibling_map(int cpu) for_each_cpu(i, cpu_sibling_setup_mask) { o = &cpu_data(i); + if (match_pkg(c, o) && !topology_same_node(c, o)) + x86_has_numa_in_package = true; + if ((i == cpu) || (has_smt && match_smt(c, o))) link_mask(topology_sibling_cpumask, cpu, i); if ((i == cpu) || (has_mp && match_llc(c, o))) link_mask(cpu_llc_shared_mask, cpu, i); + if ((i == cpu) || (has_mp && match_die(c, o))) + link_mask(topology_die_cpumask, cpu, i); } + threads = cpumask_weight(topology_sibling_cpumask(cpu)); + if (threads > __max_smt_threads) + __max_smt_threads = threads; + /* * This needs a separate iteration over the cpus because we rely on all * topology_sibling_cpumask links to be set-up. @@ -613,8 +625,7 @@ void set_cpu_sibling_map(int cpu) /* * Does this new cpu bringup a new core? */ - if (cpumask_weight( - topology_sibling_cpumask(cpu)) == 1) { + if (threads == 1) { /* * for each core in package, increment * the booted_cores for this new cpu @@ -631,16 +642,7 @@ void set_cpu_sibling_map(int cpu) } else if (i != cpu && !c->booted_cores) c->booted_cores = cpu_data(i).booted_cores; } - if (match_pkg(c, o) && !topology_same_node(c, o)) - x86_has_numa_in_package = true; - - if ((i == cpu) || (has_mp && match_die(c, o))) - link_mask(topology_die_cpumask, cpu, i); } - - threads = cpumask_weight(topology_sibling_cpumask(cpu)); - if (threads > __max_smt_threads) - __max_smt_threads = threads; } /* maps the cpu to the sched domain representing multi-core */ @@ -1407,7 +1409,7 @@ void __init calculate_max_logical_packages(void) int ncpus; /* - * Today neither Intel nor AMD support heterogenous systems so + * Today neither Intel nor AMD support heterogeneous systems so * extrapolate the boot cpu's data to all packages. */ ncpus = cpu_data(0).booted_cores * topology_max_smt_threads(); @@ -1659,13 +1661,17 @@ void play_dead_common(void) local_irq_disable(); } -static bool wakeup_cpu0(void) +/** + * cond_wakeup_cpu0 - Wake up CPU0 if needed. + * + * If NMI wants to wake up CPU0, start CPU0. + */ +void cond_wakeup_cpu0(void) { if (smp_processor_id() == 0 && enable_start_cpu0) - return true; - - return false; + start_cpu0(); } +EXPORT_SYMBOL_GPL(cond_wakeup_cpu0); /* * We need to flush the caches before going to sleep, lest we have @@ -1734,11 +1740,8 @@ static inline void mwait_play_dead(void) __monitor(mwait_ptr, 0, 0); mb(); __mwait(eax, 0); - /* - * If NMI wants to wake up CPU0, start CPU0. - */ - if (wakeup_cpu0()) - start_cpu0(); + + cond_wakeup_cpu0(); } } @@ -1749,11 +1752,8 @@ void hlt_play_dead(void) while (1) { native_halt(); - /* - * If NMI wants to wake up CPU0, start CPU0. - */ - if (wakeup_cpu0()) - start_cpu0(); + + cond_wakeup_cpu0(); } } diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 8627fda8d993..15b058eefc4e 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -29,12 +29,6 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, } } -/* - * This function returns an error if it detects any unreliable features of the - * stack. 
Otherwise it guarantees that the stack trace is reliable. - * - * If the task is not 'current', the caller *must* ensure the task is inactive. - */ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, void *cookie, struct task_struct *task) { diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index 9442c4136c38..ea028e736831 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -34,7 +34,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, void break; case NOP: - code = ideal_nops[NOP_ATOMIC5]; + code = x86_nops[5]; break; case JMP: @@ -66,7 +66,7 @@ static void __static_call_validate(void *insn, bool tail) return; } else { if (opcode == CALL_INSN_OPCODE || - !memcmp(insn, ideal_nops[NOP_ATOMIC5], 5) || + !memcmp(insn, x86_nops[5], 5) || !memcmp(insn, xor5rax, 5)) return; } diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c index 653b7f617b61..8a56a6d80098 100644 --- a/arch/x86/kernel/sysfb_efi.c +++ b/arch/x86/kernel/sysfb_efi.c @@ -10,7 +10,7 @@ * EFI Quirks * Several EFI systems do not correctly advertise their boot framebuffers. * Hence, we use this static table of known broken machines and fix up the - * information so framebuffer drivers can load corectly. + * information so framebuffer drivers can load correctly. */ #include <linux/dmi.h> diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 4c09ba110204..f9af561c3cd4 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -49,6 +49,30 @@ bool tboot_enabled(void) return tboot != NULL; } +/* noinline to prevent gcc from warning about dereferencing constant fixaddr */ +static noinline __init bool check_tboot_version(void) +{ + if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) { + pr_warn("tboot at 0x%llx is invalid\n", boot_params.tboot_addr); + return false; + } + + if (tboot->version < 5) { + pr_warn("tboot version is invalid: %u\n", tboot->version); + return false; + } + + pr_info("found shared page at phys addr 0x%llx:\n", + boot_params.tboot_addr); + pr_debug("version: %d\n", tboot->version); + pr_debug("log_addr: 0x%08x\n", tboot->log_addr); + pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry); + pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base); + pr_debug("tboot_size: 0x%x\n", tboot->tboot_size); + + return true; +} + void __init tboot_probe(void) { /* Look for valid page-aligned address for shared page. */ @@ -66,25 +90,9 @@ void __init tboot_probe(void) /* Map and check for tboot UUID. 
*/ set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr); - tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE); - if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) { - pr_warn("tboot at 0x%llx is invalid\n", boot_params.tboot_addr); + tboot = (void *)fix_to_virt(FIX_TBOOT_BASE); + if (!check_tboot_version()) tboot = NULL; - return; - } - if (tboot->version < 5) { - pr_warn("tboot version is invalid: %u\n", tboot->version); - tboot = NULL; - return; - } - - pr_info("found shared page at phys addr 0x%llx:\n", - boot_params.tboot_addr); - pr_debug("version: %d\n", tboot->version); - pr_debug("log_addr: 0x%08x\n", tboot->log_addr); - pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry); - pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base); - pr_debug("tboot_size: 0x%x\n", tboot->tboot_size); } static pgd_t *tboot_pg_dir; diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 64a496a0687f..3c883e064242 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -164,17 +164,11 @@ int do_set_thread_area(struct task_struct *p, int idx, savesegment(fs, sel); if (sel == modified_sel) loadsegment(fs, sel); - - savesegment(gs, sel); - if (sel == modified_sel) - load_gs_index(sel); #endif -#ifdef CONFIG_X86_32_LAZY_GS savesegment(gs, sel); if (sel == modified_sel) - loadsegment(gs, sel); -#endif + load_gs_index(sel); } else { #ifdef CONFIG_X86_64 if (p->thread.fsindex == modified_sel) diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index f5477eab5692..bd83748e2bde 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -113,7 +113,7 @@ int arch_register_cpu(int num) * Two known BSP/CPU0 dependencies: Resume from suspend/hibernate * depends on BSP. PIC interrupts depend on BSP. * - * If the BSP depencies are under control, one can tell kernel to + * If the BSP dependencies are under control, one can tell kernel to * enable BSP hotplug. This basically adds a control file and * one can attempt to offline BSP. */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ac1874a2a70e..853ea7a80806 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -395,7 +395,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault) /* * Adjust our frame so that we return straight to the #GP * vector with the expected RSP value. This is safe because - * we won't enable interupts or schedule before we invoke + * we won't enable interrupts or schedule before we invoke * general_protection, so nothing will clobber the stack * frame we just set up. 
 *
@@ -498,14 +498,15 @@ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs,
 {
 u8 insn_buf[MAX_INSN_SIZE];
 struct insn insn;
+ int ret;
 if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE))
 return GP_NO_HINT;
- kernel_insn_init(&insn, insn_buf, MAX_INSN_SIZE);
- insn_get_modrm(&insn);
- insn_get_sib(&insn);
+ ret = insn_decode_kernel(&insn, insn_buf);
+ if (ret < 0)
+ return GP_NO_HINT;
 *addr = (unsigned long)insn_get_addr_ref(&insn, regs);
 if (*addr == -1UL)
@@ -556,7 +557,7 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
 tsk->thread.trap_nr = X86_TRAP_GP;
 if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0))
- return;
+ goto exit;
 show_signal(tsk, SIGSEGV, "", desc, regs, error_code);
 force_sig(SIGSEGV);
@@ -889,9 +890,6 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs,
 if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs))
 dr6 &= ~DR_STEP;
- if (kprobe_debug_handler(regs))
- goto out;
-
 /*
 * The kernel doesn't use INT1
 */
@@ -978,6 +976,10 @@ static __always_inline void exc_debug_user(struct pt_regs *regs,
 goto out_irq;
 }
+ /* #DB for bus lock can only be triggered from userspace. */
+ if (dr6 & DR_BUS_LOCK)
+ handle_bus_lock(regs);
+
 /* Add the virtual_dr6 bits for signals. */
 dr6 |= current->thread.virtual_dr6;
 if (dr6 & (DR_STEP | DR_TRAP_BITS) || icebp)
@@ -1057,7 +1059,7 @@ static void math_error(struct pt_regs *regs, int trapnr)
 goto exit;
 if (fixup_vdso_exception(regs, trapnr, 0, 0))
- return;
+ goto exit;
 force_sig_fault(SIGFPE, si_code,
 (void __user *)uprobe_get_trap_addr(regs));
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index f70dffc2771f..57ec01192180 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c
@@ -14,6 +14,7 @@
 #include <linux/percpu.h>
 #include <linux/timex.h>
 #include <linux/static_key.h>
+#include <linux/static_call.h>
 #include <asm/hpet.h>
 #include <asm/timer.h>
@@ -254,7 +255,7 @@ unsigned long long sched_clock(void)
 bool using_native_sched_clock(void)
 {
- return pv_ops.time.sched_clock == native_sched_clock;
+ return static_call_query(pv_sched_clock) == native_sched_clock;
 }
 #else
 unsigned long long
@@ -739,7 +740,7 @@ static unsigned long pit_hpet_ptimer_calibrate_cpu(void)
 * 2) Reference counter. If available we use the HPET or the
 * PMTIMER as a reference to check the sanity of that value.
 * We use separate TSC readouts and check inside of the
- * reference read for any possible disturbance. We dicard
+ * reference read for any possible disturbance. We discard
 * disturbed values here as well. We do that around the PIT
 * calibration delay loop as we have to wait for a certain
 * amount of time anyway.
@@ -1079,7 +1080,7 @@ static void tsc_resume(struct clocksource *cs)
 * very small window right after one CPU updated cycle_last under
 * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
 * is smaller than the cycle_last reference value due to a TSC which
- * is slighty behind. This delta is nowhere else observable, but in
+ * is slightly behind. This delta is nowhere else observable, but in
 * that case it results in a forward time jump in the range of hours
 * due to the unsigned delta calculation of the time keeping core
 * code, which is necessary to support wrapping clocksources like pm
@@ -1264,7 +1265,7 @@ EXPORT_SYMBOL(convert_art_to_tsc);
 * corresponding clocksource
 * @cycles: System counter value
 * @cs: Clocksource corresponding to system counter value. Used
- * by timekeeping code to verify comparibility of two cycle
+ * by timekeeping code to verify comparability of two cycle
 * values.
 */
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 3d3c761eb74a..50a4515fe0ad 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c
@@ -472,7 +472,7 @@ retry:
 /*
 * Add the result to the previous adjustment value.
 *
- * The adjustement value is slightly off by the overhead of the
+ * The adjustment value is slightly off by the overhead of the
 * sync mechanism (observed values are ~200 TSC cycles), but this
 * really depends on CPU, node distance and frequency. So
 * compensating for this is hard to get right. Experiments show
diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index f6225bf22c02..8daa70b0d2da 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c
@@ -272,7 +272,7 @@ static int emulate_umip_insn(struct insn *insn, int umip_inst,
 * by whether the operand is a register or a memory location.
 * If operand is a register, return as many bytes as the operand
 * size. If operand is memory, return only the two least
- * siginificant bytes.
+ * significant bytes.
 */
 if (X86_MODRM_MOD(insn->modrm.value) == 3)
 *data_size = insn->opnd_bytes;
@@ -356,7 +356,7 @@ bool fixup_umip_exception(struct pt_regs *regs)
 if (!nr_copied)
 return false;
- if (!insn_decode(&insn, regs, buf, nr_copied))
+ if (!insn_decode_from_regs(&insn, regs, buf, nr_copied))
 return false;
 umip_inst = identify_insn(&insn);
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index a2b413394917..b63cf8f7745e 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c
@@ -276,12 +276,12 @@ static bool is_prefix_bad(struct insn *insn)
 static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool x86_64)
 {
+ enum insn_mode m = x86_64 ? INSN_MODE_64 : INSN_MODE_32;
 u32 volatile *good_insns;
+ int ret;
- insn_init(insn, auprobe->insn, sizeof(auprobe->insn), x86_64);
- /* has the side-effect of processing the entire instruction */
- insn_get_length(insn);
- if (!insn_complete(insn))
+ ret = insn_decode(insn, auprobe->insn, sizeof(auprobe->insn), m);
+ if (ret < 0)
 return -ENOEXEC;
 if (is_prefix_bad(insn))
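The traps.c and uprobes.c hunks above replace the old multi-step decoder calls (insn_init() followed by insn_get_length()/insn_get_modrm()/insn_get_sib()) with a single insn_decode() call that validates the whole instruction and reports failure through its return value, while the umip.c hunk renames the older pt_regs-based helper to insn_decode_from_regs(). Below is a minimal sketch of the new calling convention, not part of the patch; decode_example() is an invented name, and the sketch assumes the <asm/insn.h> interface these hunks rely on (insn_decode() returning 0 on success and a negative value on failure).

/* Illustrative sketch only; not part of the diff above. */
#include <linux/types.h>
#include <asm/insn.h>

static int decode_example(const u8 *buf, int buf_len, bool x86_64)
{
	struct insn insn;
	enum insn_mode mode = x86_64 ? INSN_MODE_64 : INSN_MODE_32;
	int ret;

	/* One call replaces insn_init() + insn_get_*() and validates the result. */
	ret = insn_decode(&insn, buf, buf_len, mode);
	if (ret < 0)
		return ret;	/* not a complete, decodable instruction */

	return insn.length;	/* what insn_get_length() used to provide */
}

For kernel text at a known address, the traps.c hunk uses the insn_decode_kernel(&insn, ptr) wrapper instead, which decodes from a buffer of up to MAX_INSN_SIZE bytes in kernel mode.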
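Earlier in this diff, the smpboot.c hunk introduces intel_cod_cpu[] and encodes the topology variant in the driver_data field of each x86_cpu_id entry: 0 for the Cluster-on-Die parts, 1 for the catch-all Sub-NUMA Cluster entry, which match_llc() then recovers through x86_match_cpu(). A minimal sketch of that table-lookup pattern follows; it is not part of the patch, the table contents and function names are invented, and it assumes the <asm/cpu_device_id.h> helpers the hunk uses.

/* Illustrative sketch only; not part of the diff above. */
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>

static const struct x86_cpu_id example_cpu_table[] = {
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, 0),	/* driver_data = 0 */
	X86_MATCH_INTEL_FAM6_MODEL(ANY, 1),		/* driver_data = 1, any other model */
	{}						/* terminating empty entry */
};

static bool example_cpu_flag(void)
{
	/* x86_match_cpu() returns the first entry matching the boot CPU, or NULL. */
	const struct x86_cpu_id *id = x86_match_cpu(example_cpu_table);

	return id && id->driver_data;
}

Because entries are tried in order, the specific models are listed ahead of the ANY entry, just as the new intel_cod_cpu[] table lists the COD parts before the SNC catch-all.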
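The tsc.c hunk above stops comparing the pv_ops.time.sched_clock pointer directly and instead asks the static-call machinery where pv_sched_clock currently points. A minimal sketch of querying a static call's current target, not part of the patch and with invented names, assuming the <linux/static_call.h> interface (DEFINE_STATIC_CALL() and static_call_query()) that the hunk uses:

/* Illustrative sketch only; not part of the diff above. */
#include <linux/static_call.h>

static unsigned long long example_clock(void)
{
	return 0;
}

DEFINE_STATIC_CALL(example_sched_clock, example_clock);

static bool example_clock_is_default(void)
{
	/* static_call_query() returns the function the call currently points at. */
	return static_call_query(example_sched_clock) == example_clock;
}

If the call had been redirected with static_call_update(), the comparison would fail, which is how using_native_sched_clock() now detects a paravirt override.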