diff options
Diffstat (limited to 'arch/x86')
189 files changed, 5289 insertions, 2433 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 5a83da703e87..6a1f36df6a18 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -28,5 +28,7 @@ obj-y += net/ obj-$(CONFIG_KEXEC_FILE) += purgatory/ +obj-y += virt/svm/ + # for cleaning subdir- += boot tools diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5edec175b9bf..0f869c4785b8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -496,6 +496,15 @@ config X86_CPU_RESCTRL Say N if unsure. +config X86_FRED + bool "Flexible Return and Event Delivery" + depends on X86_64 + help + When enabled, try to use Flexible Return and Event Delivery + instead of the legacy SYSCALL/SYSENTER/IDT architecture for + ring transitions and exception/interrupt handling if the + system supports. + if X86_32 config X86_BIGSMP bool "Support for big SMP systems with more than 8 CPUs" @@ -1539,19 +1548,6 @@ config AMD_MEM_ENCRYPT This requires an AMD processor that supports Secure Memory Encryption (SME). -config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT - bool "Activate AMD Secure Memory Encryption (SME) by default" - depends on AMD_MEM_ENCRYPT - help - Say yes to have system memory encrypted by default if running on - an AMD processor that supports Secure Memory Encryption (SME). - - If set to Y, then the encryption of system memory can be - deactivated with the mem_encrypt=off command line option. - - If set to N, then the encryption of system memory can be - activated with the mem_encrypt=on command line option. - # Common NUMA Features config NUMA bool "NUMA Memory Allocation and Scheduler Support" diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index b9224cf2ee4d..2a7279d80460 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -379,7 +379,7 @@ config X86_CMOV config X86_MINIMUM_CPU_FAMILY int default "64" if X86_64 - default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCRUSOE || MCORE2 || MK7 || MK8) + default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCORE2 || MK7 || MK8) default "5" if X86_32 && X86_CMPXCHG64 default "4" diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1a068de12a56..da8f3caf2781 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -112,13 +112,13 @@ ifeq ($(CONFIG_X86_32),y) # temporary until string.h is fixed KBUILD_CFLAGS += -ffreestanding - ifeq ($(CONFIG_STACKPROTECTOR),y) - ifeq ($(CONFIG_SMP),y) + ifeq ($(CONFIG_STACKPROTECTOR),y) + ifeq ($(CONFIG_SMP),y) KBUILD_CFLAGS += -mstack-protector-guard-reg=fs -mstack-protector-guard-symbol=__stack_chk_guard - else + else KBUILD_CFLAGS += -mstack-protector-guard=global - endif - endif + endif + endif else BITS := 64 UTS_MACHINE := x86_64 diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 454acd7a2daf..9db630238034 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -304,6 +304,10 @@ void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) if (result != ES_OK) goto finish; + result = vc_check_opcode_bytes(&ctxt, exit_code); + if (result != ES_OK) + goto finish; + switch (exit_code) { case SVM_EXIT_RDTSC: case SVM_EXIT_RDTSCP: @@ -365,7 +369,7 @@ static void enforce_vmpl0(void) MSR_AMD64_SNP_VMPL_SSS | \ MSR_AMD64_SNP_SECURE_TSC | \ MSR_AMD64_SNP_VMGEXIT_PARAM | \ - MSR_AMD64_SNP_VMSA_REG_PROTECTION | \ + MSR_AMD64_SNP_VMSA_REG_PROT | \ MSR_AMD64_SNP_RESERVED_BIT13 | \ MSR_AMD64_SNP_RESERVED_BIT15 | \ MSR_AMD64_SNP_RESERVED_MASK) diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index b2771710ed98..a1bbedd989e4 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -106,8 +106,7 @@ extra_header_fields: .word 0 # MinorSubsystemVersion .long 0 # Win32VersionValue - .long setup_size + ZO__end + pecompat_vsize - # SizeOfImage + .long setup_size + ZO__end # SizeOfImage .long salign # SizeOfHeaders .long 0 # CheckSum @@ -143,7 +142,7 @@ section_table: .ascii ".setup" .byte 0 .byte 0 - .long setup_size - salign # VirtualSize + .long pecompat_fstart - salign # VirtualSize .long salign # VirtualAddress .long pecompat_fstart - salign # SizeOfRawData .long salign # PointerToRawData @@ -156,8 +155,8 @@ section_table: #ifdef CONFIG_EFI_MIXED .asciz ".compat" - .long 8 # VirtualSize - .long setup_size + ZO__end # VirtualAddress + .long pecompat_fsize # VirtualSize + .long pecompat_fstart # VirtualAddress .long pecompat_fsize # SizeOfRawData .long pecompat_fstart # PointerToRawData @@ -172,17 +171,16 @@ section_table: * modes this image supports. */ .pushsection ".pecompat", "a", @progbits - .balign falign - .set pecompat_vsize, salign + .balign salign .globl pecompat_fstart pecompat_fstart: .byte 0x1 # Version .byte 8 # Size .word IMAGE_FILE_MACHINE_I386 # PE machine type .long setup_size + ZO_efi32_pe_entry # Entrypoint + .byte 0x0 # Sentinel .popsection #else - .set pecompat_vsize, 0 .set pecompat_fstart, setup_size #endif .ascii ".text" diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld index 83bb7efad8ae..3a2d1360abb0 100644 --- a/arch/x86/boot/setup.ld +++ b/arch/x86/boot/setup.ld @@ -24,6 +24,9 @@ SECTIONS .text : { *(.text .text.*) } .text32 : { *(.text32) } + .pecompat : { *(.pecompat) } + PROVIDE(pecompat_fsize = setup_size - pecompat_fstart); + . = ALIGN(16); .rodata : { *(.rodata*) } @@ -36,9 +39,6 @@ SECTIONS . = ALIGN(16); .data : { *(.data*) } - .pecompat : { *(.pecompat) } - PROVIDE(pecompat_fsize = setup_size - pecompat_fstart); - .signature : { setup_sig = .; LONG(0x5a5aaa55) diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index eeec9986570e..d07be9d05cd0 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -14,7 +14,7 @@ #include <asm/processor.h> enum cc_vendor cc_vendor __ro_after_init = CC_VENDOR_NONE; -static u64 cc_mask __ro_after_init; +u64 cc_mask __ro_after_init; static bool noinstr intel_cc_platform_has(enum cc_attr attr) { @@ -148,8 +148,3 @@ u64 cc_mkdec(u64 val) } } EXPORT_SYMBOL_GPL(cc_mkdec); - -__init void cc_set_mask(u64 mask) -{ - cc_mask = mask; -} diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index ca2fe186994b..c93e7f5c2a06 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -18,6 +18,9 @@ obj-y += vdso/ obj-y += vsyscall/ obj-$(CONFIG_PREEMPTION) += thunk_$(BITS).o +CFLAGS_entry_fred.o += -fno-stack-protector +CFLAGS_REMOVE_entry_fred.o += -pg $(CC_FLAGS_FTRACE) +obj-$(CONFIG_X86_FRED) += entry_64_fred.o entry_fred.o + obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o obj-$(CONFIG_X86_X32_ABI) += syscall_x32.o - diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 9f1d94790a54..1c3b561528fd 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -65,7 +65,7 @@ For 32-bit we have the following conventions - kernel is built with * for assembly code: */ -.macro PUSH_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 +.macro PUSH_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 unwind_hint=1 .if \save_ret pushq %rsi /* pt_regs->si */ movq 8(%rsp), %rsi /* temporarily store the return address in %rsi */ @@ -87,14 +87,17 @@ For 32-bit we have the following conventions - kernel is built with pushq %r13 /* pt_regs->r13 */ pushq %r14 /* pt_regs->r14 */ pushq %r15 /* pt_regs->r15 */ + + .if \unwind_hint UNWIND_HINT_REGS + .endif .if \save_ret pushq %rsi /* return address on top of stack */ .endif .endm -.macro CLEAR_REGS +.macro CLEAR_REGS clear_bp=1 /* * Sanitize registers of values that a speculation attack might * otherwise want to exploit. The lower registers are likely clobbered @@ -109,7 +112,9 @@ For 32-bit we have the following conventions - kernel is built with xorl %r10d, %r10d /* nospec r10 */ xorl %r11d, %r11d /* nospec r11 */ xorl %ebx, %ebx /* nospec rbx */ + .if \clear_bp xorl %ebp, %ebp /* nospec rbp */ + .endif xorl %r12d, %r12d /* nospec r12 */ xorl %r13d, %r13d /* nospec r13 */ xorl %r14d, %r14d /* nospec r14 */ @@ -117,9 +122,9 @@ For 32-bit we have the following conventions - kernel is built with .endm -.macro PUSH_AND_CLEAR_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 - PUSH_REGS rdx=\rdx, rcx=\rcx, rax=\rax, save_ret=\save_ret - CLEAR_REGS +.macro PUSH_AND_CLEAR_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 clear_bp=1 unwind_hint=1 + PUSH_REGS rdx=\rdx, rcx=\rcx, rax=\rax, save_ret=\save_ret unwind_hint=\unwind_hint + CLEAR_REGS clear_bp=\clear_bp .endm .macro POP_REGS pop_rdi=1 @@ -239,17 +244,19 @@ For 32-bit we have the following conventions - kernel is built with .Ldone_\@: .endm -.macro RESTORE_CR3 scratch_reg:req save_reg:req +/* Restore CR3 from a kernel context. May restore a user CR3 value. */ +.macro PARANOID_RESTORE_CR3 scratch_reg:req save_reg:req ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI - ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID - /* - * KERNEL pages can always resume with NOFLUSH as we do - * explicit flushes. + * If CR3 contained the kernel page tables at the paranoid exception + * entry, then there is nothing to restore as CR3 is not modified while + * handling the exception. */ bt $PTI_USER_PGTABLE_BIT, \save_reg - jnc .Lnoflush_\@ + jnc .Lend_\@ + + ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID /* * Check if there's a pending flush for the user ASID we're @@ -257,20 +264,12 @@ For 32-bit we have the following conventions - kernel is built with */ movq \save_reg, \scratch_reg andq $(0x7FF), \scratch_reg - bt \scratch_reg, THIS_CPU_user_pcid_flush_mask - jnc .Lnoflush_\@ - btr \scratch_reg, THIS_CPU_user_pcid_flush_mask - jmp .Lwrcr3_\@ + jc .Lwrcr3_\@ -.Lnoflush_\@: SET_NOFLUSH_BIT \save_reg .Lwrcr3_\@: - /* - * The CR3 write could be avoided when not changing its value, - * but would require a CR3 read *and* a scratch register. - */ movq \save_reg, %cr3 .Lend_\@: .endm @@ -285,7 +284,7 @@ For 32-bit we have the following conventions - kernel is built with .endm .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req .endm -.macro RESTORE_CR3 scratch_reg:req save_reg:req +.macro PARANOID_RESTORE_CR3 scratch_reg:req save_reg:req .endm #endif diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S index 8c8d38f0cb1d..003379049924 100644 --- a/arch/x86/entry/entry.S +++ b/arch/x86/entry/entry.S @@ -6,6 +6,9 @@ #include <linux/export.h> #include <linux/linkage.h> #include <asm/msr-index.h> +#include <asm/unwind_hints.h> +#include <asm/segment.h> +#include <asm/cache.h> .pushsection .noinstr.text, "ax" @@ -20,3 +23,23 @@ SYM_FUNC_END(entry_ibpb) EXPORT_SYMBOL_GPL(entry_ibpb); .popsection + +/* + * Define the VERW operand that is disguised as entry code so that + * it can be referenced with KPTI enabled. This ensure VERW can be + * used late in exit-to-user path after page tables are switched. + */ +.pushsection .entry.text, "ax" + +.align L1_CACHE_BYTES, 0xcc +SYM_CODE_START_NOALIGN(mds_verw_sel) + UNWIND_HINT_UNDEFINED + ANNOTATE_NOENDBR + .word __KERNEL_DS +.align L1_CACHE_BYTES, 0xcc +SYM_CODE_END(mds_verw_sel); +/* For KVM */ +EXPORT_SYMBOL_GPL(mds_verw_sel); + +.popsection + diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index c73047bf9f4b..b8be0164385c 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -649,10 +649,6 @@ SYM_CODE_START_LOCAL(asm_\cfunc) SYM_CODE_END(asm_\cfunc) .endm -.macro idtentry_sysvec vector cfunc - idtentry \vector asm_\cfunc \cfunc has_error_code=0 -.endm - /* * Include the defines which emit the idt entries which are shared * shared between 32 and 64 bit and emit the __irqentry_text_* markers @@ -885,6 +881,7 @@ SYM_FUNC_START(entry_SYSENTER_32) BUG_IF_WRONG_CR3 no_user_check=1 popfl popl %eax + CLEAR_CPU_BUFFERS /* * Return back to the vDSO, which will pop ecx and edx. @@ -954,6 +951,7 @@ restore_all_switch_stack: /* Restore user state */ RESTORE_REGS pop=4 # skip orig_eax/error_code + CLEAR_CPU_BUFFERS .Lirq_return: /* * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization @@ -1146,6 +1144,7 @@ SYM_CODE_START(asm_exc_nmi) /* Not on SYSENTER stack. */ call exc_nmi + CLEAR_CPU_BUFFERS jmp .Lnmi_return .Lnmi_from_sysenter_stack: diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index c40f89ab1b4c..a19ed151a813 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -161,6 +161,7 @@ syscall_return_via_sysret: SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL) ANNOTATE_NOENDBR swapgs + CLEAR_CPU_BUFFERS sysretq SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR @@ -247,7 +248,13 @@ SYM_CODE_START(ret_from_fork_asm) * and unwind should work normally. */ UNWIND_HINT_REGS + +#ifdef CONFIG_X86_FRED + ALTERNATIVE "jmp swapgs_restore_regs_and_return_to_usermode", \ + "jmp asm_fred_exit_user", X86_FEATURE_FRED +#else jmp swapgs_restore_regs_and_return_to_usermode +#endif SYM_CODE_END(ret_from_fork_asm) .popsection @@ -370,14 +377,6 @@ SYM_CODE_END(\asmsym) idtentry \vector asm_\cfunc \cfunc has_error_code=1 .endm -/* - * System vectors which invoke their handlers directly and are not - * going through the regular common device interrupt handling code. - */ -.macro idtentry_sysvec vector cfunc - idtentry \vector asm_\cfunc \cfunc has_error_code=0 -.endm - /** * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB * @vector: Vector number @@ -573,6 +572,7 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) .Lswapgs_and_iret: swapgs + CLEAR_CPU_BUFFERS /* Assert that the IRET frame indicates user mode. */ testb $3, 8(%rsp) jnz .Lnative_iret @@ -723,6 +723,8 @@ native_irq_return_ldt: */ popq %rax /* Restore user RAX */ + CLEAR_CPU_BUFFERS + /* * RSP now points to an ordinary IRET frame, except that the page * is read-only and RSP[31:16] are preloaded with the userspace @@ -968,14 +970,14 @@ SYM_CODE_START_LOCAL(paranoid_exit) IBRS_EXIT save_reg=%r15 /* - * The order of operations is important. RESTORE_CR3 requires + * The order of operations is important. PARANOID_RESTORE_CR3 requires * kernel GSBASE. * * NB to anyone to try to optimize this code: this code does * not execute at all for exceptions from user mode. Those * exceptions go through error_return instead. */ - RESTORE_CR3 scratch_reg=%rax save_reg=%r14 + PARANOID_RESTORE_CR3 scratch_reg=%rax save_reg=%r14 /* Handle the three GSBASE cases */ ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE @@ -1404,8 +1406,7 @@ end_repeat_nmi: /* Always restore stashed SPEC_CTRL value (see paranoid_entry) */ IBRS_EXIT save_reg=%r15 - /* Always restore stashed CR3 value (see paranoid_entry) */ - RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 + PARANOID_RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 /* * The above invocation of paranoid_entry stored the GSBASE @@ -1450,6 +1451,12 @@ nmi_restore: movq $0, 5*8(%rsp) /* clear "NMI executing" */ /* + * Skip CLEAR_CPU_BUFFERS here, since it only helps in rare cases like + * NMI in kernel after user state is restored. For an unprivileged user + * these conditions are hard to meet. + */ + + /* * iretq reads the "iret" frame and exits the NMI stack in a * single instruction. We are returning to kernel mode, so this * cannot result in a fault. Similarly, we don't need to worry @@ -1466,6 +1473,7 @@ SYM_CODE_START(entry_SYSCALL32_ignore) UNWIND_HINT_END_OF_STACK ENDBR mov $-ENOSYS, %eax + CLEAR_CPU_BUFFERS sysretl SYM_CODE_END(entry_SYSCALL32_ignore) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index de94e2e84ecc..eabf48c4d4b4 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -270,6 +270,7 @@ SYM_INNER_LABEL(entry_SYSRETL_compat_unsafe_stack, SYM_L_GLOBAL) xorl %r9d, %r9d xorl %r10d, %r10d swapgs + CLEAR_CPU_BUFFERS sysretl SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S new file mode 100644 index 000000000000..a02bc6f3d2e6 --- /dev/null +++ b/arch/x86/entry/entry_64_fred.S @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * The actual FRED entry points. + */ + +#include <linux/export.h> + +#include <asm/asm.h> +#include <asm/fred.h> +#include <asm/segment.h> + +#include "calling.h" + + .code64 + .section .noinstr.text, "ax" + +.macro FRED_ENTER + UNWIND_HINT_END_OF_STACK + ENDBR + PUSH_AND_CLEAR_REGS + movq %rsp, %rdi /* %rdi -> pt_regs */ +.endm + +.macro FRED_EXIT + UNWIND_HINT_REGS + POP_REGS +.endm + +/* + * The new RIP value that FRED event delivery establishes is + * IA32_FRED_CONFIG & ~FFFH for events that occur in ring 3. + * Thus the FRED ring 3 entry point must be 4K page aligned. + */ + .align 4096 + +SYM_CODE_START_NOALIGN(asm_fred_entrypoint_user) + FRED_ENTER + call fred_entry_from_user +SYM_INNER_LABEL(asm_fred_exit_user, SYM_L_GLOBAL) + FRED_EXIT +1: ERETU + + _ASM_EXTABLE_TYPE(1b, asm_fred_entrypoint_user, EX_TYPE_ERETU) +SYM_CODE_END(asm_fred_entrypoint_user) + +/* + * The new RIP value that FRED event delivery establishes is + * (IA32_FRED_CONFIG & ~FFFH) + 256 for events that occur in + * ring 0, i.e., asm_fred_entrypoint_user + 256. + */ + .org asm_fred_entrypoint_user + 256, 0xcc +SYM_CODE_START_NOALIGN(asm_fred_entrypoint_kernel) + FRED_ENTER + call fred_entry_from_kernel + FRED_EXIT + ERETS +SYM_CODE_END(asm_fred_entrypoint_kernel) + +#if IS_ENABLED(CONFIG_KVM_INTEL) +SYM_FUNC_START(asm_fred_entry_from_kvm) + push %rbp + mov %rsp, %rbp + + UNWIND_HINT_SAVE + + /* + * Both IRQ and NMI from VMX can be handled on current task stack + * because there is no need to protect from reentrancy and the call + * stack leading to this helper is effectively constant and shallow + * (relatively speaking). Do the same when FRED is active, i.e., no + * need to check current stack level for a stack switch. + * + * Emulate the FRED-defined redzone and stack alignment. + */ + sub $(FRED_CONFIG_REDZONE_AMOUNT << 6), %rsp + and $FRED_STACK_FRAME_RSP_MASK, %rsp + + /* + * Start to push a FRED stack frame, which is always 64 bytes: + * + * +--------+-----------------+ + * | Bytes | Usage | + * +--------+-----------------+ + * | 63:56 | Reserved | + * | 55:48 | Event Data | + * | 47:40 | SS + Event Info | + * | 39:32 | RSP | + * | 31:24 | RFLAGS | + * | 23:16 | CS + Aux Info | + * | 15:8 | RIP | + * | 7:0 | Error Code | + * +--------+-----------------+ + */ + push $0 /* Reserved, must be 0 */ + push $0 /* Event data, 0 for IRQ/NMI */ + push %rdi /* fred_ss handed in by the caller */ + push %rbp + pushf + mov $__KERNEL_CS, %rax + push %rax + + /* + * Unlike the IDT event delivery, FRED _always_ pushes an error code + * after pushing the return RIP, thus the CALL instruction CANNOT be + * used here to push the return RIP, otherwise there is no chance to + * push an error code before invoking the IRQ/NMI handler. + * + * Use LEA to get the return RIP and push it, then push an error code. + */ + lea 1f(%rip), %rax + push %rax /* Return RIP */ + push $0 /* Error code, 0 for IRQ/NMI */ + + PUSH_AND_CLEAR_REGS clear_bp=0 unwind_hint=0 + movq %rsp, %rdi /* %rdi -> pt_regs */ + call __fred_entry_from_kvm /* Call the C entry point */ + POP_REGS + ERETS +1: + /* + * Objtool doesn't understand what ERETS does, this hint tells it that + * yes, we'll reach here and with what stack state. A save/restore pair + * isn't strictly needed, but it's the simplest form. + */ + UNWIND_HINT_RESTORE + pop %rbp + RET + +SYM_FUNC_END(asm_fred_entry_from_kvm) +EXPORT_SYMBOL_GPL(asm_fred_entry_from_kvm); +#endif diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c new file mode 100644 index 000000000000..ac120cbdaaf2 --- /dev/null +++ b/arch/x86/entry/entry_fred.c @@ -0,0 +1,294 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * The FRED specific kernel/user entry functions which are invoked from + * assembly code and dispatch to the associated handlers. + */ +#include <linux/kernel.h> +#include <linux/kdebug.h> +#include <linux/nospec.h> + +#include <asm/desc.h> +#include <asm/fred.h> +#include <asm/idtentry.h> +#include <asm/syscall.h> +#include <asm/trapnr.h> +#include <asm/traps.h> + +/* FRED EVENT_TYPE_OTHER vector numbers */ +#define FRED_SYSCALL 1 +#define FRED_SYSENTER 2 + +static noinstr void fred_bad_type(struct pt_regs *regs, unsigned long error_code) +{ + irqentry_state_t irq_state = irqentry_nmi_enter(regs); + + instrumentation_begin(); + + /* Panic on events from a high stack level */ + if (regs->fred_cs.sl > 0) { + pr_emerg("PANIC: invalid or fatal FRED event; event type %u " + "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n", + regs->fred_ss.type, regs->fred_ss.vector, regs->orig_ax, + fred_event_data(regs), regs->cs, regs->ip); + die("invalid or fatal FRED event", regs, regs->orig_ax); + panic("invalid or fatal FRED event"); + } else { + unsigned long flags = oops_begin(); + int sig = SIGKILL; + + pr_alert("BUG: invalid or fatal FRED event; event type %u " + "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n", + regs->fred_ss.type, regs->fred_ss.vector, regs->orig_ax, + fred_event_data(regs), regs->cs, regs->ip); + + if (__die("Invalid or fatal FRED event", regs, regs->orig_ax)) + sig = 0; + + oops_end(flags, regs, sig); + } + + instrumentation_end(); + irqentry_nmi_exit(regs, irq_state); +} + +static noinstr void fred_intx(struct pt_regs *regs) +{ + switch (regs->fred_ss.vector) { + /* Opcode 0xcd, 0x3, NOT INT3 (opcode 0xcc) */ + case X86_TRAP_BP: + return exc_int3(regs); + + /* Opcode 0xcd, 0x4, NOT INTO (opcode 0xce) */ + case X86_TRAP_OF: + return exc_overflow(regs); + +#ifdef CONFIG_IA32_EMULATION + /* INT80 */ + case IA32_SYSCALL_VECTOR: + if (ia32_enabled()) + return int80_emulation(regs); + fallthrough; +#endif + + default: + return exc_general_protection(regs, 0); + } +} + +static __always_inline void fred_other(struct pt_regs *regs) +{ + /* The compiler can fold these conditions into a single test */ + if (likely(regs->fred_ss.vector == FRED_SYSCALL && regs->fred_ss.lm)) { + regs->orig_ax = regs->ax; + regs->ax = -ENOSYS; + do_syscall_64(regs, regs->orig_ax); + return; + } else if (ia32_enabled() && + likely(regs->fred_ss.vector == FRED_SYSENTER && !regs->fred_ss.lm)) { + regs->orig_ax = regs->ax; + regs->ax = -ENOSYS; + do_fast_syscall_32(regs); + return; + } else { + exc_invalid_op(regs); + return; + } +} + +#define SYSVEC(_vector, _function) [_vector - FIRST_SYSTEM_VECTOR] = fred_sysvec_##_function + +static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = { + SYSVEC(ERROR_APIC_VECTOR, error_interrupt), + SYSVEC(SPURIOUS_APIC_VECTOR, spurious_apic_interrupt), + SYSVEC(LOCAL_TIMER_VECTOR, apic_timer_interrupt), + SYSVEC(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi), + + SYSVEC(RESCHEDULE_VECTOR, reschedule_ipi), + SYSVEC(CALL_FUNCTION_SINGLE_VECTOR, call_function_single), + SYSVEC(CALL_FUNCTION_VECTOR, call_function), + SYSVEC(REBOOT_VECTOR, reboot), + + SYSVEC(THRESHOLD_APIC_VECTOR, threshold), + SYSVEC(DEFERRED_ERROR_VECTOR, deferred_error), + SYSVEC(THERMAL_APIC_VECTOR, thermal), + + SYSVEC(IRQ_WORK_VECTOR, irq_work), + + SYSVEC(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), + SYSVEC(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), + SYSVEC(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), +}; + +static bool fred_setup_done __initdata; + +void __init fred_install_sysvec(unsigned int sysvec, idtentry_t handler) +{ + if (WARN_ON_ONCE(sysvec < FIRST_SYSTEM_VECTOR)) + return; + + if (WARN_ON_ONCE(fred_setup_done)) + return; + + if (!WARN_ON_ONCE(sysvec_table[sysvec - FIRST_SYSTEM_VECTOR])) + sysvec_table[sysvec - FIRST_SYSTEM_VECTOR] = handler; +} + +static noinstr void fred_handle_spurious_interrupt(struct pt_regs *regs) +{ + spurious_interrupt(regs, regs->fred_ss.vector); +} + +void __init fred_complete_exception_setup(void) +{ + unsigned int vector; + + for (vector = 0; vector < FIRST_EXTERNAL_VECTOR; vector++) + set_bit(vector, system_vectors); + + for (vector = 0; vector < NR_SYSTEM_VECTORS; vector++) { + if (sysvec_table[vector]) + set_bit(vector + FIRST_SYSTEM_VECTOR, system_vectors); + else + sysvec_table[vector] = fred_handle_spurious_interrupt; + } + fred_setup_done = true; +} + +static noinstr void fred_extint(struct pt_regs *regs) +{ + unsigned int vector = regs->fred_ss.vector; + unsigned int index = array_index_nospec(vector - FIRST_SYSTEM_VECTOR, + NR_SYSTEM_VECTORS); + + if (WARN_ON_ONCE(vector < FIRST_EXTERNAL_VECTOR)) + return; + + if (likely(vector >= FIRST_SYSTEM_VECTOR)) { + irqentry_state_t state = irqentry_enter(regs); + + instrumentation_begin(); + sysvec_table[index](regs); + instrumentation_end(); + irqentry_exit(regs, state); + } else { + common_interrupt(regs, vector); + } +} + +static noinstr void fred_hwexc(struct pt_regs *regs, unsigned long error_code) +{ + /* Optimize for #PF. That's the only exception which matters performance wise */ + if (likely(regs->fred_ss.vector == X86_TRAP_PF)) + return exc_page_fault(regs, error_code); + + switch (regs->fred_ss.vector) { + case X86_TRAP_DE: return exc_divide_error(regs); + case X86_TRAP_DB: return fred_exc_debug(regs); + case X86_TRAP_BR: return exc_bounds(regs); + case X86_TRAP_UD: return exc_invalid_op(regs); + case X86_TRAP_NM: return exc_device_not_available(regs); + case X86_TRAP_DF: return exc_double_fault(regs, error_code); + case X86_TRAP_TS: return exc_invalid_tss(regs, error_code); + case X86_TRAP_NP: return exc_segment_not_present(regs, error_code); + case X86_TRAP_SS: return exc_stack_segment(regs, error_code); + case X86_TRAP_GP: return exc_general_protection(regs, error_code); + case X86_TRAP_MF: return exc_coprocessor_error(regs); + case X86_TRAP_AC: return exc_alignment_check(regs, error_code); + case X86_TRAP_XF: return exc_simd_coprocessor_error(regs); + +#ifdef CONFIG_X86_MCE + case X86_TRAP_MC: return fred_exc_machine_check(regs); +#endif +#ifdef CONFIG_INTEL_TDX_GUEST + case X86_TRAP_VE: return exc_virtualization_exception(regs); +#endif +#ifdef CONFIG_X86_CET + case X86_TRAP_CP: return exc_control_protection(regs, error_code); +#endif + default: return fred_bad_type(regs, error_code); + } + +} + +static noinstr void fred_swexc(struct pt_regs *regs, unsigned long error_code) +{ + switch (regs->fred_ss.vector) { + case X86_TRAP_BP: return exc_int3(regs); + case X86_TRAP_OF: return exc_overflow(regs); + default: return fred_bad_type(regs, error_code); + } +} + +__visible noinstr void fred_entry_from_user(struct pt_regs *regs) +{ + unsigned long error_code = regs->orig_ax; + + /* Invalidate orig_ax so that syscall_get_nr() works correctly */ + regs->orig_ax = -1; + + switch (regs->fred_ss.type) { + case EVENT_TYPE_EXTINT: + return fred_extint(regs); + case EVENT_TYPE_NMI: + if (likely(regs->fred_ss.vector == X86_TRAP_NMI)) + return fred_exc_nmi(regs); + break; + case EVENT_TYPE_HWEXC: + return fred_hwexc(regs, error_code); + case EVENT_TYPE_SWINT: + return fred_intx(regs); + case EVENT_TYPE_PRIV_SWEXC: + if (likely(regs->fred_ss.vector == X86_TRAP_DB)) + return fred_exc_debug(regs); + break; + case EVENT_TYPE_SWEXC: + return fred_swexc(regs, error_code); + case EVENT_TYPE_OTHER: + return fred_other(regs); + default: break; + } + + return fred_bad_type(regs, error_code); +} + +__visible noinstr void fred_entry_from_kernel(struct pt_regs *regs) +{ + unsigned long error_code = regs->orig_ax; + + /* Invalidate orig_ax so that syscall_get_nr() works correctly */ + regs->orig_ax = -1; + + switch (regs->fred_ss.type) { + case EVENT_TYPE_EXTINT: + return fred_extint(regs); + case EVENT_TYPE_NMI: + if (likely(regs->fred_ss.vector == X86_TRAP_NMI)) + return fred_exc_nmi(regs); + break; + case EVENT_TYPE_HWEXC: + return fred_hwexc(regs, error_code); + case EVENT_TYPE_PRIV_SWEXC: + if (likely(regs->fred_ss.vector == X86_TRAP_DB)) + return fred_exc_debug(regs); + break; + case EVENT_TYPE_SWEXC: + return fred_swexc(regs, error_code); + default: break; + } + + return fred_bad_type(regs, error_code); +} + +#if IS_ENABLED(CONFIG_KVM_INTEL) +__visible noinstr void __fred_entry_from_kvm(struct pt_regs *regs) +{ + switch (regs->fred_ss.type) { + case EVENT_TYPE_EXTINT: + return fred_extint(regs); + case EVENT_TYPE_NMI: + return fred_exc_nmi(regs); + default: + WARN_ON_ONCE(1); + } +} +#endif diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index e0ca8120aea8..a3c0df11d0e6 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -76,7 +76,7 @@ static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, if (!show_unhandled_signals) return; - printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", + printk_ratelimited("%s%s[%d] %s ip:%lx cs:%x sp:%lx ax:%lx si:%lx di:%lx\n", level, current->comm, task_pid_nr(current), message, regs->ip, regs->cs, regs->sp, regs->ax, regs->si, regs->di); diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 81f6d8275b6b..69a3b02e50bb 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -579,7 +579,7 @@ static void amd_pmu_cpu_starting(int cpu) if (!x86_pmu.amd_nb_constraints) return; - nb_id = topology_die_id(cpu); + nb_id = topology_amd_node_id(cpu); WARN_ON_ONCE(nb_id == BAD_APICID); for_each_online_cpu(i) { diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 4b50a3a9818a..326c8cd5aa2d 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -834,7 +834,7 @@ static int __init cstate_init(void) } if (has_cstate_pkg) { - if (topology_max_die_per_package() > 1) { + if (topology_max_dies_per_package() > 1) { err = perf_pmu_register(&cstate_pkg_pmu, "cstate_die", -1); } else { diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 7927c0b832fa..258e2cdf28fa 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1893,7 +1893,7 @@ static int __init intel_uncore_init(void) return -ENODEV; __uncore_max_dies = - topology_max_packages() * topology_max_die_per_package(); + topology_max_packages() * topology_max_dies_per_package(); id = x86_match_cpu(intel_uncore_match); if (!id) { diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c index 56eea2c66cfb..92da8aaa5966 100644 --- a/arch/x86/events/intel/uncore_nhmex.c +++ b/arch/x86/events/intel/uncore_nhmex.c @@ -1221,8 +1221,8 @@ void nhmex_uncore_cpu_init(void) uncore_nhmex = true; else nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events; - if (nhmex_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - nhmex_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + if (nhmex_uncore_cbox.num_boxes > topology_num_cores_per_package()) + nhmex_uncore_cbox.num_boxes = topology_num_cores_per_package(); uncore_msr_uncores = nhmex_msr_uncores; } /* end of Nehalem-EX uncore support */ diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 7fd4334e12a1..9462fd9f3b7a 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -364,8 +364,8 @@ static struct intel_uncore_type *snb_msr_uncores[] = { void snb_uncore_cpu_init(void) { uncore_msr_uncores = snb_msr_uncores; - if (snb_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - snb_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + if (snb_uncore_cbox.num_boxes > topology_num_cores_per_package()) + snb_uncore_cbox.num_boxes = topology_num_cores_per_package(); } static void skl_uncore_msr_init_box(struct intel_uncore_box *box) @@ -428,8 +428,8 @@ static struct intel_uncore_type *skl_msr_uncores[] = { void skl_uncore_cpu_init(void) { uncore_msr_uncores = skl_msr_uncores; - if (skl_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - skl_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + if (skl_uncore_cbox.num_boxes > topology_num_cores_per_package()) + skl_uncore_cbox.num_boxes = topology_num_cores_per_package(); snb_uncore_arb.ops = &skl_uncore_msr_ops; } diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index a96496bef678..2eaf0f339849 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -1172,8 +1172,8 @@ static struct intel_uncore_type *snbep_msr_uncores[] = { void snbep_uncore_cpu_init(void) { - if (snbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - snbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + if (snbep_uncore_cbox.num_boxes > topology_num_cores_per_package()) + snbep_uncore_cbox.num_boxes = topology_num_cores_per_package(); uncore_msr_uncores = snbep_msr_uncores; } @@ -1406,7 +1406,7 @@ static int topology_gidnid_map(int nodeid, u32 gidnid) */ for (i = 0; i < 8; i++) { if (nodeid == GIDNIDMAP(gidnid, i)) { - if (topology_max_die_per_package() > 1) + if (topology_max_dies_per_package() > 1) die_id = i; else die_id = topology_phys_to_logical_pkg(i); @@ -1845,8 +1845,8 @@ static struct intel_uncore_type *ivbep_msr_uncores[] = { void ivbep_uncore_cpu_init(void) { - if (ivbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - ivbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + if (ivbep_uncore_cbox.num_boxes > topology_num_cores_per_package()) + ivbep_uncore_cbox.num_boxes = topology_num_cores_per_package(); uncore_msr_uncores = ivbep_msr_uncores; } @@ -2917,8 +2917,8 @@ static bool hswep_has_limit_sbox(unsigned int device) void hswep_uncore_cpu_init(void) { - if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + if (hswep_uncore_cbox.num_boxes > topology_num_cores_per_package()) + hswep_uncore_cbox.num_boxes = topology_num_cores_per_package(); /* Detect 6-8 core systems with only two SBOXes */ if (hswep_has_limit_sbox(HSWEP_PCU_DID)) @@ -3280,8 +3280,8 @@ static struct event_constraint bdx_uncore_pcu_constraints[] = { void bdx_uncore_cpu_init(void) { - if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + if (bdx_uncore_cbox.num_boxes > topology_num_cores_per_package()) + bdx_uncore_cbox.num_boxes = topology_num_cores_per_package(); uncore_msr_uncores = bdx_msr_uncores; /* Detect systems with no SBOXes */ diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 8d98d468b976..fb2b1961e5a3 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -674,7 +674,7 @@ static const struct attribute_group *rapl_attr_update[] = { static int __init init_rapl_pmus(void) { - int maxdie = topology_max_packages() * topology_max_die_per_package(); + int maxdie = topology_max_packages() * topology_max_dies_per_package(); size_t size; size = sizeof(*rapl_pmus) + maxdie * sizeof(struct rapl_pmu *); diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c index 96e6c51515f5..edd2f35b2a5e 100644 --- a/arch/x86/hyperv/hv_vtl.c +++ b/arch/x86/hyperv/hv_vtl.c @@ -16,6 +16,11 @@ extern struct boot_params boot_params; static struct real_mode_header hv_vtl_real_mode_header; +static bool __init hv_vtl_msi_ext_dest_id(void) +{ + return true; +} + void __init hv_vtl_init_platform(void) { pr_info("Linux runs in Hyper-V Virtual Trust Level\n"); @@ -26,8 +31,9 @@ void __init hv_vtl_init_platform(void) x86_init.timers.timer_init = x86_init_noop; /* Avoid searching for BIOS MP tables */ - x86_init.mpparse.find_smp_config = x86_init_noop; - x86_init.mpparse.get_smp_config = x86_init_uint_noop; + x86_init.mpparse.find_mptable = x86_init_noop; + x86_init.mpparse.early_parse_smp_cfg = x86_init_noop; + x86_init.mpparse.parse_smp_cfg = x86_init_noop; x86_platform.get_wallclock = get_rtc_noop; x86_platform.set_wallclock = set_rtc_noop; @@ -38,6 +44,8 @@ void __init hv_vtl_init_platform(void) x86_platform.legacy.warm_reset = 0; x86_platform.legacy.reserve_bios_regions = 0; x86_platform.legacy.devices.pnpbios = 0; + + x86_init.hyper.msi_ext_dest_id = hv_vtl_msi_ext_dest_id; } static inline u64 hv_vtl_system_desc_base(struct ldttss_desc *desc) diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c index 7dcbf153ad72..768d73de0d09 100644 --- a/arch/x86/hyperv/ivm.c +++ b/arch/x86/hyperv/ivm.c @@ -15,6 +15,7 @@ #include <asm/io.h> #include <asm/coco.h> #include <asm/mem_encrypt.h> +#include <asm/set_memory.h> #include <asm/mshyperv.h> #include <asm/hypervisor.h> #include <asm/mtrr.h> @@ -503,6 +504,31 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[], } /* + * When transitioning memory between encrypted and decrypted, the caller + * of set_memory_encrypted() or set_memory_decrypted() is responsible for + * ensuring that the memory isn't in use and isn't referenced while the + * transition is in progress. The transition has multiple steps, and the + * memory is in an inconsistent state until all steps are complete. A + * reference while the state is inconsistent could result in an exception + * that can't be cleanly fixed up. + * + * But the Linux kernel load_unaligned_zeropad() mechanism could cause a + * stray reference that can't be prevented by the caller, so Linux has + * specific code to handle this case. But when the #VC and #VE exceptions + * routed to a paravisor, the specific code doesn't work. To avoid this + * problem, mark the pages as "not present" while the transition is in + * progress. If load_unaligned_zeropad() causes a stray reference, a normal + * page fault is generated instead of #VC or #VE, and the page-fault-based + * handlers for load_unaligned_zeropad() resolve the reference. When the + * transition is complete, hv_vtom_set_host_visibility() marks the pages + * as "present" again. + */ +static bool hv_vtom_clear_present(unsigned long kbuffer, int pagecount, bool enc) +{ + return !set_memory_np(kbuffer, pagecount); +} + +/* * hv_vtom_set_host_visibility - Set specified memory visible to host. * * In Isolation VM, all guest memory is encrypted from host and guest @@ -515,16 +541,28 @@ static bool hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bo enum hv_mem_host_visibility visibility = enc ? VMBUS_PAGE_NOT_VISIBLE : VMBUS_PAGE_VISIBLE_READ_WRITE; u64 *pfn_array; + phys_addr_t paddr; + void *vaddr; int ret = 0; bool result = true; int i, pfn; pfn_array = kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); - if (!pfn_array) - return false; + if (!pfn_array) { + result = false; + goto err_set_memory_p; + } for (i = 0, pfn = 0; i < pagecount; i++) { - pfn_array[pfn] = virt_to_hvpfn((void *)kbuffer + i * HV_HYP_PAGE_SIZE); + /* + * Use slow_virt_to_phys() because the PRESENT bit has been + * temporarily cleared in the PTEs. slow_virt_to_phys() works + * without the PRESENT bit while virt_to_hvpfn() or similar + * does not. + */ + vaddr = (void *)kbuffer + (i * HV_HYP_PAGE_SIZE); + paddr = slow_virt_to_phys(vaddr); + pfn_array[pfn] = paddr >> HV_HYP_PAGE_SHIFT; pfn++; if (pfn == HV_MAX_MODIFY_GPA_REP_COUNT || i == pagecount - 1) { @@ -538,14 +576,30 @@ static bool hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bo } } - err_free_pfn_array: +err_free_pfn_array: kfree(pfn_array); + +err_set_memory_p: + /* + * Set the PTE PRESENT bits again to revert what hv_vtom_clear_present() + * did. Do this even if there is an error earlier in this function in + * order to avoid leaving the memory range in a "broken" state. Setting + * the PRESENT bits shouldn't fail, but return an error if it does. + */ + if (set_memory_p(kbuffer, pagecount)) + result = false; + return result; } static bool hv_vtom_tlb_flush_required(bool private) { - return true; + /* + * Since hv_vtom_clear_present() marks the PTEs as "not present" + * and flushes the TLB, they can't be in the TLB. That makes the + * flush controlled by this function redundant, so return "false". + */ + return false; } static bool hv_vtom_cache_flush_required(void) @@ -608,6 +662,7 @@ void __init hv_vtom_init(void) x86_platform.hyper.is_private_mmio = hv_is_private_mmio; x86_platform.guest.enc_cache_flush_required = hv_vtom_cache_flush_required; x86_platform.guest.enc_tlb_flush_required = hv_vtom_tlb_flush_required; + x86_platform.guest.enc_status_change_prepare = hv_vtom_clear_present; x86_platform.guest.enc_status_change_finish = hv_vtom_set_host_visibility; /* Set WB as the default cache mode. */ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 9d159b771dc8..94ce0f7c9d3a 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -46,6 +46,10 @@ extern void x86_32_probe_apic(void); static inline void x86_32_probe_apic(void) { } #endif +extern u32 cpuid_to_apicid[]; + +#define CPU_ACPIID_INVALID U32_MAX + #ifdef CONFIG_X86_LOCAL_APIC extern int apic_verbosity; @@ -54,8 +58,6 @@ extern int local_apic_timer_c2_ok; extern bool apic_is_disabled; extern unsigned int lapic_timer_period; -extern u32 cpuid_to_apicid[]; - extern enum apic_intr_mode_id apic_intr_mode; enum apic_intr_mode_id { APIC_PIC, @@ -169,6 +171,14 @@ extern bool apic_needs_pit(void); extern void apic_send_IPI_allbutself(unsigned int vector); +extern void topology_register_apic(u32 apic_id, u32 acpi_id, bool present); +extern void topology_register_boot_apic(u32 apic_id); +extern int topology_hotplug_apic(u32 apic_id, u32 acpi_id); +extern void topology_hotunplug_apic(unsigned int cpu); +extern void topology_apply_cmdline_limits_early(void); +extern void topology_init_possible_cpus(void); +extern void topology_reset_possible_cpus_up(void); + #else /* !CONFIG_X86_LOCAL_APIC */ static inline void lapic_shutdown(void) { } #define local_apic_timer_c2_ok 1 @@ -183,6 +193,8 @@ static inline void apic_intr_mode_init(void) { } static inline void lapic_assign_system_vectors(void) { } static inline void lapic_assign_legacy_vector(unsigned int i, bool r) { } static inline bool apic_needs_pit(void) { return true; } +static inline void topology_apply_cmdline_limits_early(void) { } +static inline void topology_init_possible_cpus(void) { } #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_X2APIC @@ -289,16 +301,11 @@ struct apic { /* Probe, setup and smpboot functions */ int (*probe)(void); int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); - bool (*apic_id_registered)(void); - bool (*check_apicid_used)(physid_mask_t *map, u32 apicid); void (*init_apic_ldr)(void); - void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap); u32 (*cpu_present_to_apicid)(int mps_cpu); - u32 (*phys_pkg_id)(u32 cpuid_apic, int index_msb); u32 (*get_apic_id)(u32 id); - u32 (*set_apic_id)(u32 apicid); /* wakeup_secondary_cpu */ int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip); @@ -527,7 +534,6 @@ extern int default_apic_id_valid(u32 apicid); extern u32 apic_default_calc_apicid(unsigned int cpu); extern u32 apic_flat_calc_apicid(unsigned int cpu); -extern void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap); extern u32 default_cpu_present_to_apicid(int mps_cpu); void apic_send_nmi_to_offline_cpu(unsigned int cpu); diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index b1a98fa38828..076bf8dee702 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -12,6 +12,7 @@ #include <asm/special_insns.h> #include <asm/preempt.h> #include <asm/asm.h> +#include <asm/fred.h> #include <asm/gsseg.h> #ifndef CONFIG_X86_CMPXCHG64 diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index fbcfec4dc4cc..ca8eed1d496a 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -113,6 +113,20 @@ #endif +#ifndef __ASSEMBLY__ +#ifndef __pic__ +static __always_inline __pure void *rip_rel_ptr(void *p) +{ + asm("leaq %c1(%%rip), %0" : "=r"(p) : "i"(p)); + + return p; +} +#define RIP_REL_REF(var) (*(typeof(&(var)))rip_rel_ptr(&(var))) +#else +#define RIP_REL_REF(var) (var) +#endif +#endif + /* * Macros to generate condition code outputs from inline assembly, * The output operand must be type "bool". diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 0216f63a366b..fe1e7e3cc844 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -33,7 +33,7 @@ * Returns: * 0 - (index < size) */ -static inline unsigned long array_index_mask_nospec(unsigned long index, +static __always_inline unsigned long array_index_mask_nospec(unsigned long index, unsigned long size) { unsigned long mask; diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h index 6ae2d16a7613..fb7388bbc212 100644 --- a/arch/x86/include/asm/coco.h +++ b/arch/x86/include/asm/coco.h @@ -2,6 +2,7 @@ #ifndef _ASM_X86_COCO_H #define _ASM_X86_COCO_H +#include <asm/asm.h> #include <asm/types.h> enum cc_vendor { @@ -10,13 +11,20 @@ enum cc_vendor { CC_VENDOR_INTEL, }; +#ifdef CONFIG_ARCH_HAS_CC_PLATFORM extern enum cc_vendor cc_vendor; +extern u64 cc_mask; + +static inline void cc_set_mask(u64 mask) +{ + RIP_REL_REF(cc_mask) = mask; +} -#ifdef CONFIG_ARCH_HAS_CC_PLATFORM -void cc_set_mask(u64 mask); u64 cc_mkenc(u64 val); u64 cc_mkdec(u64 val); #else +#define cc_vendor (CC_VENDOR_NONE) + static inline u64 cc_mkenc(u64 val) { return val; diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index f8f9a9b79395..aa30fd8cad7f 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -9,18 +9,10 @@ #include <linux/percpu.h> #include <asm/ibt.h> -#ifdef CONFIG_SMP - -extern void prefill_possible_map(void); - -#else /* CONFIG_SMP */ - -static inline void prefill_possible_map(void) {} - +#ifndef CONFIG_SMP #define cpu_physical_id(cpu) boot_cpu_physical_apicid #define cpu_acpi_id(cpu) 0 #define safe_smp_processor_id() 0 - #endif /* CONFIG_SMP */ #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index a26bebbdff87..a1273698fc43 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -168,7 +168,7 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); */ static __always_inline bool _static_cpu_has(u16 bit) { - asm_volatile_goto( + asm goto( ALTERNATIVE_TERNARY("jmp 6f", %P[feature], "", "jmp %l[t_no]") ".pushsection .altinstr_aux,\"ax\"\n" "6:\n" diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 29cb275a219d..0343caa016a9 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -81,10 +81,8 @@ #define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ #define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ #define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ - -/* CPU types for specific tunings: */ #define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ -/* FREE, was #define X86_FEATURE_K7 ( 3*32+ 5) "" Athlon */ +#define X86_FEATURE_ZEN5 ( 3*32+ 5) /* "" CPU based on Zen5 microarchitecture */ #define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ #define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ #define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ @@ -97,7 +95,7 @@ #define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ #define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ #define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* AMD Last Branch Record Extension Version 2 */ -/* FREE, was #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) "" LFENCE synchronizes RDTSC */ +#define X86_FEATURE_CLEAR_CPU_BUF ( 3*32+18) /* "" Clear CPU buffers using VERW */ #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ #define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ @@ -326,7 +324,9 @@ #define X86_FEATURE_FZRM (12*32+10) /* "" Fast zero-length REP MOVSB */ #define X86_FEATURE_FSRS (12*32+11) /* "" Fast short REP STOSB */ #define X86_FEATURE_FSRC (12*32+12) /* "" Fast short REP {CMPSB,SCASB} */ +#define X86_FEATURE_FRED (12*32+17) /* Flexible Return and Event Delivery */ #define X86_FEATURE_LKGS (12*32+18) /* "" Load "kernel" (userspace) GS */ +#define X86_FEATURE_WRMSRNS (12*32+19) /* "" Non-serializing WRMSR */ #define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */ #define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */ #define X86_FEATURE_LAM (12*32+26) /* Linear Address Masking */ @@ -442,6 +442,7 @@ #define X86_FEATURE_SEV (19*32+ 1) /* AMD Secure Encrypted Virtualization */ #define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* "" VM Page Flush MSR is supported */ #define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */ +#define X86_FEATURE_SEV_SNP (19*32+ 4) /* AMD Secure Encrypted Virtualization - Secure Nested Paging */ #define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* "" Virtual TSC_AUX */ #define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */ #define X86_FEATURE_DEBUG_SWAP (19*32+14) /* AMD SEV-ES full debug state swap support */ diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h index 9bee3e7bf973..6b122a31da06 100644 --- a/arch/x86/include/asm/cpuid.h +++ b/arch/x86/include/asm/cpuid.h @@ -127,6 +127,42 @@ static inline unsigned int cpuid_edx(unsigned int op) return edx; } +static inline void __cpuid_read(unsigned int leaf, unsigned int subleaf, u32 *regs) +{ + regs[CPUID_EAX] = leaf; + regs[CPUID_ECX] = subleaf; + __cpuid(regs + CPUID_EAX, regs + CPUID_EBX, regs + CPUID_ECX, regs + CPUID_EDX); +} + +#define cpuid_subleaf(leaf, subleaf, regs) { \ + static_assert(sizeof(*(regs)) == 16); \ + __cpuid_read(leaf, subleaf, (u32 *)(regs)); \ +} + +#define cpuid_leaf(leaf, regs) { \ + static_assert(sizeof(*(regs)) == 16); \ + __cpuid_read(leaf, 0, (u32 *)(regs)); \ +} + +static inline void __cpuid_read_reg(unsigned int leaf, unsigned int subleaf, + enum cpuid_regs_idx regidx, u32 *reg) +{ + u32 regs[4]; + + __cpuid_read(leaf, subleaf, regs); + *reg = regs[regidx]; +} + +#define cpuid_subleaf_reg(leaf, subleaf, regidx, reg) { \ + static_assert(sizeof(*(reg)) == 4); \ + __cpuid_read_reg(leaf, subleaf, regidx, (u32 *)(reg)); \ +} + +#define cpuid_leaf_reg(leaf, regidx, reg) { \ + static_assert(sizeof(*(reg)) == 4); \ + __cpuid_read_reg(leaf, 0, regidx, (u32 *)(reg)); \ +} + static __always_inline bool cpuid_function_is_indexed(u32 function) { switch (function) { diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index ab97b22ac04a..ec95fe44fa3a 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -402,8 +402,6 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) desc->limit1 = (limit >> 16) & 0xf; } -void alloc_intr_gate(unsigned int n, const void *addr); - static inline void init_idt_data(struct idt_data *data, unsigned int n, const void *addr) { diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 702d93fdd10e..d73fea9c3bf1 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -117,6 +117,18 @@ #define DISABLE_IBT (1 << (X86_FEATURE_IBT & 31)) #endif +#ifdef CONFIG_X86_FRED +# define DISABLE_FRED 0 +#else +# define DISABLE_FRED (1 << (X86_FEATURE_FRED & 31)) +#endif + +#ifdef CONFIG_KVM_AMD_SEV +#define DISABLE_SEV_SNP 0 +#else +#define DISABLE_SEV_SNP (1 << (X86_FEATURE_SEV_SNP & 31)) +#endif + /* * Make sure to add features to the correct mask */ @@ -133,7 +145,7 @@ #define DISABLED_MASK10 0 #define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \ DISABLE_CALL_DEPTH_TRACKING|DISABLE_USER_SHSTK) -#define DISABLED_MASK12 (DISABLE_LAM) +#define DISABLED_MASK12 (DISABLE_FRED|DISABLE_LAM) #define DISABLED_MASK13 0 #define DISABLED_MASK14 0 #define DISABLED_MASK15 0 @@ -141,7 +153,7 @@ DISABLE_ENQCMD) #define DISABLED_MASK17 0 #define DISABLED_MASK18 (DISABLE_IBT) -#define DISABLED_MASK19 0 +#define DISABLED_MASK19 (DISABLE_SEV_SNP) #define DISABLED_MASK20 0 #define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21) diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index ce8f50192ae3..7e523bb3d2d3 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -91,7 +91,6 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, static __always_inline void arch_exit_to_user_mode(void) { - mds_user_clear_cpu_buffers(); amd_clear_divider(); } #define arch_exit_to_user_mode arch_exit_to_user_mode diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h index fe6312045042..7acf0383be80 100644 --- a/arch/x86/include/asm/extable_fixup_types.h +++ b/arch/x86/include/asm/extable_fixup_types.h @@ -64,6 +64,8 @@ #define EX_TYPE_UCOPY_LEN4 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(4)) #define EX_TYPE_UCOPY_LEN8 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(8)) -#define EX_TYPE_ZEROPAD 20 /* longword load with zeropad on fault */ +#define EX_TYPE_ZEROPAD 20 /* longword load with zeropad on fault */ + +#define EX_TYPE_ERETU 21 #endif diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h new file mode 100644 index 000000000000..e86c7ba32435 --- /dev/null +++ b/arch/x86/include/asm/fred.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Macros for Flexible Return and Event Delivery (FRED) + */ + +#ifndef ASM_X86_FRED_H +#define ASM_X86_FRED_H + +#include <linux/const.h> + +#include <asm/asm.h> +#include <asm/trapnr.h> + +/* + * FRED event return instruction opcodes for ERET{S,U}; supported in + * binutils >= 2.41. + */ +#define ERETS _ASM_BYTES(0xf2,0x0f,0x01,0xca) +#define ERETU _ASM_BYTES(0xf3,0x0f,0x01,0xca) + +/* + * RSP is aligned to a 64-byte boundary before used to push a new stack frame + */ +#define FRED_STACK_FRAME_RSP_MASK _AT(unsigned long, (~0x3f)) + +/* + * Used for the return address for call emulation during code patching, + * and measured in 64-byte cache lines. + */ +#define FRED_CONFIG_REDZONE_AMOUNT 1 +#define FRED_CONFIG_REDZONE (_AT(unsigned long, FRED_CONFIG_REDZONE_AMOUNT) << 6) +#define FRED_CONFIG_INT_STKLVL(l) (_AT(unsigned long, l) << 9) +#define FRED_CONFIG_ENTRYPOINT(p) _AT(unsigned long, (p)) + +#ifndef __ASSEMBLY__ + +#ifdef CONFIG_X86_FRED +#include <linux/kernel.h> + +#include <asm/ptrace.h> + +struct fred_info { + /* Event data: CR2, DR6, ... */ + unsigned long edata; + unsigned long resv; +}; + +/* Full format of the FRED stack frame */ +struct fred_frame { + struct pt_regs regs; + struct fred_info info; +}; + +static __always_inline struct fred_info *fred_info(struct pt_regs *regs) +{ + return &container_of(regs, struct fred_frame, regs)->info; +} + +static __always_inline unsigned long fred_event_data(struct pt_regs *regs) +{ + return fred_info(regs)->edata; +} + +void asm_fred_entrypoint_user(void); +void asm_fred_entrypoint_kernel(void); +void asm_fred_entry_from_kvm(struct fred_ss); + +__visible void fred_entry_from_user(struct pt_regs *regs); +__visible void fred_entry_from_kernel(struct pt_regs *regs); +__visible void __fred_entry_from_kvm(struct pt_regs *regs); + +/* Can be called from noinstr code, thus __always_inline */ +static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) +{ + struct fred_ss ss = { + .ss =__KERNEL_DS, + .type = type, + .vector = vector, + .nmi = type == EVENT_TYPE_NMI, + .lm = 1, + }; + + asm_fred_entry_from_kvm(ss); +} + +void cpu_init_fred_exceptions(void); +void fred_complete_exception_setup(void); + +#else /* CONFIG_X86_FRED */ +static __always_inline unsigned long fred_event_data(struct pt_regs *regs) { return 0; } +static inline void cpu_init_fred_exceptions(void) { } +static inline void fred_complete_exception_setup(void) { } +static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { } +#endif /* CONFIG_X86_FRED */ +#endif /* !__ASSEMBLY__ */ + +#endif /* ASM_X86_FRED_H */ diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index b02c3cd3c0f6..edebf1020e04 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -16,8 +16,6 @@ #include <asm/irq_vectors.h> -#define IRQ_MATRIX_BITS NR_VECTORS - #ifndef __ASSEMBLY__ #include <linux/percpu.h> diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index c7ef6ea2fa99..4212c00c9708 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -69,7 +69,7 @@ extern void ia32_pick_mmap_layout(struct mm_struct *mm); extern bool __ia32_enabled; -static inline bool ia32_enabled(void) +static __always_inline bool ia32_enabled(void) { return __ia32_enabled; } @@ -81,7 +81,7 @@ static inline void ia32_disable(void) #else /* !CONFIG_IA32_EMULATION */ -static inline bool ia32_enabled(void) +static __always_inline bool ia32_enabled(void) { return IS_ENABLED(CONFIG_X86_32); } diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 13639e57e1f8..47d4c04d103d 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -13,15 +13,18 @@ #include <asm/irq_stack.h> +typedef void (*idtentry_t)(struct pt_regs *regs); + /** * DECLARE_IDTENTRY - Declare functions for simple IDT entry points * No error code pushed by hardware * @vector: Vector number (ignored for C) * @func: Function name of the entry point * - * Declares three functions: + * Declares four functions: * - The ASM entry point: asm_##func * - The XEN PV trap entry point: xen_##func (maybe unused) + * - The C handler called from the FRED event dispatcher (maybe unused) * - The C handler called from the ASM entry point * * Note: This is the C variant of DECLARE_IDTENTRY(). As the name says it @@ -31,6 +34,7 @@ #define DECLARE_IDTENTRY(vector, func) \ asmlinkage void asm_##func(void); \ asmlinkage void xen_asm_##func(void); \ + void fred_##func(struct pt_regs *regs); \ __visible void func(struct pt_regs *regs) /** @@ -138,6 +142,17 @@ static __always_inline void __##func(struct pt_regs *regs, \ __visible noinstr void func(struct pt_regs *regs) /** + * DEFINE_FREDENTRY_RAW - Emit code for raw FRED entry points + * @func: Function name of the entry point + * + * @func is called from the FRED event dispatcher with interrupts disabled. + * + * See @DEFINE_IDTENTRY_RAW for further details. + */ +#define DEFINE_FREDENTRY_RAW(func) \ +noinstr void fred_##func(struct pt_regs *regs) + +/** * DECLARE_IDTENTRY_RAW_ERRORCODE - Declare functions for raw IDT entry points * Error code pushed by hardware * @vector: Vector number (ignored for C) @@ -233,17 +248,27 @@ static noinline void __##func(struct pt_regs *regs, u32 vector) #define DEFINE_IDTENTRY_SYSVEC(func) \ static void __##func(struct pt_regs *regs); \ \ +static __always_inline void instr_##func(struct pt_regs *regs) \ +{ \ + kvm_set_cpu_l1tf_flush_l1d(); \ + run_sysvec_on_irqstack_cond(__##func, regs); \ +} \ + \ __visible noinstr void func(struct pt_regs *regs) \ { \ irqentry_state_t state = irqentry_enter(regs); \ \ instrumentation_begin(); \ - kvm_set_cpu_l1tf_flush_l1d(); \ - run_sysvec_on_irqstack_cond(__##func, regs); \ + instr_##func (regs); \ instrumentation_end(); \ irqentry_exit(regs, state); \ } \ \ +void fred_##func(struct pt_regs *regs) \ +{ \ + instr_##func (regs); \ +} \ + \ static noinline void __##func(struct pt_regs *regs) /** @@ -260,19 +285,29 @@ static noinline void __##func(struct pt_regs *regs) #define DEFINE_IDTENTRY_SYSVEC_SIMPLE(func) \ static __always_inline void __##func(struct pt_regs *regs); \ \ -__visible noinstr void func(struct pt_regs *regs) \ +static __always_inline void instr_##func(struct pt_regs *regs) \ { \ - irqentry_state_t state = irqentry_enter(regs); \ - \ - instrumentation_begin(); \ __irq_enter_raw(); \ kvm_set_cpu_l1tf_flush_l1d(); \ __##func (regs); \ __irq_exit_raw(); \ +} \ + \ +__visible noinstr void func(struct pt_regs *regs) \ +{ \ + irqentry_state_t state = irqentry_enter(regs); \ + \ + instrumentation_begin(); \ + instr_##func (regs); \ instrumentation_end(); \ irqentry_exit(regs, state); \ } \ \ +void fred_##func(struct pt_regs *regs) \ +{ \ + instr_##func (regs); \ +} \ + \ static __always_inline void __##func(struct pt_regs *regs) /** @@ -410,17 +445,35 @@ __visible noinstr void func(struct pt_regs *regs, \ /* C-Code mapping */ #define DECLARE_IDTENTRY_NMI DECLARE_IDTENTRY_RAW #define DEFINE_IDTENTRY_NMI DEFINE_IDTENTRY_RAW +#define DEFINE_FREDENTRY_NMI DEFINE_FREDENTRY_RAW #ifdef CONFIG_X86_64 #define DECLARE_IDTENTRY_MCE DECLARE_IDTENTRY_IST #define DEFINE_IDTENTRY_MCE DEFINE_IDTENTRY_IST #define DEFINE_IDTENTRY_MCE_USER DEFINE_IDTENTRY_NOIST +#define DEFINE_FREDENTRY_MCE DEFINE_FREDENTRY_RAW #define DECLARE_IDTENTRY_DEBUG DECLARE_IDTENTRY_IST #define DEFINE_IDTENTRY_DEBUG DEFINE_IDTENTRY_IST #define DEFINE_IDTENTRY_DEBUG_USER DEFINE_IDTENTRY_NOIST +#define DEFINE_FREDENTRY_DEBUG DEFINE_FREDENTRY_RAW +#endif + +void idt_install_sysvec(unsigned int n, const void *function); + +#ifdef CONFIG_X86_FRED +void fred_install_sysvec(unsigned int vector, const idtentry_t function); +#else +static inline void fred_install_sysvec(unsigned int vector, const idtentry_t function) { } #endif +#define sysvec_install(vector, function) { \ + if (cpu_feature_enabled(X86_FEATURE_FRED)) \ + fred_install_sysvec(vector, function); \ + else \ + idt_install_sysvec(vector, asm_##function); \ +} + #else /* !__ASSEMBLY__ */ /* @@ -447,7 +500,7 @@ __visible noinstr void func(struct pt_regs *regs, \ /* System vector entries */ #define DECLARE_IDTENTRY_SYSVEC(vector, func) \ - idtentry_sysvec vector func + DECLARE_IDTENTRY(vector, func) #ifdef CONFIG_X86_64 # define DECLARE_IDTENTRY_MCE(vector, func) \ @@ -655,23 +708,36 @@ DECLARE_IDTENTRY(RESCHEDULE_VECTOR, sysvec_reschedule_ipi); DECLARE_IDTENTRY_SYSVEC(REBOOT_VECTOR, sysvec_reboot); DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_SINGLE_VECTOR, sysvec_call_function_single); DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_VECTOR, sysvec_call_function); +#else +# define fred_sysvec_reschedule_ipi NULL +# define fred_sysvec_reboot NULL +# define fred_sysvec_call_function_single NULL +# define fred_sysvec_call_function NULL #endif #ifdef CONFIG_X86_LOCAL_APIC # ifdef CONFIG_X86_MCE_THRESHOLD DECLARE_IDTENTRY_SYSVEC(THRESHOLD_APIC_VECTOR, sysvec_threshold); +# else +# define fred_sysvec_threshold NULL # endif # ifdef CONFIG_X86_MCE_AMD DECLARE_IDTENTRY_SYSVEC(DEFERRED_ERROR_VECTOR, sysvec_deferred_error); +# else +# define fred_sysvec_deferred_error NULL # endif # ifdef CONFIG_X86_THERMAL_VECTOR DECLARE_IDTENTRY_SYSVEC(THERMAL_APIC_VECTOR, sysvec_thermal); +# else +# define fred_sysvec_thermal NULL # endif # ifdef CONFIG_IRQ_WORK DECLARE_IDTENTRY_SYSVEC(IRQ_WORK_VECTOR, sysvec_irq_work); +# else +# define fred_sysvec_irq_work NULL # endif #endif @@ -679,12 +745,16 @@ DECLARE_IDTENTRY_SYSVEC(IRQ_WORK_VECTOR, sysvec_irq_work); DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_VECTOR, sysvec_kvm_posted_intr_ipi); DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_WAKEUP_VECTOR, sysvec_kvm_posted_intr_wakeup_ipi); DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested_ipi); +#else +# define fred_sysvec_kvm_posted_intr_ipi NULL +# define fred_sysvec_kvm_posted_intr_wakeup_ipi NULL +# define fred_sysvec_kvm_posted_intr_nested_ipi NULL #endif #if IS_ENABLED(CONFIG_HYPERV) DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback); DECLARE_IDTENTRY_SYSVEC(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment); -DECLARE_IDTENTRY_SYSVEC(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0); +DECLARE_IDTENTRY_SYSVEC(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0); #endif #if IS_ENABLED(CONFIG_ACRN_GUEST) diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 197316121f04..b65e9c46b922 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -162,6 +162,8 @@ #define INTEL_FAM6_ATOM_CRESTMONT_X 0xAF /* Sierra Forest */ #define INTEL_FAM6_ATOM_CRESTMONT 0xB6 /* Grand Ridge */ +#define INTEL_FAM6_ATOM_DARKMONT_X 0xDD /* Clearwater Forest */ + /* Xeon Phi */ #define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */ diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 51c782600e02..0d806513c4b3 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -140,7 +140,6 @@ extern void mask_ioapic_entries(void); extern int restore_ioapic_entries(void); extern void setup_ioapic_ids_from_mpc(void); -extern void setup_ioapic_ids_from_mpc_nocheck(void); extern int mp_find_ioapic(u32 gsi); extern int mp_find_ioapic_pin(int ioapic, u32 gsi); diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index 2fd52b65deac..3be2451e7bc8 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -10,6 +10,7 @@ extern int force_iommu, no_iommu; extern int iommu_detected; extern int iommu_merge; extern int panic_on_overflow; +extern bool amd_iommu_snp_en; #ifdef CONFIG_SWIOTLB extern bool x86_swiotlb_enable; diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 071572e23d3a..cbbef32517f0 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -24,7 +24,7 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("1:" + asm goto("1:" "jmp %l[l_yes] # objtool NOPs this \n\t" JUMP_TABLE_ENTRY : : "i" (key), "i" (2 | branch) : : l_yes); @@ -38,7 +38,7 @@ l_yes: static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { - asm_volatile_goto("1:" + asm goto("1:" ".byte " __stringify(BYTES_NOP5) "\n\t" JUMP_TABLE_ENTRY : : "i" (key), "i" (branch) : : l_yes); @@ -52,7 +52,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { - asm_volatile_goto("1:" + asm goto("1:" "jmp %l[l_yes]\n\t" JUMP_TABLE_ENTRY : : "i" (key), "i" (branch) : : l_yes); diff --git a/arch/x86/include/asm/kmsan.h b/arch/x86/include/asm/kmsan.h index 8fa6ac0e2d76..d91b37f5b4bb 100644 --- a/arch/x86/include/asm/kmsan.h +++ b/arch/x86/include/asm/kmsan.h @@ -64,6 +64,7 @@ static inline bool kmsan_virt_addr_valid(void *addr) { unsigned long x = (unsigned long)addr; unsigned long y = x - __START_KERNEL_map; + bool ret; /* use the carry flag to determine if x was < __START_KERNEL_map */ if (unlikely(x > y)) { @@ -79,7 +80,21 @@ static inline bool kmsan_virt_addr_valid(void *addr) return false; } - return pfn_valid(x >> PAGE_SHIFT); + /* + * pfn_valid() relies on RCU, and may call into the scheduler on exiting + * the critical section. However, this would result in recursion with + * KMSAN. Therefore, disable preemption here, and re-enable preemption + * below while suppressing reschedules to avoid recursion. + * + * Note, this sacrifices occasionally breaking scheduling guarantees. + * Although, a kernel compiled with KMSAN has already given up on any + * performance guarantees due to being heavily instrumented. + */ + preempt_disable(); + ret = pfn_valid(x >> PAGE_SHIFT); + preempt_enable_no_resched(); + + return ret; } #endif /* !MODULE */ diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 378ed944b849..ab24ce207988 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -138,6 +138,7 @@ KVM_X86_OP(complete_emulated_msr) KVM_X86_OP(vcpu_deliver_sipi_vector) KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons); KVM_X86_OP_OPTIONAL(get_untagged_addr) +KVM_X86_OP_OPTIONAL(alloc_apic_backing_page) #undef KVM_X86_OP #undef KVM_X86_OP_OPTIONAL diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b5b2d0fde579..18cbde14cf81 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1145,6 +1145,8 @@ struct kvm_hv { unsigned int synic_auto_eoi_used; struct kvm_hv_syndbg hv_syndbg; + + bool xsaves_xsavec_checked; }; #endif @@ -1794,6 +1796,7 @@ struct kvm_x86_ops { unsigned long (*vcpu_get_apicv_inhibit_reasons)(struct kvm_vcpu *vcpu); gva_t (*get_untagged_addr)(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags); + void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu); }; struct kvm_x86_nested_ops { diff --git a/arch/x86/include/asm/kvmclock.h b/arch/x86/include/asm/kvmclock.h index 511b35069187..f163176d6f7f 100644 --- a/arch/x86/include/asm/kvmclock.h +++ b/arch/x86/include/asm/kvmclock.h @@ -4,8 +4,6 @@ #include <linux/percpu.h> -extern struct clocksource kvm_clock; - DECLARE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); static __always_inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 359ada486fa9..b31eb9fd5954 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -15,7 +15,8 @@ #include <linux/init.h> #include <linux/cc_platform.h> -#include <asm/bootparam.h> +#include <asm/asm.h> +struct boot_params; #ifdef CONFIG_X86_MEM_ENCRYPT void __init mem_encrypt_init(void); @@ -58,6 +59,11 @@ void __init mem_encrypt_free_decrypted_mem(void); void __init sev_es_init_vc_handling(void); +static inline u64 sme_get_me_mask(void) +{ + return RIP_REL_REF(sme_me_mask); +} + #define __bss_decrypted __section(".bss..decrypted") #else /* !CONFIG_AMD_MEM_ENCRYPT */ @@ -89,6 +95,8 @@ early_set_mem_enc_dec_hypercall(unsigned long vaddr, unsigned long size, bool en static inline void mem_encrypt_free_decrypted_mem(void) { } +static inline u64 sme_get_me_mask(void) { return 0; } + #define __bss_decrypted #endif /* CONFIG_AMD_MEM_ENCRYPT */ @@ -106,11 +114,6 @@ void add_encrypt_protection_map(void); extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[]; -static inline u64 sme_get_me_mask(void) -{ - return sme_me_mask; -} - #endif /* __ASSEMBLY__ */ #endif /* __X86_MEM_ENCRYPT_H__ */ diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 4b0f98a8d338..c72c7ff78fcd 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -2,6 +2,7 @@ #ifndef _ASM_X86_MPSPEC_H #define _ASM_X86_MPSPEC_H +#include <linux/types.h> #include <asm/mpspec_def.h> #include <asm/x86_init.h> @@ -46,70 +47,31 @@ extern int smp_found_config; # define smp_found_config 0 #endif -static inline void get_smp_config(void) -{ - x86_init.mpparse.get_smp_config(0); -} - -static inline void early_get_smp_config(void) -{ - x86_init.mpparse.get_smp_config(1); -} - -static inline void find_smp_config(void) -{ - x86_init.mpparse.find_smp_config(); -} - #ifdef CONFIG_X86_MPPARSE extern void e820__memblock_alloc_reserved_mpc_new(void); extern int enable_update_mptable; -extern void default_find_smp_config(void); -extern void default_get_smp_config(unsigned int early); +extern void mpparse_find_mptable(void); +extern void mpparse_parse_early_smp_config(void); +extern void mpparse_parse_smp_config(void); #else static inline void e820__memblock_alloc_reserved_mpc_new(void) { } -#define enable_update_mptable 0 -#define default_find_smp_config x86_init_noop -#define default_get_smp_config x86_init_uint_noop +#define enable_update_mptable 0 +#define mpparse_find_mptable x86_init_noop +#define mpparse_parse_early_smp_config x86_init_noop +#define mpparse_parse_smp_config x86_init_noop #endif -int generic_processor_info(int apicid); +extern DECLARE_BITMAP(phys_cpu_present_map, MAX_LOCAL_APIC); -#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC) - -struct physid_mask { - unsigned long mask[PHYSID_ARRAY_SIZE]; -}; - -typedef struct physid_mask physid_mask_t; - -#define physid_set(physid, map) set_bit(physid, (map).mask) -#define physid_isset(physid, map) test_bit(physid, (map).mask) - -#define physids_or(dst, src1, src2) \ - bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC) - -#define physids_clear(map) \ - bitmap_zero((map).mask, MAX_LOCAL_APIC) - -#define physids_empty(map) \ - bitmap_empty((map).mask, MAX_LOCAL_APIC) - -static inline void physids_promote(unsigned long physids, physid_mask_t *map) +static inline void reset_phys_cpu_present_map(u32 apicid) { - physids_clear(*map); - map->mask[0] = physids; + bitmap_zero(phys_cpu_present_map, MAX_LOCAL_APIC); + set_bit(apicid, phys_cpu_present_map); } -static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map) +static inline void copy_phys_cpu_present_map(unsigned long *dst) { - physids_clear(*map); - physid_set(physid, *map); + bitmap_copy(dst, phys_cpu_present_map, MAX_LOCAL_APIC); } -#define PHYSID_MASK_ALL { {[0 ... PHYSID_ARRAY_SIZE-1] = ~0UL} } -#define PHYSID_MASK_NONE { {[0 ... PHYSID_ARRAY_SIZE-1] = 0UL} } - -extern physid_mask_t phys_cpu_present_map; - #endif /* _ASM_X86_MPSPEC_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index f1bd7b91b3c6..24c575cdd6b9 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -36,8 +36,19 @@ #define EFER_FFXSR (1<<_EFER_FFXSR) #define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS) -/* Intel MSRs. Some also available on other CPUs */ +/* FRED MSRs */ +#define MSR_IA32_FRED_RSP0 0x1cc /* Level 0 stack pointer */ +#define MSR_IA32_FRED_RSP1 0x1cd /* Level 1 stack pointer */ +#define MSR_IA32_FRED_RSP2 0x1ce /* Level 2 stack pointer */ +#define MSR_IA32_FRED_RSP3 0x1cf /* Level 3 stack pointer */ +#define MSR_IA32_FRED_STKLVLS 0x1d0 /* Exception stack levels */ +#define MSR_IA32_FRED_SSP0 MSR_IA32_PL0_SSP /* Level 0 shadow stack pointer */ +#define MSR_IA32_FRED_SSP1 0x1d1 /* Level 1 shadow stack pointer */ +#define MSR_IA32_FRED_SSP2 0x1d2 /* Level 2 shadow stack pointer */ +#define MSR_IA32_FRED_SSP3 0x1d3 /* Level 3 shadow stack pointer */ +#define MSR_IA32_FRED_CONFIG 0x1d4 /* Entrypoint and interrupt stack level */ +/* Intel MSRs. Some also available on other CPUs */ #define MSR_TEST_CTRL 0x00000033 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT 29 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT BIT(MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT) @@ -594,34 +605,47 @@ #define MSR_AMD64_SEV_ES_GHCB 0xc0010130 #define MSR_AMD64_SEV 0xc0010131 #define MSR_AMD64_SEV_ENABLED_BIT 0 -#define MSR_AMD64_SEV_ES_ENABLED_BIT 1 -#define MSR_AMD64_SEV_SNP_ENABLED_BIT 2 #define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT) +#define MSR_AMD64_SEV_ES_ENABLED_BIT 1 #define MSR_AMD64_SEV_ES_ENABLED BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT) +#define MSR_AMD64_SEV_SNP_ENABLED_BIT 2 #define MSR_AMD64_SEV_SNP_ENABLED BIT_ULL(MSR_AMD64_SEV_SNP_ENABLED_BIT) - -/* SNP feature bits enabled by the hypervisor */ -#define MSR_AMD64_SNP_VTOM BIT_ULL(3) -#define MSR_AMD64_SNP_REFLECT_VC BIT_ULL(4) -#define MSR_AMD64_SNP_RESTRICTED_INJ BIT_ULL(5) -#define MSR_AMD64_SNP_ALT_INJ BIT_ULL(6) -#define MSR_AMD64_SNP_DEBUG_SWAP BIT_ULL(7) -#define MSR_AMD64_SNP_PREVENT_HOST_IBS BIT_ULL(8) -#define MSR_AMD64_SNP_BTB_ISOLATION BIT_ULL(9) -#define MSR_AMD64_SNP_VMPL_SSS BIT_ULL(10) -#define MSR_AMD64_SNP_SECURE_TSC BIT_ULL(11) -#define MSR_AMD64_SNP_VMGEXIT_PARAM BIT_ULL(12) -#define MSR_AMD64_SNP_IBS_VIRT BIT_ULL(14) -#define MSR_AMD64_SNP_VMSA_REG_PROTECTION BIT_ULL(16) -#define MSR_AMD64_SNP_SMT_PROTECTION BIT_ULL(17) - -/* SNP feature bits reserved for future use. */ -#define MSR_AMD64_SNP_RESERVED_BIT13 BIT_ULL(13) -#define MSR_AMD64_SNP_RESERVED_BIT15 BIT_ULL(15) -#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, 18) +#define MSR_AMD64_SNP_VTOM_BIT 3 +#define MSR_AMD64_SNP_VTOM BIT_ULL(MSR_AMD64_SNP_VTOM_BIT) +#define MSR_AMD64_SNP_REFLECT_VC_BIT 4 +#define MSR_AMD64_SNP_REFLECT_VC BIT_ULL(MSR_AMD64_SNP_REFLECT_VC_BIT) +#define MSR_AMD64_SNP_RESTRICTED_INJ_BIT 5 +#define MSR_AMD64_SNP_RESTRICTED_INJ BIT_ULL(MSR_AMD64_SNP_RESTRICTED_INJ_BIT) +#define MSR_AMD64_SNP_ALT_INJ_BIT 6 +#define MSR_AMD64_SNP_ALT_INJ BIT_ULL(MSR_AMD64_SNP_ALT_INJ_BIT) +#define MSR_AMD64_SNP_DEBUG_SWAP_BIT 7 +#define MSR_AMD64_SNP_DEBUG_SWAP BIT_ULL(MSR_AMD64_SNP_DEBUG_SWAP_BIT) +#define MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT 8 +#define MSR_AMD64_SNP_PREVENT_HOST_IBS BIT_ULL(MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT) +#define MSR_AMD64_SNP_BTB_ISOLATION_BIT 9 +#define MSR_AMD64_SNP_BTB_ISOLATION BIT_ULL(MSR_AMD64_SNP_BTB_ISOLATION_BIT) +#define MSR_AMD64_SNP_VMPL_SSS_BIT 10 +#define MSR_AMD64_SNP_VMPL_SSS BIT_ULL(MSR_AMD64_SNP_VMPL_SSS_BIT) +#define MSR_AMD64_SNP_SECURE_TSC_BIT 11 +#define MSR_AMD64_SNP_SECURE_TSC BIT_ULL(MSR_AMD64_SNP_SECURE_TSC_BIT) +#define MSR_AMD64_SNP_VMGEXIT_PARAM_BIT 12 +#define MSR_AMD64_SNP_VMGEXIT_PARAM BIT_ULL(MSR_AMD64_SNP_VMGEXIT_PARAM_BIT) +#define MSR_AMD64_SNP_RESERVED_BIT13 BIT_ULL(13) +#define MSR_AMD64_SNP_IBS_VIRT_BIT 14 +#define MSR_AMD64_SNP_IBS_VIRT BIT_ULL(MSR_AMD64_SNP_IBS_VIRT_BIT) +#define MSR_AMD64_SNP_RESERVED_BIT15 BIT_ULL(15) +#define MSR_AMD64_SNP_VMSA_REG_PROT_BIT 16 +#define MSR_AMD64_SNP_VMSA_REG_PROT BIT_ULL(MSR_AMD64_SNP_VMSA_REG_PROT_BIT) +#define MSR_AMD64_SNP_SMT_PROT_BIT 17 +#define MSR_AMD64_SNP_SMT_PROT BIT_ULL(MSR_AMD64_SNP_SMT_PROT_BIT) +#define MSR_AMD64_SNP_RESV_BIT 18 +#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT) #define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f +#define MSR_AMD64_RMP_BASE 0xc0010132 +#define MSR_AMD64_RMP_END 0xc0010133 + /* AMD Collaborative Processor Performance Control MSRs */ #define MSR_AMD_CPPC_CAP1 0xc00102b0 #define MSR_AMD_CPPC_ENABLE 0xc00102b1 @@ -708,8 +732,15 @@ #define MSR_K8_TOP_MEM1 0xc001001a #define MSR_K8_TOP_MEM2 0xc001001d #define MSR_AMD64_SYSCFG 0xc0010010 -#define MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT 23 +#define MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT 23 #define MSR_AMD64_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT) +#define MSR_AMD64_SYSCFG_SNP_EN_BIT 24 +#define MSR_AMD64_SYSCFG_SNP_EN BIT_ULL(MSR_AMD64_SYSCFG_SNP_EN_BIT) +#define MSR_AMD64_SYSCFG_SNP_VMPL_EN_BIT 25 +#define MSR_AMD64_SYSCFG_SNP_VMPL_EN BIT_ULL(MSR_AMD64_SYSCFG_SNP_VMPL_EN_BIT) +#define MSR_AMD64_SYSCFG_MFDM_BIT 19 +#define MSR_AMD64_SYSCFG_MFDM BIT_ULL(MSR_AMD64_SYSCFG_MFDM_BIT) + #define MSR_K8_INT_PENDING_MSG 0xc0010055 /* C1E active bits in int pending message */ #define K8_INTP_C1E_ACTIVE_MASK 0x18000000 diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 65ec1965cd28..c284ff9ebe67 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -97,6 +97,19 @@ static __always_inline void __wrmsr(unsigned int msr, u32 low, u32 high) : : "c" (msr), "a"(low), "d" (high) : "memory"); } +/* + * WRMSRNS behaves exactly like WRMSR with the only difference being + * that it is not a serializing instruction by default. + */ +static __always_inline void __wrmsrns(u32 msr, u32 low, u32 high) +{ + /* Instruction opcode for WRMSRNS; supported in binutils >= 2.40. */ + asm volatile("1: .byte 0x0f,0x01,0xc6\n" + "2:\n" + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR) + : : "c" (msr), "a"(low), "d" (high)); +} + #define native_rdmsr(msr, val1, val2) \ do { \ u64 __val = __rdmsr((msr)); \ @@ -297,6 +310,11 @@ do { \ #endif /* !CONFIG_PARAVIRT_XXL */ +static __always_inline void wrmsrns(u32 msr, u64 val) +{ + __wrmsrns(msr, val, val >> 32); +} + /* * 64-bit version of wrmsr_safe(): */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 262e65539f83..ab19c7f1167b 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -315,6 +315,17 @@ #endif .endm +/* + * Macro to execute VERW instruction that mitigate transient data sampling + * attacks such as MDS. On affected systems a microcode update overloaded VERW + * instruction to also clear the CPU buffers. VERW clobbers CFLAGS.ZF. + * + * Note: Only the memory operand variant of VERW clears the CPU buffers. + */ +.macro CLEAR_CPU_BUFFERS + ALTERNATIVE "", __stringify(verw mds_verw_sel), X86_FEATURE_CLEAR_CPU_BUF +.endm + #else /* __ASSEMBLY__ */ #define ANNOTATE_RETPOLINE_SAFE \ @@ -529,13 +540,14 @@ DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp); DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb); -DECLARE_STATIC_KEY_FALSE(mds_user_clear); DECLARE_STATIC_KEY_FALSE(mds_idle_clear); DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); DECLARE_STATIC_KEY_FALSE(mmio_stale_data_clear); +extern u16 mds_verw_sel; + #include <asm/segment.h> /** @@ -562,17 +574,6 @@ static __always_inline void mds_clear_cpu_buffers(void) } /** - * mds_user_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability - * - * Clear CPU buffers if the corresponding static key is enabled - */ -static __always_inline void mds_user_clear_cpu_buffers(void) -{ - if (static_branch_likely(&mds_user_clear)) - mds_clear_cpu_buffers(); -} - -/** * mds_idle_clear_cpu_buffers - Mitigation for MDS vulnerability * * Clear CPU buffers if the corresponding static key is enabled diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index 94de1a05aeba..d65e338b6a5f 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h @@ -181,7 +181,7 @@ static inline u64 p4_clear_ht_bit(u64 config) static inline int p4_ht_active(void) { #ifdef CONFIG_SMP - return smp_num_siblings > 1; + return __max_threads_per_core > 1; #endif return 0; } @@ -189,7 +189,7 @@ static inline int p4_ht_active(void) static inline int p4_ht_thread(int cpu) { #ifdef CONFIG_SMP - if (smp_num_siblings == 2) + if (__max_threads_per_core == 2) return cpu != cpumask_first(this_cpu_cpumask_var_ptr(cpu_sibling_map)); #endif return 0; diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 26620d7642a9..89cf39dbd306 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -100,6 +100,9 @@ struct cpuinfo_topology { u32 logical_pkg_id; u32 logical_die_id; + // AMD Node ID and Nodes per Package info + u32 amd_node_id; + // Cache level topology IDs u32 llc_id; u32 l2c_id; @@ -119,8 +122,6 @@ struct cpuinfo_x86 { #endif __u8 x86_virt_bits; __u8 x86_phys_bits; - /* CPUID returned core id bits: */ - __u8 x86_coreid_bits; /* Max extended CPUID function supported: */ __u32 extended_cpuid_level; /* Maximum supported CPUID level, -1=no CPUID: */ @@ -148,8 +149,6 @@ struct cpuinfo_x86 { unsigned long loops_per_jiffy; /* protected processor identification number */ u64 ppin; - /* cpuid returned max cores value: */ - u16 x86_max_cores; u16 x86_clflush_size; /* number of cores as seen by the OS: */ u16 booted_cores; @@ -664,8 +663,10 @@ static __always_inline void prefetchw(const void *x) #else extern unsigned long __end_init_task[]; -#define INIT_THREAD { \ - .sp = (unsigned long)&__end_init_task - sizeof(struct pt_regs), \ +#define INIT_THREAD { \ + .sp = (unsigned long)&__end_init_task - \ + TOP_OF_KERNEL_STACK_PADDING - \ + sizeof(struct pt_regs), \ } extern unsigned long KSTK_ESP(struct task_struct *task); @@ -704,12 +705,10 @@ static inline u32 per_cpu_l2c_id(unsigned int cpu) } #ifdef CONFIG_CPU_SUP_AMD -extern u32 amd_get_nodes_per_socket(void); extern u32 amd_get_highest_perf(void); extern void amd_clear_divider(void); extern void amd_check_microcode(void); #else -static inline u32 amd_get_nodes_per_socket(void) { return 0; } static inline u32 amd_get_highest_perf(void) { return 0; } static inline void amd_clear_divider(void) { } static inline void amd_check_microcode(void) { } diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index 65dee2420624..043758a2e627 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -23,11 +23,11 @@ extern int of_ioapic; extern u64 initial_dtb; extern void add_dtb(u64 data); void x86_of_pci_init(void); -void x86_dtb_init(void); +void x86_dtb_parse_smp_config(void); #else static inline void add_dtb(u64 data) { } static inline void x86_of_pci_init(void) { } -static inline void x86_dtb_init(void) { } +static inline void x86_dtb_parse_smp_config(void) { } #define of_ioapic 0 #endif diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index f4db78b09c8f..5a83fbd9bc0b 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -56,18 +56,64 @@ struct pt_regs { #else /* __i386__ */ +struct fred_cs { + /* CS selector */ + u64 cs : 16, + /* Stack level at event time */ + sl : 2, + /* IBT in WAIT_FOR_ENDBRANCH state */ + wfe : 1, + : 45; +}; + +struct fred_ss { + /* SS selector */ + u64 ss : 16, + /* STI state */ + sti : 1, + /* Set if syscall, sysenter or INT n */ + swevent : 1, + /* Event is NMI type */ + nmi : 1, + : 13, + /* Event vector */ + vector : 8, + : 8, + /* Event type */ + type : 4, + : 4, + /* Event was incident to enclave execution */ + enclave : 1, + /* CPU was in long mode */ + lm : 1, + /* + * Nested exception during FRED delivery, not set + * for #DF. + */ + nested : 1, + : 1, + /* + * The length of the instruction causing the event. + * Only set for INTO, INT1, INT3, INT n, SYSCALL + * and SYSENTER. 0 otherwise. + */ + insnlen : 4; +}; + struct pt_regs { -/* - * C ABI says these regs are callee-preserved. They aren't saved on kernel entry - * unless syscall needs a complete, fully filled "struct pt_regs". - */ + /* + * C ABI says these regs are callee-preserved. They aren't saved on + * kernel entry unless syscall needs a complete, fully filled + * "struct pt_regs". + */ unsigned long r15; unsigned long r14; unsigned long r13; unsigned long r12; unsigned long bp; unsigned long bx; -/* These regs are callee-clobbered. Always saved on kernel entry. */ + + /* These regs are callee-clobbered. Always saved on kernel entry. */ unsigned long r11; unsigned long r10; unsigned long r9; @@ -77,18 +123,50 @@ struct pt_regs { unsigned long dx; unsigned long si; unsigned long di; -/* - * On syscall entry, this is syscall#. On CPU exception, this is error code. - * On hw interrupt, it's IRQ number: - */ + + /* + * orig_ax is used on entry for: + * - the syscall number (syscall, sysenter, int80) + * - error_code stored by the CPU on traps and exceptions + * - the interrupt number for device interrupts + * + * A FRED stack frame starts here: + * 1) It _always_ includes an error code; + * + * 2) The return frame for ERET[US] starts here, but + * the content of orig_ax is ignored. + */ unsigned long orig_ax; -/* Return frame for iretq */ + + /* The IRETQ return frame starts here */ unsigned long ip; - unsigned long cs; + + union { + /* CS selector */ + u16 cs; + /* The extended 64-bit data slot containing CS */ + u64 csx; + /* The FRED CS extension */ + struct fred_cs fred_cs; + }; + unsigned long flags; unsigned long sp; - unsigned long ss; -/* top of stack page */ + + union { + /* SS selector */ + u16 ss; + /* The extended 64-bit data slot containing SS */ + u64 ssx; + /* The FRED SS extension */ + struct fred_ss fred_ss; + }; + + /* + * Top of stack on IDT systems, while FRED systems have extra fields + * defined above for storing exception related information, e.g. CR2 or + * DR6. + */ }; #endif /* !__i386__ */ diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index 255a78d9d906..12dbd2588ca7 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -7,6 +7,13 @@ #include <linux/sched.h> #include <linux/jump_label.h> +/* + * This value can never be a valid CLOSID, and is used when mapping a + * (closid, rmid) pair to an index and back. On x86 only the RMID is + * needed. The index is a software defined value. + */ +#define X86_RESCTRL_EMPTY_CLOSID ((u32)~0) + /** * struct resctrl_pqr_state - State cache for the PQR MSR * @cur_rmid: The cached Resource Monitoring ID @@ -31,10 +38,47 @@ struct resctrl_pqr_state { DECLARE_PER_CPU(struct resctrl_pqr_state, pqr_state); +extern bool rdt_alloc_capable; +extern bool rdt_mon_capable; + DECLARE_STATIC_KEY_FALSE(rdt_enable_key); DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); +static inline bool resctrl_arch_alloc_capable(void) +{ + return rdt_alloc_capable; +} + +static inline void resctrl_arch_enable_alloc(void) +{ + static_branch_enable_cpuslocked(&rdt_alloc_enable_key); + static_branch_inc_cpuslocked(&rdt_enable_key); +} + +static inline void resctrl_arch_disable_alloc(void) +{ + static_branch_disable_cpuslocked(&rdt_alloc_enable_key); + static_branch_dec_cpuslocked(&rdt_enable_key); +} + +static inline bool resctrl_arch_mon_capable(void) +{ + return rdt_mon_capable; +} + +static inline void resctrl_arch_enable_mon(void) +{ + static_branch_enable_cpuslocked(&rdt_mon_enable_key); + static_branch_inc_cpuslocked(&rdt_enable_key); +} + +static inline void resctrl_arch_disable_mon(void) +{ + static_branch_disable_cpuslocked(&rdt_mon_enable_key); + static_branch_dec_cpuslocked(&rdt_enable_key); +} + /* * __resctrl_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR * @@ -88,12 +132,58 @@ static inline unsigned int resctrl_arch_round_mon_val(unsigned int val) return val * scale; } +static inline void resctrl_arch_set_closid_rmid(struct task_struct *tsk, + u32 closid, u32 rmid) +{ + WRITE_ONCE(tsk->closid, closid); + WRITE_ONCE(tsk->rmid, rmid); +} + +static inline bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid) +{ + return READ_ONCE(tsk->closid) == closid; +} + +static inline bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 ignored, + u32 rmid) +{ + return READ_ONCE(tsk->rmid) == rmid; +} + static inline void resctrl_sched_in(struct task_struct *tsk) { if (static_branch_likely(&rdt_enable_key)) __resctrl_sched_in(tsk); } +static inline u32 resctrl_arch_system_num_rmid_idx(void) +{ + /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */ + return boot_cpu_data.x86_cache_max_rmid + 1; +} + +static inline void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid) +{ + *rmid = idx; + *closid = X86_RESCTRL_EMPTY_CLOSID; +} + +static inline u32 resctrl_arch_rmid_idx_encode(u32 ignored, u32 rmid) +{ + return rmid; +} + +/* x86 can always read an rmid, nothing needs allocating */ +struct rdt_resource; +static inline void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, int evtid) +{ + might_sleep(); + return NULL; +}; + +static inline void resctrl_arch_mon_ctx_free(struct rdt_resource *r, int evtid, + void *ctx) { }; + void resctrl_cpu_detect(struct cpuinfo_x86 *c); #else diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index 4b081e0d3306..363266cbcada 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -13,7 +13,7 @@ #define __GEN_RMWcc(fullop, _var, cc, clobbers, ...) \ ({ \ bool c = false; \ - asm_volatile_goto (fullop "; j" #cc " %l[cc_label]" \ + asm goto (fullop "; j" #cc " %l[cc_label]" \ : : [var] "m" (_var), ## __VA_ARGS__ \ : clobbers : cc_label); \ if (0) { \ diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index a5e89641bd2d..9aee31862b4a 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -47,6 +47,7 @@ int set_memory_uc(unsigned long addr, int numpages); int set_memory_wc(unsigned long addr, int numpages); int set_memory_wb(unsigned long addr, int numpages); int set_memory_np(unsigned long addr, int numpages); +int set_memory_p(unsigned long addr, int numpages); int set_memory_4k(unsigned long addr, int numpages); int set_memory_encrypted(unsigned long addr, int numpages); int set_memory_decrypted(unsigned long addr, int numpages); diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 5b4a1ce3d368..f000635d6061 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -87,9 +87,23 @@ extern bool handle_vc_boot_ghcb(struct pt_regs *regs); /* Software defined (when rFlags.CF = 1) */ #define PVALIDATE_FAIL_NOUPDATE 255 +/* RMUPDATE detected 4K page and 2MB page overlap. */ +#define RMPUPDATE_FAIL_OVERLAP 4 + /* RMP page size */ #define RMP_PG_SIZE_4K 0 #define RMP_PG_SIZE_2M 1 +#define RMP_TO_PG_LEVEL(level) (((level) == RMP_PG_SIZE_4K) ? PG_LEVEL_4K : PG_LEVEL_2M) +#define PG_LEVEL_TO_RMP(level) (((level) == PG_LEVEL_4K) ? RMP_PG_SIZE_4K : RMP_PG_SIZE_2M) + +struct rmp_state { + u64 gpa; + u8 assigned; + u8 pagesize; + u8 immutable; + u8 rsvd; + u32 asid; +} __packed; #define RMPADJUST_VMSA_PAGE_BIT BIT(16) @@ -213,6 +227,8 @@ int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct sn void snp_accept_memory(phys_addr_t start, phys_addr_t end); u64 snp_get_unsupported_features(u64 status); u64 sev_get_status(void); +void kdump_sev_callback(void); +void sev_show_status(void); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } @@ -241,6 +257,30 @@ static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *in static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { } static inline u64 snp_get_unsupported_features(u64 status) { return 0; } static inline u64 sev_get_status(void) { return 0; } +static inline void kdump_sev_callback(void) { } +static inline void sev_show_status(void) { } +#endif + +#ifdef CONFIG_KVM_AMD_SEV +bool snp_probe_rmptable_info(void); +int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level); +void snp_dump_hva_rmpentry(unsigned long address); +int psmash(u64 pfn); +int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable); +int rmp_make_shared(u64 pfn, enum pg_level level); +void snp_leak_pages(u64 pfn, unsigned int npages); +#else +static inline bool snp_probe_rmptable_info(void) { return false; } +static inline int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level) { return -ENODEV; } +static inline void snp_dump_hva_rmpentry(unsigned long address) {} +static inline int psmash(u64 pfn) { return -ENODEV; } +static inline int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, + bool immutable) +{ + return -ENODEV; +} +static inline int rmp_make_shared(u64 pfn, enum pg_level level) { return -ENODEV; } +static inline void snp_leak_pages(u64 pfn, unsigned int npages) {} #endif #endif diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 4fab2ed454f3..54d6d71e0eca 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -8,9 +8,6 @@ #include <asm/current.h> #include <asm/thread_info.h> -extern int smp_num_siblings; -extern unsigned int num_processors; - DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map); @@ -110,7 +107,6 @@ void cpu_disable_common(void); void native_smp_prepare_boot_cpu(void); void smp_prepare_cpus_common(void); void native_smp_prepare_cpus(unsigned int max_cpus); -void calculate_max_logical_packages(void); void native_smp_cpus_done(unsigned int max_cpus); int common_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_kick_ap(unsigned int cpu, struct task_struct *tidle); @@ -174,8 +170,6 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu) } #endif /* CONFIG_SMP */ -extern unsigned disabled_cpus; - #ifdef CONFIG_DEBUG_NMI_SELFTEST extern void nmi_selftest(void); #else diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index d6cd9344f6c7..48f8dd47cf68 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -205,7 +205,7 @@ static inline void clwb(volatile void *__p) #ifdef CONFIG_X86_USER_SHADOW_STACK static inline int write_user_shstk_64(u64 __user *addr, u64 val) { - asm_volatile_goto("1: wrussq %[val], (%[addr])\n" + asm goto("1: wrussq %[val], (%[addr])\n" _ASM_EXTABLE(1b, %l[fail]) :: [addr] "r" (addr), [val] "r" (val) :: fail); diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index f42dbf17f52b..c3bd0c0758c9 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -70,9 +70,13 @@ static inline void update_task_stack(struct task_struct *task) #ifdef CONFIG_X86_32 this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0); #else - /* Xen PV enters the kernel on the thread stack. */ - if (cpu_feature_enabled(X86_FEATURE_XENPV)) + if (cpu_feature_enabled(X86_FEATURE_FRED)) { + /* WRMSRNS is a baseline feature for FRED. */ + wrmsrns(MSR_IA32_FRED_RSP0, (unsigned long)task_stack_page(task) + THREAD_SIZE); + } else if (cpu_feature_enabled(X86_FEATURE_XENPV)) { + /* Xen PV enters the kernel on the thread stack. */ load_sp0(task_top_of_stack(task)); + } #endif } diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h index 21f9407be5d3..7e88705e907f 100644 --- a/arch/x86/include/asm/syscall_wrapper.h +++ b/arch/x86/include/asm/syscall_wrapper.h @@ -58,12 +58,29 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); ,,regs->di,,regs->si,,regs->dx \ ,,regs->r10,,regs->r8,,regs->r9) \ + +/* SYSCALL_PT_ARGS is Adapted from s390x */ +#define SYSCALL_PT_ARG6(m, t1, t2, t3, t4, t5, t6) \ + SYSCALL_PT_ARG5(m, t1, t2, t3, t4, t5), m(t6, (regs->bp)) +#define SYSCALL_PT_ARG5(m, t1, t2, t3, t4, t5) \ + SYSCALL_PT_ARG4(m, t1, t2, t3, t4), m(t5, (regs->di)) +#define SYSCALL_PT_ARG4(m, t1, t2, t3, t4) \ + SYSCALL_PT_ARG3(m, t1, t2, t3), m(t4, (regs->si)) +#define SYSCALL_PT_ARG3(m, t1, t2, t3) \ + SYSCALL_PT_ARG2(m, t1, t2), m(t3, (regs->dx)) +#define SYSCALL_PT_ARG2(m, t1, t2) \ + SYSCALL_PT_ARG1(m, t1), m(t2, (regs->cx)) +#define SYSCALL_PT_ARG1(m, t1) m(t1, (regs->bx)) +#define SYSCALL_PT_ARGS(x, ...) SYSCALL_PT_ARG##x(__VA_ARGS__) + +#define __SC_COMPAT_CAST(t, a) \ + (__typeof(__builtin_choose_expr(__TYPE_IS_L(t), 0, 0U))) \ + (unsigned int)a + /* Mapping of registers to parameters for syscalls on i386 */ #define SC_IA32_REGS_TO_ARGS(x, ...) \ - __MAP(x,__SC_ARGS \ - ,,(unsigned int)regs->bx,,(unsigned int)regs->cx \ - ,,(unsigned int)regs->dx,,(unsigned int)regs->si \ - ,,(unsigned int)regs->di,,(unsigned int)regs->bp) + SYSCALL_PT_ARGS(x, __SC_COMPAT_CAST, \ + __MAP(x, __SC_TYPE, __VA_ARGS__)) \ #define __SYS_STUB0(abi, name) \ long __##abi##_##name(const struct pt_regs *regs); \ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index d63b02940747..12da7dfd5ef1 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -31,7 +31,9 @@ * In vm86 mode, the hardware frame is much longer still, so add 16 * bytes to make room for the real-mode segments. * - * x86_64 has a fixed-length stack frame. + * x86-64 has a fixed-length stack frame, but it depends on whether + * or not FRED is enabled. Future versions of FRED might make this + * dynamic, but for now it is always 2 words longer. */ #ifdef CONFIG_X86_32 # ifdef CONFIG_VM86 @@ -39,8 +41,12 @@ # else # define TOP_OF_KERNEL_STACK_PADDING 8 # endif -#else -# define TOP_OF_KERNEL_STACK_PADDING 0 +#else /* x86-64 */ +# ifdef CONFIG_X86_FRED +# define TOP_OF_KERNEL_STACK_PADDING (2 * 8) +# else +# define TOP_OF_KERNEL_STACK_PADDING 0 +# endif #endif /* diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 5f87f6b9b09e..abe3a8f22cbd 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -102,6 +102,35 @@ static inline void setup_node_to_cpumask_map(void) { } #include <asm-generic/topology.h> +/* Topology information */ +enum x86_topology_domains { + TOPO_SMT_DOMAIN, + TOPO_CORE_DOMAIN, + TOPO_MODULE_DOMAIN, + TOPO_TILE_DOMAIN, + TOPO_DIE_DOMAIN, + TOPO_DIEGRP_DOMAIN, + TOPO_PKG_DOMAIN, + TOPO_MAX_DOMAIN, +}; + +struct x86_topology_system { + unsigned int dom_shifts[TOPO_MAX_DOMAIN]; + unsigned int dom_size[TOPO_MAX_DOMAIN]; +}; + +extern struct x86_topology_system x86_topo_system; + +static inline unsigned int topology_get_domain_size(enum x86_topology_domains dom) +{ + return x86_topo_system.dom_size[dom]; +} + +static inline unsigned int topology_get_domain_shift(enum x86_topology_domains dom) +{ + return dom == TOPO_SMT_DOMAIN ? 0 : x86_topo_system.dom_shifts[dom - 1]; +} + extern const struct cpumask *cpu_coregroup_mask(int cpu); extern const struct cpumask *cpu_clustergroup_mask(int cpu); @@ -112,7 +141,42 @@ extern const struct cpumask *cpu_clustergroup_mask(int cpu); #define topology_core_id(cpu) (cpu_data(cpu).topo.core_id) #define topology_ppin(cpu) (cpu_data(cpu).ppin) -extern unsigned int __max_die_per_package; +#define topology_amd_node_id(cpu) (cpu_data(cpu).topo.amd_node_id) + +extern unsigned int __max_dies_per_package; +extern unsigned int __max_logical_packages; +extern unsigned int __max_threads_per_core; +extern unsigned int __num_threads_per_package; +extern unsigned int __num_cores_per_package; + +static inline unsigned int topology_max_packages(void) +{ + return __max_logical_packages; +} + +static inline unsigned int topology_max_dies_per_package(void) +{ + return __max_dies_per_package; +} + +static inline unsigned int topology_num_cores_per_package(void) +{ + return __num_cores_per_package; +} + +static inline unsigned int topology_num_threads_per_package(void) +{ + return __num_threads_per_package; +} + +#ifdef CONFIG_X86_LOCAL_APIC +int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level); +#else +static inline int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level) +{ + return 0; +} +#endif #ifdef CONFIG_SMP #define topology_cluster_id(cpu) (cpu_data(cpu).topo.l2c_id) @@ -121,12 +185,11 @@ extern unsigned int __max_die_per_package; #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) -extern unsigned int __max_logical_packages; -#define topology_max_packages() (__max_logical_packages) -static inline int topology_max_die_per_package(void) +static inline int topology_phys_to_logical_pkg(unsigned int pkg) { - return __max_die_per_package; + return topology_get_logical_id(pkg << x86_topo_system.dom_shifts[TOPO_PKG_DOMAIN], + TOPO_PKG_DOMAIN); } extern int __max_smt_threads; @@ -138,9 +201,12 @@ static inline int topology_max_smt_threads(void) #include <linux/cpu_smt.h> -int topology_update_package_map(unsigned int apicid, unsigned int cpu); -int topology_update_die_map(unsigned int dieid, unsigned int cpu); -int topology_phys_to_logical_pkg(unsigned int pkg); +extern unsigned int __amd_nodes_per_pkg; + +static inline unsigned int topology_amd_nodes_per_pkg(void) +{ + return __amd_nodes_per_pkg; +} extern struct cpumask __cpu_primary_thread_mask; #define cpu_primary_thread_mask ((const struct cpumask *)&__cpu_primary_thread_mask) @@ -153,16 +219,12 @@ static inline bool topology_is_primary_thread(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_primary_thread_mask); } + #else /* CONFIG_SMP */ -#define topology_max_packages() (1) -static inline int -topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; } -static inline int -topology_update_die_map(unsigned int dieid, unsigned int cpu) { return 0; } static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; } -static inline int topology_max_die_per_package(void) { return 1; } static inline int topology_max_smt_threads(void) { return 1; } static inline bool topology_is_primary_thread(unsigned int cpu) { return true; } +static inline unsigned int topology_amd_nodes_per_pkg(void) { return 1; } #endif /* !CONFIG_SMP */ static inline void arch_fix_phys_package_id(int num, u32 slot) diff --git a/arch/x86/include/asm/trap_pf.h b/arch/x86/include/asm/trap_pf.h index afa524325e55..a23a7b707b64 100644 --- a/arch/x86/include/asm/trap_pf.h +++ b/arch/x86/include/asm/trap_pf.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_TRAP_PF_H #define _ASM_X86_TRAP_PF_H +#include <linux/bits.h> + /* * Page fault error code bits: * @@ -13,16 +15,18 @@ * bit 5 == 1: protection keys block access * bit 6 == 1: shadow stack access fault * bit 15 == 1: SGX MMU page-fault + * bit 31 == 1: fault was due to RMP violation */ enum x86_pf_error_code { - X86_PF_PROT = 1 << 0, - X86_PF_WRITE = 1 << 1, - X86_PF_USER = 1 << 2, - X86_PF_RSVD = 1 << 3, - X86_PF_INSTR = 1 << 4, - X86_PF_PK = 1 << 5, - X86_PF_SHSTK = 1 << 6, - X86_PF_SGX = 1 << 15, + X86_PF_PROT = BIT(0), + X86_PF_WRITE = BIT(1), + X86_PF_USER = BIT(2), + X86_PF_RSVD = BIT(3), + X86_PF_INSTR = BIT(4), + X86_PF_PK = BIT(5), + X86_PF_SHSTK = BIT(6), + X86_PF_SGX = BIT(15), + X86_PF_RMP = BIT(31), }; #endif /* _ASM_X86_TRAP_PF_H */ diff --git a/arch/x86/include/asm/trapnr.h b/arch/x86/include/asm/trapnr.h index f5d2325aa0b7..8d1154cdf787 100644 --- a/arch/x86/include/asm/trapnr.h +++ b/arch/x86/include/asm/trapnr.h @@ -2,6 +2,18 @@ #ifndef _ASM_X86_TRAPNR_H #define _ASM_X86_TRAPNR_H +/* + * Event type codes used by FRED, Intel VT-x and AMD SVM + */ +#define EVENT_TYPE_EXTINT 0 // External interrupt +#define EVENT_TYPE_RESERVED 1 +#define EVENT_TYPE_NMI 2 // NMI +#define EVENT_TYPE_HWEXC 3 // Hardware originated traps, exceptions +#define EVENT_TYPE_SWINT 4 // INT n +#define EVENT_TYPE_PRIV_SWEXC 5 // INT1 +#define EVENT_TYPE_SWEXC 6 // INTO, INT3 +#define EVENT_TYPE_OTHER 7 // FRED SYSCALL/SYSENTER, VT-x MTF + /* Interrupts/Exceptions */ #define X86_TRAP_DE 0 /* Divide-by-zero */ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 5c367c1290c3..237dc8cdd12b 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -133,7 +133,7 @@ extern int __get_user_bad(void); #ifdef CONFIG_X86_32 #define __put_user_goto_u64(x, addr, label) \ - asm_volatile_goto("\n" \ + asm goto("\n" \ "1: movl %%eax,0(%1)\n" \ "2: movl %%edx,4(%1)\n" \ _ASM_EXTABLE_UA(1b, %l2) \ @@ -295,7 +295,7 @@ do { \ } while (0) #define __get_user_asm(x, addr, itype, ltype, label) \ - asm_volatile_goto("\n" \ + asm_goto_output("\n" \ "1: mov"itype" %[umem],%[output]\n" \ _ASM_EXTABLE_UA(1b, %l2) \ : [output] ltype(x) \ @@ -375,7 +375,7 @@ do { \ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ - asm_volatile_goto("\n" \ + asm_goto_output("\n" \ "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\ _ASM_EXTABLE_UA(1b, %l[label]) \ : CC_OUT(z) (success), \ @@ -394,7 +394,7 @@ do { \ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ - asm_volatile_goto("\n" \ + asm_goto_output("\n" \ "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n" \ _ASM_EXTABLE_UA(1b, %l[label]) \ : CC_OUT(z) (success), \ @@ -477,7 +477,7 @@ struct __large_struct { unsigned long buf[100]; }; * aliasing issues. */ #define __put_user_goto(x, addr, itype, ltype, label) \ - asm_volatile_goto("\n" \ + asm goto("\n" \ "1: mov"itype" %0,%1\n" \ _ASM_EXTABLE_UA(1b, %l2) \ : : ltype(x), "m" (__m(addr)) \ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 0e73616b82f3..4dba17363008 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -17,6 +17,7 @@ #include <linux/types.h> #include <uapi/asm/vmx.h> +#include <asm/trapnr.h> #include <asm/vmxfeatures.h> #define VMCS_CONTROL_BIT(x) BIT(VMX_FEATURE_##x & 0x1f) @@ -374,14 +375,14 @@ enum vmcs_field { #define VECTORING_INFO_DELIVER_CODE_MASK INTR_INFO_DELIVER_CODE_MASK #define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK -#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ -#define INTR_TYPE_RESERVED (1 << 8) /* reserved */ -#define INTR_TYPE_NMI_INTR (2 << 8) /* NMI */ -#define INTR_TYPE_HARD_EXCEPTION (3 << 8) /* processor exception */ -#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ -#define INTR_TYPE_PRIV_SW_EXCEPTION (5 << 8) /* ICE breakpoint - undocumented */ -#define INTR_TYPE_SOFT_EXCEPTION (6 << 8) /* software exception */ -#define INTR_TYPE_OTHER_EVENT (7 << 8) /* other event */ +#define INTR_TYPE_EXT_INTR (EVENT_TYPE_EXTINT << 8) /* external interrupt */ +#define INTR_TYPE_RESERVED (EVENT_TYPE_RESERVED << 8) /* reserved */ +#define INTR_TYPE_NMI_INTR (EVENT_TYPE_NMI << 8) /* NMI */ +#define INTR_TYPE_HARD_EXCEPTION (EVENT_TYPE_HWEXC << 8) /* processor exception */ +#define INTR_TYPE_SOFT_INTR (EVENT_TYPE_SWINT << 8) /* software interrupt */ +#define INTR_TYPE_PRIV_SW_EXCEPTION (EVENT_TYPE_PRIV_SWEXC << 8) /* ICE breakpoint */ +#define INTR_TYPE_SOFT_EXCEPTION (EVENT_TYPE_SWEXC << 8) /* software exception */ +#define INTR_TYPE_OTHER_EVENT (EVENT_TYPE_OTHER << 8) /* other event */ /* GUEST_INTERRUPTIBILITY_INFO flags. */ #define GUEST_INTR_STATE_STI 0x00000001 diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index ab60a71a8dcb..472f0263dbc6 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -4,6 +4,7 @@ #include <linux/seqlock.h> #include <uapi/asm/vsyscall.h> +#include <asm/page_types.h> #ifdef CONFIG_X86_VSYSCALL_EMULATION extern void map_vsyscall(void); @@ -24,4 +25,13 @@ static inline bool emulate_vsyscall(unsigned long error_code, } #endif +/* + * The (legacy) vsyscall page is the long page in the kernel portion + * of the address space that has user-accessible permissions. + */ +static inline bool is_vsyscall_vaddr(unsigned long vaddr) +{ + return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR); +} + #endif /* _ASM_X86_VSYSCALL_H */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index c878616a18b8..9ca624749176 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -15,13 +15,15 @@ struct irq_domain; /** * struct x86_init_mpparse - platform specific mpparse ops * @setup_ioapic_ids: platform specific ioapic id override - * @find_smp_config: find the smp configuration - * @get_smp_config: get the smp configuration + * @find_mptable: Find MPTABLE early to reserve the memory region + * @early_parse_smp_cfg: Parse the SMP configuration data early before initmem_init() + * @parse_smp_cfg: Parse the SMP configuration data */ struct x86_init_mpparse { void (*setup_ioapic_ids)(void); - void (*find_smp_config)(void); - void (*get_smp_config)(unsigned int early); + void (*find_mptable)(void); + void (*early_parse_smp_cfg)(void); + void (*parse_smp_cfg)(void); }; /** diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index d898432947ff..f1a4adc78272 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -139,6 +139,13 @@ #define X86_CR4_LAM_SUP_BIT 28 /* LAM for supervisor pointers */ #define X86_CR4_LAM_SUP _BITUL(X86_CR4_LAM_SUP_BIT) +#ifdef __x86_64__ +#define X86_CR4_FRED_BIT 32 /* enable FRED kernel entry */ +#define X86_CR4_FRED _BITUL(X86_CR4_FRED_BIT) +#else +#define X86_CR4_FRED (0) +#endif + /* * x86-64 Task Priority Register, CR8 */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0000325ab98f..d0c744cb2a0e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -33,6 +33,7 @@ KASAN_SANITIZE_sev.o := n KCSAN_SANITIZE := n KMSAN_SANITIZE_head$(BITS).o := n KMSAN_SANITIZE_nmi.o := n +KMSAN_SANITIZE_sev.o := n # If instrumentation of the following files is enabled, boot hangs during # first second. @@ -48,6 +49,7 @@ obj-y += platform-quirks.o obj-y += process_$(BITS).o signal.o signal_$(BITS).o obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o dumpstack.o nmi.o +obj-$(CONFIG_X86_FRED) += fred.o obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o obj-$(CONFIG_X86_KERNEL_IBT) += ibt_selftest.o obj-y += setup.o x86_init.o i8259.o irqinit.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 85a3ce2a3666..4bf82dbd2a6b 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -164,35 +164,6 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) return 0; } -/** - * acpi_register_lapic - register a local apic and generates a logic cpu number - * @id: local apic id to register - * @acpiid: ACPI id to register - * @enabled: this cpu is enabled or not - * - * Returns the logic cpu number which maps to the local apic - */ -static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) -{ - int cpu; - - if (id >= MAX_LOCAL_APIC) { - pr_info("skipped apicid that is too big\n"); - return -EINVAL; - } - - if (!enabled) { - ++disabled_cpus; - return -EINVAL; - } - - cpu = generic_processor_info(id); - if (cpu >= 0) - early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid; - - return cpu; -} - static bool __init acpi_is_processor_usable(u32 lapic_flags) { if (lapic_flags & ACPI_MADT_ENABLED) @@ -254,7 +225,7 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) return 0; } - acpi_register_lapic(apic_id, processor->uid, enabled); + topology_register_apic(apic_id, processor->uid, enabled); #else pr_warn("x2apic entry ignored\n"); #endif @@ -289,9 +260,9 @@ acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end) * to not preallocating memory for all NR_CPUS * when we use CPU hotplug. */ - acpi_register_lapic(processor->id, /* APIC ID */ - processor->processor_id, /* ACPI ID */ - processor->lapic_flags & ACPI_MADT_ENABLED); + topology_register_apic(processor->id, /* APIC ID */ + processor->processor_id, /* ACPI ID */ + processor->lapic_flags & ACPI_MADT_ENABLED); has_lapic_cpus = true; return 0; @@ -309,9 +280,9 @@ acpi_parse_sapic(union acpi_subtable_headers *header, const unsigned long end) acpi_table_print_madt_entry(&header->common); - acpi_register_lapic((processor->id << 8) | processor->eid,/* APIC ID */ - processor->processor_id, /* ACPI ID */ - processor->lapic_flags & ACPI_MADT_ENABLED); + topology_register_apic((processor->id << 8) | processor->eid,/* APIC ID */ + processor->processor_id, /* ACPI ID */ + processor->lapic_flags & ACPI_MADT_ENABLED); return 0; } @@ -844,12 +815,10 @@ static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) return 0; } -int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id, - int *pcpu) +int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id, int *pcpu) { - int cpu; + int cpu = topology_hotplug_apic(physid, acpi_id); - cpu = acpi_register_lapic(physid, acpi_id, ACPI_MADT_ENABLED); if (cpu < 0) { pr_info("Unable to map lapic to logical cpu number\n"); return cpu; @@ -868,15 +837,11 @@ int acpi_unmap_cpu(int cpu) #ifdef CONFIG_ACPI_NUMA set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE); #endif - - per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID; - set_cpu_present(cpu, false); - num_processors--; - - return (0); + topology_hotunplug_apic(cpu); + return 0; } EXPORT_SYMBOL(acpi_unmap_cpu); -#endif /* CONFIG_ACPI_HOTPLUG_CPU */ +#endif /* CONFIG_ACPI_HOTPLUG_CPU */ int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base) { diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index cc130b57542a..1d85cb7071cb 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -403,7 +403,7 @@ noinstr void BUG_func(void) { BUG(); } -EXPORT_SYMBOL_GPL(BUG_func); +EXPORT_SYMBOL(BUG_func); #define CALL_RIP_REL_OPCODE 0xff #define CALL_RIP_REL_MODRM 0x15 diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 053f6dcc6b2c..5bf5f9fc5753 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -386,7 +386,7 @@ struct resource *amd_get_mmconfig_range(struct resource *res) int amd_get_subcaches(int cpu) { - struct pci_dev *link = node_to_amd_nb(topology_die_id(cpu))->link; + struct pci_dev *link = node_to_amd_nb(topology_amd_node_id(cpu))->link; unsigned int mask; if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) @@ -400,7 +400,7 @@ int amd_get_subcaches(int cpu) int amd_set_subcaches(int cpu, unsigned long mask) { static unsigned int reset, ban; - struct amd_northbridge *nb = node_to_amd_nb(topology_die_id(cpu)); + struct amd_northbridge *nb = node_to_amd_nb(topology_amd_node_id(cpu)); unsigned int reg; int cuid; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 4667bc4b00ab..a42d8a6f7149 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -19,6 +19,7 @@ #include <linux/kernel_stat.h> #include <linux/mc146818rtc.h> #include <linux/acpi_pmtmr.h> +#include <linux/bitmap.h> #include <linux/clockchips.h> #include <linux/interrupt.h> #include <linux/memblock.h> @@ -67,10 +68,6 @@ #include "local.h" -unsigned int num_processors; - -unsigned disabled_cpus; - /* Processor that is doing the boot up */ u32 boot_cpu_physical_apicid __ro_after_init = BAD_APICID; EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid); @@ -78,18 +75,6 @@ EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid); u8 boot_cpu_apic_version __ro_after_init; /* - * Bitmask of physically existing CPUs: - */ -physid_mask_t phys_cpu_present_map; - -/* - * Processor to be disabled specified by kernel parameter - * disable_cpu_apicid=<int>, mostly used for the kdump 2nd kernel to - * avoid undefined behaviour caused by sending INIT from AP to BSP. - */ -static u32 disabled_cpu_apicid __ro_after_init = BAD_APICID; - -/* * This variable controls which CPUs receive external NMIs. By default, * external NMIs are delivered only to the BSP. */ @@ -108,14 +93,6 @@ static inline bool apic_accessible(void) return x2apic_mode || apic_mmio_base; } -/* - * Map cpu index to physical APIC ID - */ -DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid, BAD_APICID); -DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, U32_MAX); -EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); -EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid); - #ifdef CONFIG_X86_32 /* Local APIC was disabled by the BIOS and enabled by the kernel */ static int enabled_via_apicbase __ro_after_init; @@ -261,16 +238,6 @@ u64 native_apic_icr_read(void) return icr1 | ((u64)icr2 << 32); } -#ifdef CONFIG_X86_32 -/** - * get_physical_broadcast - Get number of physical broadcast IDs - */ -int get_physical_broadcast(void) -{ - return modern_apic() ? 0xff : 0xf; -} -#endif - /** * lapic_get_maxlvt - get the maximum number of local vector table entries */ @@ -1549,9 +1516,6 @@ static void setup_local_APIC(void) apic_write(APIC_ESR, 0); } #endif - /* Validate that the APIC is registered if required */ - BUG_ON(apic->apic_id_registered && !apic->apic_id_registered()); - /* * Intel recommends to set DFR, LDR and TPR before enabling * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel @@ -1690,8 +1654,6 @@ void apic_ap_setup(void) end_local_APIC_setup(); } -static __init void cpu_set_boot_apic(void); - static __init void apic_read_boot_cpu_id(bool x2apic) { /* @@ -1706,7 +1668,8 @@ static __init void apic_read_boot_cpu_id(bool x2apic) boot_cpu_physical_apicid = read_apic_id(); boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); } - cpu_set_boot_apic(); + topology_register_boot_apic(boot_cpu_physical_apicid); + x86_32_probe_bigsmp_early(); } #ifdef CONFIG_X86_X2APIC @@ -2091,7 +2054,6 @@ void __init init_apic_mappings(void) pr_info("APIC: disable apic facility\n"); apic_disable(); } - num_processors = 1; } } @@ -2305,155 +2267,6 @@ void disconnect_bsp_APIC(int virt_wire_setup) apic_write(APIC_LVT1, value); } -/* - * The number of allocated logical CPU IDs. Since logical CPU IDs are allocated - * contiguously, it equals to current allocated max logical CPU ID plus 1. - * All allocated CPU IDs should be in the [0, nr_logical_cpuids) range, - * so the maximum of nr_logical_cpuids is nr_cpu_ids. - * - * NOTE: Reserve 0 for BSP. - */ -static int nr_logical_cpuids = 1; - -/* - * Used to store mapping between logical CPU IDs and APIC IDs. - */ -u32 cpuid_to_apicid[] = { [0 ... NR_CPUS - 1] = BAD_APICID, }; - -bool arch_match_cpu_phys_id(int cpu, u64 phys_id) -{ - return phys_id == (u64)cpuid_to_apicid[cpu]; -} - -#ifdef CONFIG_SMP -static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) -{ - /* Isolate the SMT bit(s) in the APICID and check for 0 */ - u32 mask = (1U << (fls(smp_num_siblings) - 1)) - 1; - - if (smp_num_siblings == 1 || !(apicid & mask)) - cpumask_set_cpu(cpu, &__cpu_primary_thread_mask); -} - -/* - * Due to the utter mess of CPUID evaluation smp_num_siblings is not valid - * during early boot. Initialize the primary thread mask before SMP - * bringup. - */ -static int __init smp_init_primary_thread_mask(void) -{ - unsigned int cpu; - - /* - * XEN/PV provides either none or useless topology information. - * Pretend that all vCPUs are primary threads. - */ - if (xen_pv_domain()) { - cpumask_copy(&__cpu_primary_thread_mask, cpu_possible_mask); - return 0; - } - - for (cpu = 0; cpu < nr_logical_cpuids; cpu++) - cpu_mark_primary_thread(cpu, cpuid_to_apicid[cpu]); - return 0; -} -early_initcall(smp_init_primary_thread_mask); -#else -static inline void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { } -#endif - -/* - * Should use this API to allocate logical CPU IDs to keep nr_logical_cpuids - * and cpuid_to_apicid[] synchronized. - */ -static int allocate_logical_cpuid(int apicid) -{ - int i; - - /* - * cpuid <-> apicid mapping is persistent, so when a cpu is up, - * check if the kernel has allocated a cpuid for it. - */ - for (i = 0; i < nr_logical_cpuids; i++) { - if (cpuid_to_apicid[i] == apicid) - return i; - } - - /* Allocate a new cpuid. */ - if (nr_logical_cpuids >= nr_cpu_ids) { - WARN_ONCE(1, "APIC: NR_CPUS/possible_cpus limit of %u reached. " - "Processor %d/0x%x and the rest are ignored.\n", - nr_cpu_ids, nr_logical_cpuids, apicid); - return -EINVAL; - } - - cpuid_to_apicid[nr_logical_cpuids] = apicid; - return nr_logical_cpuids++; -} - -static void cpu_update_apic(int cpu, u32 apicid) -{ -#if defined(CONFIG_SMP) || defined(CONFIG_X86_64) - early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; -#endif - set_cpu_possible(cpu, true); - physid_set(apicid, phys_cpu_present_map); - set_cpu_present(cpu, true); - num_processors++; - - if (system_state != SYSTEM_BOOTING) - cpu_mark_primary_thread(cpu, apicid); -} - -static __init void cpu_set_boot_apic(void) -{ - cpuid_to_apicid[0] = boot_cpu_physical_apicid; - cpu_update_apic(0, boot_cpu_physical_apicid); - x86_32_probe_bigsmp_early(); -} - -int generic_processor_info(int apicid) -{ - int cpu, max = nr_cpu_ids; - - /* The boot CPU must be set before MADT/MPTABLE parsing happens */ - if (cpuid_to_apicid[0] == BAD_APICID) - panic("Boot CPU APIC not registered yet\n"); - - if (apicid == boot_cpu_physical_apicid) - return 0; - - if (disabled_cpu_apicid == apicid) { - int thiscpu = num_processors + disabled_cpus; - - pr_warn("APIC: Disabling requested cpu. Processor %d/0x%x ignored.\n", - thiscpu, apicid); - - disabled_cpus++; - return -ENODEV; - } - - if (num_processors >= nr_cpu_ids) { - int thiscpu = max + disabled_cpus; - - pr_warn("APIC: NR_CPUS/possible_cpus limit of %i reached. " - "Processor %d/0x%x ignored.\n", max, thiscpu, apicid); - - disabled_cpus++; - return -EINVAL; - } - - cpu = allocate_logical_cpuid(apicid); - if (cpu < 0) { - disabled_cpus++; - return -EINVAL; - } - - cpu_update_apic(cpu, apicid); - return cpu; -} - - void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, bool dmar) { @@ -2496,10 +2309,7 @@ EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid); static void __init apic_bsp_up_setup(void) { -#ifdef CONFIG_X86_64 - apic_write(APIC_ID, apic->set_apic_id(boot_cpu_physical_apicid)); -#endif - physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); + reset_phys_cpu_present_map(boot_cpu_physical_apicid); } /** @@ -2845,15 +2655,6 @@ static int __init lapic_insert_resource(void) */ late_initcall(lapic_insert_resource); -static int __init apic_set_disabled_cpu_apicid(char *arg) -{ - if (!arg || !get_option(&arg, &disabled_cpu_apicid)) - return -EINVAL; - - return 0; -} -early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid); - static int __init apic_set_extnmi(char *arg) { if (!arg) diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c index 8a00141073ea..9ef3be866832 100644 --- a/arch/x86/kernel/apic/apic_common.c +++ b/arch/x86/kernel/apic/apic_common.c @@ -18,16 +18,6 @@ u32 apic_flat_calc_apicid(unsigned int cpu) return 1U << cpu; } -bool default_check_apicid_used(physid_mask_t *map, u32 apicid) -{ - return physid_isset(apicid, *map); -} - -void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) -{ - *retmap = *phys_map; -} - u32 default_cpu_present_to_apicid(int mps_cpu) { if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu)) @@ -37,11 +27,6 @@ u32 default_cpu_present_to_apicid(int mps_cpu) } EXPORT_SYMBOL_GPL(default_cpu_present_to_apicid); -bool default_apic_id_registered(void) -{ - return physid_isset(read_apic_id(), phys_cpu_present_map); -} - /* * Set up the logical destination ID when the APIC operates in logical * destination mode. diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index b295a056a4fc..f37ad3392fec 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -61,16 +61,6 @@ static u32 flat_get_apic_id(u32 x) return (x >> 24) & 0xFF; } -static u32 set_apic_id(u32 id) -{ - return (id & 0xFF) << 24; -} - -static u32 flat_phys_pkg_id(u32 initial_apic_id, int index_msb) -{ - return initial_apic_id >> index_msb; -} - static int flat_probe(void) { return 1; @@ -80,7 +70,6 @@ static struct apic apic_flat __ro_after_init = { .name = "flat", .probe = flat_probe, .acpi_madt_oem_check = flat_acpi_madt_oem_check, - .apic_id_registered = default_apic_id_registered, .dest_mode_logical = true, @@ -88,11 +77,9 @@ static struct apic apic_flat __ro_after_init = { .init_apic_ldr = default_init_apic_ldr, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = flat_phys_pkg_id, .max_apic_id = 0xFE, .get_apic_id = flat_get_apic_id, - .set_apic_id = set_apic_id, .calc_dest_apicid = apic_flat_calc_apicid, @@ -151,18 +138,15 @@ static struct apic apic_physflat __ro_after_init = { .name = "physical flat", .probe = physflat_probe, .acpi_madt_oem_check = physflat_acpi_madt_oem_check, - .apic_id_registered = default_apic_id_registered, .dest_mode_logical = false, .disable_esr = 0, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = flat_phys_pkg_id, .max_apic_id = 0xFE, .get_apic_id = flat_get_apic_id, - .set_apic_id = set_apic_id, .calc_dest_apicid = apic_default_calc_apicid, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 9f1d553eb48f..b5bb7a2e8340 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -29,7 +29,6 @@ static void noop_send_IPI_self(int vector) { } static void noop_apic_icr_write(u32 low, u32 id) { } static int noop_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip) { return -1; } static u64 noop_apic_icr_read(void) { return 0; } -static u32 noop_phys_pkg_id(u32 cpuid_apic, int index_msb) { return 0; } static u32 noop_get_apic_id(u32 apicid) { return 0; } static void noop_apic_eoi(void) { } @@ -51,12 +50,8 @@ struct apic apic_noop __ro_after_init = { .disable_esr = 0, - .check_apicid_used = default_check_apicid_used, - .ioapic_phys_id_map = default_ioapic_phys_id_map, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = noop_phys_pkg_id, - .max_apic_id = 0xFE, .get_apic_id = noop_get_apic_id, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 7d0c51b9d3bc..16410f087b7a 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -38,11 +38,6 @@ static u32 numachip1_get_apic_id(u32 x) return id; } -static u32 numachip1_set_apic_id(u32 id) -{ - return (id & 0xff) << 24; -} - static u32 numachip2_get_apic_id(u32 x) { u64 mcfg; @@ -51,16 +46,6 @@ static u32 numachip2_get_apic_id(u32 x) return ((mcfg >> (28 - 8)) & 0xfff00) | (x >> 24); } -static u32 numachip2_set_apic_id(u32 id) -{ - return id << 24; -} - -static u32 numachip_phys_pkg_id(u32 initial_apic_id, int index_msb) -{ - return initial_apic_id >> index_msb; -} - static void numachip1_apic_icr_write(int apicid, unsigned int val) { write_lcsr(CSR_G3_EXT_IRQ_GEN, (apicid << 16) | val); @@ -227,11 +212,9 @@ static const struct apic apic_numachip1 __refconst = { .disable_esr = 0, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = numachip_phys_pkg_id, .max_apic_id = UINT_MAX, .get_apic_id = numachip1_get_apic_id, - .set_apic_id = numachip1_set_apic_id, .calc_dest_apicid = apic_default_calc_apicid, @@ -263,11 +246,9 @@ static const struct apic apic_numachip2 __refconst = { .disable_esr = 0, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = numachip_phys_pkg_id, .max_apic_id = UINT_MAX, .get_apic_id = numachip2_get_apic_id, - .set_apic_id = numachip2_set_apic_id, .calc_dest_apicid = apic_default_calc_apicid, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 5a0d60b38e6b..9285d500d5b4 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -18,22 +18,6 @@ static u32 bigsmp_get_apic_id(u32 x) return (x >> 24) & 0xFF; } -static bool bigsmp_check_apicid_used(physid_mask_t *map, u32 apicid) -{ - return false; -} - -static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) -{ - /* For clustered we don't have a good way to do this yet - hack */ - physids_promote(0xFFL, retmap); -} - -static u32 bigsmp_phys_pkg_id(u32 cpuid_apic, int index_msb) -{ - return cpuid_apic >> index_msb; -} - static void bigsmp_send_IPI_allbutself(int vector) { default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); @@ -84,14 +68,10 @@ static struct apic apic_bigsmp __ro_after_init = { .disable_esr = 1, - .check_apicid_used = bigsmp_check_apicid_used, - .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = bigsmp_phys_pkg_id, .max_apic_id = 0xFE, .get_apic_id = bigsmp_get_apic_id, - .set_apic_id = NULL, .calc_dest_apicid = apic_default_calc_apicid, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 40c7cf180c20..477b740b2f26 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1458,20 +1458,20 @@ void restore_boot_irq_mode(void) * * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 */ -void __init setup_ioapic_ids_from_mpc_nocheck(void) +static void __init setup_ioapic_ids_from_mpc_nocheck(void) { + DECLARE_BITMAP(phys_id_present_map, MAX_LOCAL_APIC); + const u32 broadcast_id = 0xF; union IO_APIC_reg_00 reg_00; - physid_mask_t phys_id_present_map; - int ioapic_idx; - int i; unsigned char old_id; unsigned long flags; + int ioapic_idx, i; /* * This is broken; anything with a real cpu count has to * circumvent this idiocy regardless. */ - apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map); + copy_phys_cpu_present_map(phys_id_present_map); /* * Set the IOAPIC ID to the value stored in the MPC table. @@ -1484,11 +1484,10 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) old_id = mpc_ioapic_id(ioapic_idx); - if (mpc_ioapic_id(ioapic_idx) >= get_physical_broadcast()) { - printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", - ioapic_idx, mpc_ioapic_id(ioapic_idx)); - printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", - reg_00.bits.ID); + if (mpc_ioapic_id(ioapic_idx) >= broadcast_id) { + pr_err(FW_BUG "IO-APIC#%d ID is %d in the MPC table!...\n", + ioapic_idx, mpc_ioapic_id(ioapic_idx)); + pr_err("... fixing up to %d. (tell your hw vendor)\n", reg_00.bits.ID); ioapics[ioapic_idx].mp_config.apicid = reg_00.bits.ID; } @@ -1497,23 +1496,21 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) * system must have a unique ID or we get lots of nice * 'stuck on smp_invalidate_needed IPI wait' messages. */ - if (apic->check_apicid_used(&phys_id_present_map, - mpc_ioapic_id(ioapic_idx))) { - printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", - ioapic_idx, mpc_ioapic_id(ioapic_idx)); - for (i = 0; i < get_physical_broadcast(); i++) - if (!physid_isset(i, phys_id_present_map)) + if (test_bit(mpc_ioapic_id(ioapic_idx), phys_id_present_map)) { + pr_err(FW_BUG "IO-APIC#%d ID %d is already used!...\n", + ioapic_idx, mpc_ioapic_id(ioapic_idx)); + for (i = 0; i < broadcast_id; i++) + if (!test_bit(i, phys_id_present_map)) break; - if (i >= get_physical_broadcast()) + if (i >= broadcast_id) panic("Max APIC ID exceeded!\n"); - printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", - i); - physid_set(i, phys_id_present_map); + pr_err("... fixing up to %d. (tell your hw vendor)\n", i); + set_bit(i, phys_id_present_map); ioapics[ioapic_idx].mp_config.apicid = i; } else { apic_printk(APIC_VERBOSE, "Setting %d in the phys_id_present_map\n", mpc_ioapic_id(ioapic_idx)); - physid_set(mpc_ioapic_id(ioapic_idx), phys_id_present_map); + set_bit(mpc_ioapic_id(ioapic_idx), phys_id_present_map); } /* @@ -2209,7 +2206,7 @@ static inline void __init check_timer(void) * 8259A. */ if (pin1 == -1) { - panic_if_irq_remap("BIOS bug: timer not connected to IO-APIC"); + panic_if_irq_remap(FW_BUG "Timer not connected to IO-APIC"); pin1 = pin2; apic1 = apic2; no_pin1 = 1; @@ -2354,7 +2351,7 @@ static int mp_irqdomain_create(int ioapic) fwspec.param_count = 1; fwspec.param[0] = mpc_ioapic_id(ioapic); - parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY); + parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_GENERIC_MSI); if (!parent) { if (!cfg->dev) irq_domain_free_fwnode(fn); @@ -2494,56 +2491,41 @@ unsigned int arch_dynirq_lower_bound(unsigned int from) #ifdef CONFIG_X86_32 static int io_apic_get_unique_id(int ioapic, int apic_id) { + static DECLARE_BITMAP(apic_id_map, MAX_LOCAL_APIC); + const u32 broadcast_id = 0xF; union IO_APIC_reg_00 reg_00; - static physid_mask_t apic_id_map = PHYSID_MASK_NONE; - physid_mask_t tmp; unsigned long flags; int i = 0; - /* - * The P4 platform supports up to 256 APIC IDs on two separate APIC - * buses (one for LAPICs, one for IOAPICs), where predecessors only - * supports up to 16 on one shared APIC bus. - * - * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full - * advantage of new APIC bus architecture. - */ - - if (physids_empty(apic_id_map)) - apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map); + /* Initialize the ID map */ + if (bitmap_empty(apic_id_map, MAX_LOCAL_APIC)) + copy_phys_cpu_present_map(apic_id_map); raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(ioapic, 0); raw_spin_unlock_irqrestore(&ioapic_lock, flags); - if (apic_id >= get_physical_broadcast()) { - printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " - "%d\n", ioapic, apic_id, reg_00.bits.ID); + if (apic_id >= broadcast_id) { + pr_warn("IOAPIC[%d]: Invalid apic_id %d, trying %d\n", + ioapic, apic_id, reg_00.bits.ID); apic_id = reg_00.bits.ID; } - /* - * Every APIC in a system must have a unique ID or we get lots of nice - * 'stuck on smp_invalidate_needed IPI wait' messages. - */ - if (apic->check_apicid_used(&apic_id_map, apic_id)) { - - for (i = 0; i < get_physical_broadcast(); i++) { - if (!apic->check_apicid_used(&apic_id_map, i)) + /* Every APIC in a system must have a unique ID */ + if (test_bit(apic_id, apic_id_map)) { + for (i = 0; i < broadcast_id; i++) { + if (!test_bit(i, apic_id_map)) break; } - if (i == get_physical_broadcast()) + if (i == broadcast_id) panic("Max apic_id exceeded!\n"); - printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " - "trying %d\n", ioapic, apic_id, i); - + pr_warn("IOAPIC[%d]: apic_id %d already used, trying %d\n", ioapic, apic_id, i); apic_id = i; } - physid_set_mask_of_physid(apic_id, &tmp); - physids_or(apic_id_map, apic_id_map, tmp); + set_bit(apic_id, apic_id_map); if (reg_00.bits.ID != apic_id) { reg_00.bits.ID = apic_id; @@ -2569,11 +2551,9 @@ static int io_apic_get_unique_id(int ioapic, int apic_id) static u8 io_apic_unique_id(int idx, u8 id) { - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && - !APIC_XAPIC(boot_cpu_apic_version)) + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && !APIC_XAPIC(boot_cpu_apic_version)) return io_apic_get_unique_id(idx, id); - else - return id; + return id; } #else static u8 io_apic_unique_id(int idx, u8 id) diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h index 9ea6186ea88c..842fe28496be 100644 --- a/arch/x86/kernel/apic/local.h +++ b/arch/x86/kernel/apic/local.h @@ -16,8 +16,6 @@ /* X2APIC */ void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest); u32 x2apic_get_apic_id(u32 id); -u32 x2apic_set_apic_id(u32 id); -u32 x2apic_phys_pkg_id(u32 initial_apicid, int index_msb); void x2apic_send_IPI_all(int vector); void x2apic_send_IPI_allbutself(int vector); @@ -63,9 +61,6 @@ void default_send_IPI_allbutself(int vector); void default_send_IPI_all(int vector); void default_send_IPI_self(int vector); -bool default_apic_id_registered(void); -bool default_check_apicid_used(physid_mask_t *map, u32 apicid); - #ifdef CONFIG_X86_32 void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector); void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, int vector); diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index c0f78059f06a..f75ee345c02d 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -18,11 +18,6 @@ #include "local.h" -static u32 default_phys_pkg_id(u32 cpuid_apic, int index_msb) -{ - return cpuid_apic >> index_msb; -} - static u32 default_get_apic_id(u32 x) { unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); @@ -43,17 +38,13 @@ static struct apic apic_default __ro_after_init = { .name = "default", .probe = probe_default, - .apic_id_registered = default_apic_id_registered, .dest_mode_logical = true, .disable_esr = 0, - .check_apicid_used = default_check_apicid_used, .init_apic_ldr = default_init_apic_ldr, - .ioapic_phys_id_map = default_ioapic_phys_id_map, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = default_phys_pkg_id, .max_apic_id = 0xFE, .get_apic_id = default_get_apic_id, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 28a7d3f2312d..567dbd2fe4b6 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -231,16 +231,12 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .disable_esr = 0, - .check_apicid_used = NULL, .init_apic_ldr = init_x2apic_ldr, - .ioapic_phys_id_map = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = x2apic_phys_pkg_id, .max_apic_id = UINT_MAX, .x2apic_set_max_apicid = true, .get_apic_id = x2apic_get_apic_id, - .set_apic_id = x2apic_set_apic_id, .calc_dest_apicid = x2apic_calc_apicid, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 409815a40668..12d4c35547a6 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -129,16 +129,6 @@ u32 x2apic_get_apic_id(u32 id) return id; } -u32 x2apic_set_apic_id(u32 id) -{ - return id; -} - -u32 x2apic_phys_pkg_id(u32 initial_apicid, int index_msb) -{ - return initial_apicid >> index_msb; -} - static struct apic apic_x2apic_phys __ro_after_init = { .name = "physical x2apic", @@ -150,12 +140,10 @@ static struct apic apic_x2apic_phys __ro_after_init = { .disable_esr = 0, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = x2apic_phys_pkg_id, .max_apic_id = UINT_MAX, .x2apic_set_max_apicid = true, .get_apic_id = x2apic_get_apic_id, - .set_apic_id = x2apic_set_apic_id, .calc_dest_apicid = apic_default_calc_apicid, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index f1766b18dcd0..7fef504ca508 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -241,54 +241,20 @@ static void __init uv_tsc_check_sync(void) is_uv(UV3) ? sname.s3.field : \ undef) -/* [Copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */ - -#define SMT_LEVEL 0 /* Leaf 0xb SMT level */ -#define INVALID_TYPE 0 /* Leaf 0xb sub-leaf types */ -#define SMT_TYPE 1 -#define CORE_TYPE 2 -#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) -#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) - -static void set_x2apic_bits(void) -{ - unsigned int eax, ebx, ecx, edx, sub_index; - unsigned int sid_shift; - - cpuid(0, &eax, &ebx, &ecx, &edx); - if (eax < 0xb) { - pr_info("UV: CPU does not have CPUID.11\n"); - return; - } - - cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); - if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) { - pr_info("UV: CPUID.11 not implemented\n"); - return; - } - - sid_shift = BITS_SHIFT_NEXT_LEVEL(eax); - sub_index = 1; - do { - cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx); - if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) { - sid_shift = BITS_SHIFT_NEXT_LEVEL(eax); - break; - } - sub_index++; - } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); - - uv_cpuid.apicid_shift = 0; - uv_cpuid.apicid_mask = (~(-1 << sid_shift)); - uv_cpuid.socketid_shift = sid_shift; -} - static void __init early_get_apic_socketid_shift(void) { + unsigned int sid_shift = topology_get_domain_shift(TOPO_PKG_DOMAIN); + if (is_uv2_hub() || is_uv3_hub()) uvh_apicid.v = uv_early_read_mmr(UVH_APICID); - set_x2apic_bits(); + if (sid_shift) { + uv_cpuid.apicid_shift = 0; + uv_cpuid.apicid_mask = (~(-1 << sid_shift)); + uv_cpuid.socketid_shift = sid_shift; + } else { + pr_info("UV: CPU does not have valid CPUID.11\n"); + } pr_info("UV: apicid_shift:%d apicid_mask:0x%x\n", uv_cpuid.apicid_shift, uv_cpuid.apicid_mask); pr_info("UV: socketid_shift:%d pnode_mask:0x%x\n", uv_cpuid.socketid_shift, uv_cpuid.pnode_mask); @@ -779,21 +745,6 @@ static void uv_send_IPI_all(int vector) uv_send_IPI_mask(cpu_online_mask, vector); } -static u32 set_apic_id(u32 id) -{ - return id; -} - -static unsigned int uv_read_apic_id(void) -{ - return x2apic_get_apic_id(apic_read(APIC_ID)); -} - -static u32 uv_phys_pkg_id(u32 initial_apicid, int index_msb) -{ - return uv_read_apic_id() >> index_msb; -} - static int uv_probe(void) { return apic == &apic_x2apic_uv_x; @@ -810,11 +761,9 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .disable_esr = 0, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = uv_phys_pkg_id, .max_apic_id = UINT_MAX, .get_apic_id = x2apic_get_apic_id, - .set_apic_id = set_apic_id, .calc_dest_apicid = apic_default_calc_apicid, diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 93eabf544031..eb4dbcdf41f1 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -17,7 +17,8 @@ KMSAN_SANITIZE_common.o := n # As above, instrumenting secondary CPU boot code causes boot hangs. KCSAN_SANITIZE_common.o := n -obj-y := cacheinfo.o scattered.o topology.o +obj-y := cacheinfo.o scattered.o +obj-y += topology_common.o topology_ext.o topology_amd.o obj-y += common.o obj-y += rdrand.o obj-y += match.o @@ -25,14 +26,16 @@ obj-y += bugs.o obj-y += aperfmperf.o obj-y += cpuid-deps.o obj-y += umwait.o +obj-y += capflags.o powerflags.o -obj-$(CONFIG_PROC_FS) += proc.o -obj-y += capflags.o powerflags.o +obj-$(CONFIG_X86_LOCAL_APIC) += topology.o -obj-$(CONFIG_IA32_FEAT_CTL) += feat_ctl.o +obj-$(CONFIG_PROC_FS) += proc.o + +obj-$(CONFIG_IA32_FEAT_CTL) += feat_ctl.o ifdef CONFIG_CPU_SUP_INTEL -obj-y += intel.o intel_pconfig.o tsx.o -obj-$(CONFIG_PM) += intel_epb.o +obj-y += intel.o intel_pconfig.o tsx.o +obj-$(CONFIG_PM) += intel_epb.o endif obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_HYGON) += hygon.o diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c index bfeb18fad63f..2c5b51aad91a 100644 --- a/arch/x86/kernel/cpu/acrn.c +++ b/arch/x86/kernel/cpu/acrn.c @@ -26,8 +26,8 @@ static u32 __init acrn_detect(void) static void __init acrn_init_platform(void) { - /* Setup the IDT for ACRN hypervisor callback */ - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_acrn_hv_callback); + /* Install system interrupt handler for ACRN hypervisor callback */ + sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_acrn_hv_callback); x86_platform.calibrate_tsc = acrn_get_tsc_khz; x86_platform.calibrate_cpu = acrn_get_tsc_khz; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 9f42d1c59e09..f8ae10222fd0 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -20,6 +20,7 @@ #include <asm/delay.h> #include <asm/debugreg.h> #include <asm/resctrl.h> +#include <asm/sev.h> #ifdef CONFIG_X86_64 # include <asm/mmconfig.h> @@ -27,13 +28,6 @@ #include "cpu.h" -/* - * nodes_per_socket: Stores the number of nodes per socket. - * Refer to Fam15h Models 00-0fh BKDG - CPUID Fn8000_001E_ECX - * Node Identifiers[10:8] - */ -static u32 nodes_per_socket = 1; - static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) { u32 gprs[8] = { 0 }; @@ -300,97 +294,6 @@ static int nearby_node(int apicid) } #endif -/* - * Fix up topo::core_id for pre-F17h systems to be in the - * [0 .. cores_per_node - 1] range. Not really needed but - * kept so as not to break existing setups. - */ -static void legacy_fixup_core_id(struct cpuinfo_x86 *c) -{ - u32 cus_per_node; - - if (c->x86 >= 0x17) - return; - - cus_per_node = c->x86_max_cores / nodes_per_socket; - c->topo.core_id %= cus_per_node; -} - -/* - * Fixup core topology information for - * (1) AMD multi-node processors - * Assumption: Number of cores in each internal node is the same. - * (2) AMD processors supporting compute units - */ -static void amd_get_topology(struct cpuinfo_x86 *c) -{ - /* get information required for multi-node processors */ - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { - int err; - u32 eax, ebx, ecx, edx; - - cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - - c->topo.die_id = ecx & 0xff; - - if (c->x86 == 0x15) - c->topo.cu_id = ebx & 0xff; - - if (c->x86 >= 0x17) { - c->topo.core_id = ebx & 0xff; - - if (smp_num_siblings > 1) - c->x86_max_cores /= smp_num_siblings; - } - - /* - * In case leaf B is available, use it to derive - * topology information. - */ - err = detect_extended_topology(c); - if (!err) - c->x86_coreid_bits = get_count_order(c->x86_max_cores); - - cacheinfo_amd_init_llc_id(c); - - } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { - u64 value; - - rdmsrl(MSR_FAM10H_NODE_ID, value); - c->topo.die_id = value & 7; - c->topo.llc_id = c->topo.die_id; - } else - return; - - if (nodes_per_socket > 1) { - set_cpu_cap(c, X86_FEATURE_AMD_DCM); - legacy_fixup_core_id(c); - } -} - -/* - * On a AMD dual core setup the lower bits of the APIC id distinguish the cores. - * Assumes number of cores is a power of two. - */ -static void amd_detect_cmp(struct cpuinfo_x86 *c) -{ - unsigned bits; - - bits = c->x86_coreid_bits; - /* Low order bits define the core id (index of core in socket) */ - c->topo.core_id = c->topo.initial_apicid & ((1 << bits)-1); - /* Convert the initial APIC ID into the socket ID */ - c->topo.pkg_id = c->topo.initial_apicid >> bits; - /* use socket ID also for last level cache */ - c->topo.llc_id = c->topo.die_id = c->topo.pkg_id; -} - -u32 amd_get_nodes_per_socket(void) -{ - return nodes_per_socket; -} -EXPORT_SYMBOL_GPL(amd_get_nodes_per_socket); - static void srat_detect_node(struct cpuinfo_x86 *c) { #ifdef CONFIG_NUMA @@ -442,32 +345,6 @@ static void srat_detect_node(struct cpuinfo_x86 *c) #endif } -static void early_init_amd_mc(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - unsigned bits, ecx; - - /* Multi core CPU? */ - if (c->extended_cpuid_level < 0x80000008) - return; - - ecx = cpuid_ecx(0x80000008); - - c->x86_max_cores = (ecx & 0xff) + 1; - - /* CPU telling us the core id bits shift? */ - bits = (ecx >> 12) & 0xF; - - /* Otherwise recompute */ - if (bits == 0) { - while ((1 << bits) < c->x86_max_cores) - bits++; - } - - c->x86_coreid_bits = bits; -#endif -} - static void bsp_init_amd(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { @@ -500,18 +377,6 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_MWAITX)) use_mwaitx_delay(); - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { - u32 ecx; - - ecx = cpuid_ecx(0x8000001e); - __max_die_per_package = nodes_per_socket = ((ecx >> 8) & 7) + 1; - } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { - u64 value; - - rdmsrl(MSR_FAM10H_NODE_ID, value); - __max_die_per_package = nodes_per_socket = ((value >> 3) & 7) + 1; - } - if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) && !boot_cpu_has(X86_FEATURE_VIRT_SSBD) && c->x86 >= 0x15 && c->x86 <= 0x17) { @@ -538,7 +403,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) /* Figure out Zen generations: */ switch (c->x86) { - case 0x17: { + case 0x17: switch (c->x86_model) { case 0x00 ... 0x2f: case 0x50 ... 0x5f: @@ -554,8 +419,8 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) goto warn; } break; - } - case 0x19: { + + case 0x19: switch (c->x86_model) { case 0x00 ... 0x0f: case 0x20 ... 0x5f: @@ -569,11 +434,39 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) goto warn; } break; - } + + case 0x1a: + switch (c->x86_model) { + case 0x00 ... 0x0f: + case 0x20 ... 0x2f: + case 0x40 ... 0x4f: + case 0x70 ... 0x7f: + setup_force_cpu_cap(X86_FEATURE_ZEN5); + break; + default: + goto warn; + } + break; + default: break; } + if (cpu_has(c, X86_FEATURE_SEV_SNP)) { + /* + * RMP table entry format is not architectural and it can vary by processor + * and is defined by the per-processor PPR. Restrict SNP support on the + * known CPU model and family for which the RMP table entry format is + * currently defined for. + */ + if (!boot_cpu_has(X86_FEATURE_ZEN3) && + !boot_cpu_has(X86_FEATURE_ZEN4) && + !boot_cpu_has(X86_FEATURE_ZEN5)) + setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); + else if (!snp_probe_rmptable_info()) + setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); + } + return; warn: @@ -592,8 +485,8 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) * SME feature (set in scattered.c). * If the kernel has not enabled SME via any means then * don't advertise the SME feature. - * For SEV: If BIOS has not enabled SEV then don't advertise the - * SEV and SEV_ES feature (set in scattered.c). + * For SEV: If BIOS has not enabled SEV then don't advertise SEV and + * any additional functionality based on it. * * In all cases, since support for SME and SEV requires long mode, * don't advertise the feature under CONFIG_X86_32. @@ -628,6 +521,7 @@ clear_all: clear_sev: setup_clear_cpu_cap(X86_FEATURE_SEV); setup_clear_cpu_cap(X86_FEATURE_SEV_ES); + setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); } } @@ -636,8 +530,6 @@ static void early_init_amd(struct cpuinfo_x86 *c) u64 value; u32 dummy; - early_init_amd_mc(c); - if (c->x86 >= 0xf) set_cpu_cap(c, X86_FEATURE_K8); @@ -717,9 +609,6 @@ static void early_init_amd(struct cpuinfo_x86 *c) } } - if (cpu_has(c, X86_FEATURE_TOPOEXT)) - smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1; - if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_IBPB_BRTYPE)) { if (c->x86 == 0x17 && boot_cpu_has(X86_FEATURE_AMD_IBPB)) setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE); @@ -956,7 +845,6 @@ static void init_amd_zen_common(void) static void init_amd_zen1(struct cpuinfo_x86 *c) { - init_amd_zen_common(); fix_erratum_1386(c); /* Fix up CPUID bits, but only if not virtualised. */ @@ -1010,7 +898,6 @@ static void zen2_zenbleed_check(struct cpuinfo_x86 *c) static void init_amd_zen2(struct cpuinfo_x86 *c) { - init_amd_zen_common(); init_spectral_chicken(c); fix_erratum_1386(c); zen2_zenbleed_check(c); @@ -1018,8 +905,6 @@ static void init_amd_zen2(struct cpuinfo_x86 *c) static void init_amd_zen3(struct cpuinfo_x86 *c) { - init_amd_zen_common(); - if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { /* * Zen3 (Fam19 model < 0x10) parts are not susceptible to @@ -1033,12 +918,14 @@ static void init_amd_zen3(struct cpuinfo_x86 *c) static void init_amd_zen4(struct cpuinfo_x86 *c) { - init_amd_zen_common(); - if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT); } +static void init_amd_zen5(struct cpuinfo_x86 *c) +{ +} + static void init_amd(struct cpuinfo_x86 *c) { u64 vm_cr; @@ -1058,9 +945,6 @@ static void init_amd(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_FSRM)) set_cpu_cap(c, X86_FEATURE_FSRS); - /* get apicid instead of initial apic id from cpuid */ - c->topo.apicid = read_apic_id(); - /* K6s reports MCEs but don't actually have all the MSRs */ if (c->x86 < 6) clear_cpu_cap(c, X86_FEATURE_MCE); @@ -1076,6 +960,13 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x16: init_amd_jg(c); break; } + /* + * Save up on some future enablement work and do common Zen + * settings. + */ + if (c->x86 >= 0x17) + init_amd_zen_common(); + if (boot_cpu_has(X86_FEATURE_ZEN1)) init_amd_zen1(c); else if (boot_cpu_has(X86_FEATURE_ZEN2)) @@ -1084,6 +975,8 @@ static void init_amd(struct cpuinfo_x86 *c) init_amd_zen3(c); else if (boot_cpu_has(X86_FEATURE_ZEN4)) init_amd_zen4(c); + else if (boot_cpu_has(X86_FEATURE_ZEN5)) + init_amd_zen5(c); /* * Enable workaround for FXSAVE leak on CPUs @@ -1094,8 +987,6 @@ static void init_amd(struct cpuinfo_x86 *c) cpu_detect_cache_sizes(c); - amd_detect_cmp(c); - amd_get_topology(c); srat_detect_node(c); init_amd_cacheinfo(c); diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index bb0ab8466b91..48d049cd74e7 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -111,9 +111,6 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); /* Control unconditional IBPB in switch_mm() */ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); -/* Control MDS CPU buffer clear before returning to user space */ -DEFINE_STATIC_KEY_FALSE(mds_user_clear); -EXPORT_SYMBOL_GPL(mds_user_clear); /* Control MDS CPU buffer clear before idling (halt, mwait) */ DEFINE_STATIC_KEY_FALSE(mds_idle_clear); EXPORT_SYMBOL_GPL(mds_idle_clear); @@ -252,7 +249,7 @@ static void __init mds_select_mitigation(void) if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) mds_mitigation = MDS_MITIGATION_VMWERV; - static_branch_enable(&mds_user_clear); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) && (mds_nosmt || cpu_mitigations_auto_nosmt())) @@ -356,7 +353,7 @@ static void __init taa_select_mitigation(void) * For guests that can't determine whether the correct microcode is * present on host, enable the mitigation for UCODE_NEEDED as well. */ - static_branch_enable(&mds_user_clear); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); if (taa_nosmt || cpu_mitigations_auto_nosmt()) cpu_smt_disable(false); @@ -424,7 +421,7 @@ static void __init mmio_select_mitigation(void) */ if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM))) - static_branch_enable(&mds_user_clear); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); else static_branch_enable(&mmio_stale_data_clear); @@ -484,12 +481,12 @@ static void __init md_clear_update_mitigation(void) if (cpu_mitigations_off()) return; - if (!static_key_enabled(&mds_user_clear)) + if (!boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF)) goto out; /* - * mds_user_clear is now enabled. Update MDS, TAA and MMIO Stale Data - * mitigation, if necessary. + * X86_FEATURE_CLEAR_CPU_BUF is now enabled. Update MDS, TAA and MMIO + * Stale Data mitigation, if necessary. */ if (mds_mitigation == MDS_MITIGATION_OFF && boot_cpu_has_bug(X86_BUG_MDS)) { diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index c131c412db89..392d09c936d6 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -301,7 +301,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, eax->split.type = types[leaf]; eax->split.level = levels[leaf]; eax->split.num_threads_sharing = 0; - eax->split.num_cores_on_die = __this_cpu_read(cpu_info.x86_max_cores) - 1; + eax->split.num_cores_on_die = topology_num_cores_per_package(); if (assoc == 0xffff) @@ -595,7 +595,7 @@ static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) if (index < 3) return; - node = topology_die_id(smp_processor_id()); + node = topology_amd_node_id(smp_processor_id()); this_leaf->nb = node_to_amd_nb(node); if (this_leaf->nb && !this_leaf->nb->l3_cache.indices) amd_calc_l3_indices(this_leaf->nb); @@ -661,7 +661,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c) return i; } -void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c) +void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id) { /* * We may have multiple LLCs if L3 caches exist, so check if we @@ -672,7 +672,7 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c) if (c->x86 < 0x17) { /* LLC is at the node level. */ - c->topo.llc_id = c->topo.die_id; + c->topo.llc_id = die_id; } else if (c->x86 == 0x17 && c->x86_model <= 0x1F) { /* * LLC is at the core complex level. @@ -1118,15 +1118,16 @@ static void cache_cpu_init(void) unsigned long flags; local_irq_save(flags); - cache_disable(); - if (memory_caching_control & CACHE_MTRR) + if (memory_caching_control & CACHE_MTRR) { + cache_disable(); mtrr_generic_set_state(); + cache_enable(); + } if (memory_caching_control & CACHE_PAT) pat_cpu_init(); - cache_enable(); local_irq_restore(flags); } diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 345f7d905db6..a3b55db35c96 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -128,10 +128,6 @@ static void init_centaur(struct cpuinfo_x86 *c) #endif early_init_centaur(c); init_intel_cacheinfo(c); - detect_num_cpu_cores(c); -#ifdef CONFIG_X86_32 - detect_ht(c); -#endif if (c->cpuid_level > 9) { unsigned int eax = cpuid_eax(10); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0b97bcde70c6..e5d7dcaea209 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -61,6 +61,7 @@ #include <asm/microcode.h> #include <asm/intel-family.h> #include <asm/cpu_device_id.h> +#include <asm/fred.h> #include <asm/uv/uv.h> #include <asm/ia32.h> #include <asm/set_memory.h> @@ -73,8 +74,20 @@ u32 elf_hwcap2 __read_mostly; /* Number of siblings per CPU package */ -int smp_num_siblings = 1; -EXPORT_SYMBOL(smp_num_siblings); +unsigned int __max_threads_per_core __ro_after_init = 1; +EXPORT_SYMBOL(__max_threads_per_core); + +unsigned int __max_dies_per_package __ro_after_init = 1; +EXPORT_SYMBOL(__max_dies_per_package); + +unsigned int __max_logical_packages __ro_after_init = 1; +EXPORT_SYMBOL(__max_logical_packages); + +unsigned int __num_cores_per_package __ro_after_init = 1; +EXPORT_SYMBOL(__num_cores_per_package); + +unsigned int __num_threads_per_package __ro_after_init = 1; +EXPORT_SYMBOL(__num_threads_per_package); static struct ppin_info { int feature; @@ -382,9 +395,8 @@ out: } /* These bits should not change their value after CPU init is finished. */ -static const unsigned long cr4_pinned_mask = - X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | - X86_CR4_FSGSBASE | X86_CR4_CET; +static const unsigned long cr4_pinned_mask = X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | + X86_CR4_FSGSBASE | X86_CR4_CET | X86_CR4_FRED; static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning); static unsigned long cr4_pinned_bits __ro_after_init; @@ -790,19 +802,6 @@ static void get_model_name(struct cpuinfo_x86 *c) *(s + 1) = '\0'; } -void detect_num_cpu_cores(struct cpuinfo_x86 *c) -{ - unsigned int eax, ebx, ecx, edx; - - c->x86_max_cores = 1; - if (!IS_ENABLED(CONFIG_SMP) || c->cpuid_level < 4) - return; - - cpuid_count(4, 0, &eax, &ebx, &ecx, &edx); - if (eax & 0x1f) - c->x86_max_cores = (eax >> 26) + 1; -} - void cpu_detect_cache_sizes(struct cpuinfo_x86 *c) { unsigned int n, dummy, ebx, ecx, edx, l2size; @@ -864,51 +863,6 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c) tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]); } -int detect_ht_early(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - u32 eax, ebx, ecx, edx; - - if (!cpu_has(c, X86_FEATURE_HT)) - return -1; - - if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) - return -1; - - if (cpu_has(c, X86_FEATURE_XTOPOLOGY)) - return -1; - - cpuid(1, &eax, &ebx, &ecx, &edx); - - smp_num_siblings = (ebx & 0xff0000) >> 16; - if (smp_num_siblings == 1) - pr_info_once("CPU0: Hyper-Threading is disabled\n"); -#endif - return 0; -} - -void detect_ht(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - int index_msb, core_bits; - - if (detect_ht_early(c) < 0) - return; - - index_msb = get_count_order(smp_num_siblings); - c->topo.pkg_id = apic->phys_pkg_id(c->topo.initial_apicid, index_msb); - - smp_num_siblings = smp_num_siblings / c->x86_max_cores; - - index_msb = get_count_order(smp_num_siblings); - - core_bits = get_count_order(c->x86_max_cores); - - c->topo.core_id = apic->phys_pkg_id(c->topo.initial_apicid, index_msb) & - ((1 << core_bits) - 1); -#endif -} - static void get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; @@ -1355,8 +1309,13 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) /* * AMD's AutoIBRS is equivalent to Intel's eIBRS - use the Intel feature * flag and protect from vendor-specific bugs via the whitelist. + * + * Don't use AutoIBRS when SNP is enabled because it degrades host + * userspace indirect branch performance. */ - if ((ia32_cap & ARCH_CAP_IBRS_ALL) || cpu_has(c, X86_FEATURE_AUTOIBRS)) { + if ((ia32_cap & ARCH_CAP_IBRS_ALL) || + (cpu_has(c, X86_FEATURE_AUTOIBRS) && + !cpu_feature_enabled(X86_FEATURE_SEV_SNP))) { setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); if (!cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) && !(ia32_cap & ARCH_CAP_PBRSB_NO)) @@ -1589,8 +1548,11 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) get_cpu_vendor(c); get_cpu_cap(c); setup_force_cpu_cap(X86_FEATURE_CPUID); + get_cpu_address_sizes(c); cpu_parse_early_param(); + cpu_init_topology(c); + if (this_cpu->c_early_init) this_cpu->c_early_init(c); @@ -1601,10 +1563,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) this_cpu->c_bsp_init(c); } else { setup_clear_cpu_cap(X86_FEATURE_CPUID); + get_cpu_address_sizes(c); + cpu_init_topology(c); } - get_cpu_address_sizes(c); - setup_force_cpu_cap(X86_FEATURE_ALWAYS); cpu_set_bug_bits(c); @@ -1748,18 +1710,6 @@ static void generic_identify(struct cpuinfo_x86 *c) get_cpu_address_sizes(c); - if (c->cpuid_level >= 0x00000001) { - c->topo.initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; -#ifdef CONFIG_X86_32 -# ifdef CONFIG_SMP - c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0); -# else - c->topo.apicid = c->topo.initial_apicid; -# endif -#endif - c->topo.pkg_id = c->topo.initial_apicid; - } - get_model_name(c); /* Default name */ /* @@ -1781,29 +1731,6 @@ static void generic_identify(struct cpuinfo_x86 *c) } /* - * Validate that ACPI/mptables have the same information about the - * effective APIC id and update the package map. - */ -static void validate_apic_and_package_id(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - unsigned int cpu = smp_processor_id(); - u32 apicid; - - apicid = apic->cpu_present_to_apicid(cpu); - - if (apicid != c->topo.apicid) { - pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x APIC: %x\n", - cpu, apicid, c->topo.initial_apicid); - } - BUG_ON(topology_update_package_map(c->topo.pkg_id, cpu)); - BUG_ON(topology_update_die_map(c->topo.die_id, cpu)); -#else - c->topo.logical_pkg_id = 0; -#endif -} - -/* * This does the hard work of actually picking apart the CPU stuff... */ static void identify_cpu(struct cpuinfo_x86 *c) @@ -1816,11 +1743,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) c->x86_model = c->x86_stepping = 0; /* So far unknown... */ c->x86_vendor_id[0] = '\0'; /* Unset */ c->x86_model_id[0] = '\0'; /* Unset */ - c->x86_max_cores = 1; - c->x86_coreid_bits = 0; - c->topo.cu_id = 0xff; - c->topo.llc_id = BAD_APICID; - c->topo.l2c_id = BAD_APICID; #ifdef CONFIG_X86_64 c->x86_clflush_size = 64; c->x86_phys_bits = 36; @@ -1839,17 +1761,14 @@ static void identify_cpu(struct cpuinfo_x86 *c) generic_identify(c); + cpu_parse_topology(c); + if (this_cpu->c_identify) this_cpu->c_identify(c); /* Clear/Set all flags overridden by options, after probe */ apply_forced_caps(c); -#ifdef CONFIG_X86_64 - c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0); -#endif - - /* * Set default APIC and TSC_DEADLINE MSR fencing flag. AMD and * Hygon will clear it in ->c_init() below. @@ -1903,10 +1822,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) c->x86, c->x86_model); } -#ifdef CONFIG_X86_64 - detect_ht(c); -#endif - x86_init_rdrand(c); setup_pku(c); setup_cet(c); @@ -1998,7 +1913,6 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_32 enable_sep_cpu(); #endif - validate_apic_and_package_id(c); x86_spec_ctrl_setup_ap(); update_srbds_msr(); if (boot_cpu_has_bug(X86_BUG_GDS)) @@ -2067,10 +1981,8 @@ static void wrmsrl_cstar(unsigned long val) wrmsrl(MSR_CSTAR, val); } -/* May not be marked __init: used by software suspend */ -void syscall_init(void) +static inline void idt_syscall_init(void) { - wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); if (ia32_enabled()) { @@ -2104,6 +2016,23 @@ void syscall_init(void) X86_EFLAGS_AC|X86_EFLAGS_ID); } +/* May not be marked __init: used by software suspend */ +void syscall_init(void) +{ + /* The default user and kernel segments */ + wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); + + /* + * Except the IA32_STAR MSR, there is NO need to setup SYSCALL and + * SYSENTER MSRs for FRED, because FRED uses the ring 3 FRED + * entrypoint for SYSCALL and SYSENTER, and ERETU is the only legit + * instruction to return to ring 3 (both sysexit and sysret cause + * #UD when FRED is enabled). + */ + if (!cpu_feature_enabled(X86_FEATURE_FRED)) + idt_syscall_init(); +} + #else /* CONFIG_X86_64 */ #ifdef CONFIG_STACKPROTECTOR @@ -2207,8 +2136,9 @@ void cpu_init_exception_handling(void) /* paranoid_entry() gets the CPU number from the GDT */ setup_getcpu(cpu); - /* IST vectors need TSS to be set up. */ - tss_setup_ist(tss); + /* For IDT mode, IST vectors need to be set in TSS. */ + if (!cpu_feature_enabled(X86_FEATURE_FRED)) + tss_setup_ist(tss); tss_setup_io_bitmap(tss); set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); @@ -2217,8 +2147,10 @@ void cpu_init_exception_handling(void) /* GHCB needs to be setup to handle #VC. */ setup_ghcb(); - /* Finally load the IDT */ - load_current_idt(); + if (cpu_feature_enabled(X86_FEATURE_FRED)) + cpu_init_fred_exceptions(); + else + load_current_idt(); } /* @@ -2347,7 +2279,7 @@ void __init arch_cpu_finalize_init(void) * identify_boot_cpu() initialized SMT support information, let the * core code know. */ - cpu_smt_set_num_threads(smp_num_siblings, smp_num_siblings); + cpu_smt_set_num_threads(__max_threads_per_core, __max_threads_per_core); if (!IS_ENABLED(CONFIG_SMP)) { pr_info("CPU: "); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 885281ae79a5..ea9e07d57c8d 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -2,6 +2,11 @@ #ifndef ARCH_X86_CPU_H #define ARCH_X86_CPU_H +#include <asm/cpu.h> +#include <asm/topology.h> + +#include "topology.h" + /* attempt to consolidate cpu attributes */ struct cpu_dev { const char *c_vendor; @@ -71,14 +76,9 @@ extern void init_intel_cacheinfo(struct cpuinfo_x86 *c); extern void init_amd_cacheinfo(struct cpuinfo_x86 *c); extern void init_hygon_cacheinfo(struct cpuinfo_x86 *c); -extern void detect_num_cpu_cores(struct cpuinfo_x86 *c); -extern int detect_extended_topology_early(struct cpuinfo_x86 *c); -extern int detect_extended_topology(struct cpuinfo_x86 *c); -extern int detect_ht_early(struct cpuinfo_x86 *c); -extern void detect_ht(struct cpuinfo_x86 *c); extern void check_null_seg_clears_base(struct cpuinfo_x86 *c); -void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c); +void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id); void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c); unsigned int aperfmperf_get_khz(int cpu); @@ -96,4 +96,5 @@ static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode) mode == SPECTRE_V2_EIBRS_RETPOLINE || mode == SPECTRE_V2_EIBRS_LFENCE; } + #endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index e462c1d3800a..b7174209d855 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -82,6 +82,8 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, { X86_FEATURE_SHSTK, X86_FEATURE_XSAVES }, + { X86_FEATURE_FRED, X86_FEATURE_LKGS }, + { X86_FEATURE_FRED, X86_FEATURE_WRMSRNS }, {} }; diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c index 0c179d684b3b..3baf3e435834 100644 --- a/arch/x86/kernel/cpu/debugfs.c +++ b/arch/x86/kernel/cpu/debugfs.c @@ -5,6 +5,8 @@ #include <asm/apic.h> #include <asm/processor.h> +#include "cpu.h" + static int cpu_debug_show(struct seq_file *m, void *p) { unsigned long cpu = (unsigned long)m->private; @@ -24,9 +26,12 @@ static int cpu_debug_show(struct seq_file *m, void *p) seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id); seq_printf(m, "llc_id: %u\n", c->topo.llc_id); seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id); - seq_printf(m, "max_cores: %u\n", c->x86_max_cores); - seq_printf(m, "max_die_per_pkg: %u\n", __max_die_per_package); - seq_printf(m, "smp_num_siblings: %u\n", smp_num_siblings); + seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id); + seq_printf(m, "amd_nodes_per_pkg: %u\n", topology_amd_nodes_per_pkg()); + seq_printf(m, "num_threads: %u\n", __num_threads_per_package); + seq_printf(m, "num_cores: %u\n", __num_cores_per_package); + seq_printf(m, "max_dies_per_pkg: %u\n", __max_dies_per_package); + seq_printf(m, "max_threads_per_core:%u\n", __max_threads_per_core); return 0; } @@ -42,12 +47,48 @@ static const struct file_operations dfs_cpu_ops = { .release = single_release, }; +static int dom_debug_show(struct seq_file *m, void *p) +{ + static const char *domain_names[TOPO_MAX_DOMAIN] = { + [TOPO_SMT_DOMAIN] = "Thread", + [TOPO_CORE_DOMAIN] = "Core", + [TOPO_MODULE_DOMAIN] = "Module", + [TOPO_TILE_DOMAIN] = "Tile", + [TOPO_DIE_DOMAIN] = "Die", + [TOPO_DIEGRP_DOMAIN] = "DieGrp", + [TOPO_PKG_DOMAIN] = "Package", + }; + unsigned int dom, nthreads = 1; + + for (dom = 0; dom < TOPO_MAX_DOMAIN; dom++) { + nthreads *= x86_topo_system.dom_size[dom]; + seq_printf(m, "domain: %-10s shift: %u dom_size: %5u max_threads: %5u\n", + domain_names[dom], x86_topo_system.dom_shifts[dom], + x86_topo_system.dom_size[dom], nthreads); + } + return 0; +} + +static int dom_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, dom_debug_show, inode->i_private); +} + +static const struct file_operations dfs_dom_ops = { + .open = dom_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static __init int cpu_init_debugfs(void) { struct dentry *dir, *base = debugfs_create_dir("topo", arch_debugfs_dir); unsigned long id; char name[24]; + debugfs_create_file("domains", 0444, base, NULL, &dfs_dom_ops); + dir = debugfs_create_dir("cpus", base); for_each_possible_cpu(id) { sprintf(name, "%lu", id); diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index f0cd95502faa..c5191b06f9f2 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -18,14 +18,6 @@ #include "cpu.h" -#define APICID_SOCKET_ID_BIT 6 - -/* - * nodes_per_socket: Stores the number of nodes per socket. - * Refer to CPUID Fn8000_001E_ECX Node Identifiers[10:8] - */ -static u32 nodes_per_socket = 1; - #ifdef CONFIG_NUMA /* * To workaround broken NUMA config. Read the comment in @@ -49,80 +41,6 @@ static int nearby_node(int apicid) } #endif -static void hygon_get_topology_early(struct cpuinfo_x86 *c) -{ - if (cpu_has(c, X86_FEATURE_TOPOEXT)) - smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1; -} - -/* - * Fixup core topology information for - * (1) Hygon multi-node processors - * Assumption: Number of cores in each internal node is the same. - * (2) Hygon processors supporting compute units - */ -static void hygon_get_topology(struct cpuinfo_x86 *c) -{ - /* get information required for multi-node processors */ - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { - int err; - u32 eax, ebx, ecx, edx; - - cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - - c->topo.die_id = ecx & 0xff; - - c->topo.core_id = ebx & 0xff; - - if (smp_num_siblings > 1) - c->x86_max_cores /= smp_num_siblings; - - /* - * In case leaf B is available, use it to derive - * topology information. - */ - err = detect_extended_topology(c); - if (!err) - c->x86_coreid_bits = get_count_order(c->x86_max_cores); - - /* - * Socket ID is ApicId[6] for the processors with model <= 0x3 - * when running on host. - */ - if (!boot_cpu_has(X86_FEATURE_HYPERVISOR) && c->x86_model <= 0x3) - c->topo.pkg_id = c->topo.apicid >> APICID_SOCKET_ID_BIT; - - cacheinfo_hygon_init_llc_id(c); - } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { - u64 value; - - rdmsrl(MSR_FAM10H_NODE_ID, value); - c->topo.die_id = value & 7; - c->topo.llc_id = c->topo.die_id; - } else - return; - - if (nodes_per_socket > 1) - set_cpu_cap(c, X86_FEATURE_AMD_DCM); -} - -/* - * On Hygon setup the lower bits of the APIC id distinguish the cores. - * Assumes number of cores is a power of two. - */ -static void hygon_detect_cmp(struct cpuinfo_x86 *c) -{ - unsigned int bits; - - bits = c->x86_coreid_bits; - /* Low order bits define the core id (index of core in socket) */ - c->topo.core_id = c->topo.initial_apicid & ((1 << bits)-1); - /* Convert the initial APIC ID into the socket ID */ - c->topo.pkg_id = c->topo.initial_apicid >> bits; - /* Use package ID also for last level cache */ - c->topo.llc_id = c->topo.die_id = c->topo.pkg_id; -} - static void srat_detect_node(struct cpuinfo_x86 *c) { #ifdef CONFIG_NUMA @@ -173,32 +91,6 @@ static void srat_detect_node(struct cpuinfo_x86 *c) #endif } -static void early_init_hygon_mc(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - unsigned int bits, ecx; - - /* Multi core CPU? */ - if (c->extended_cpuid_level < 0x80000008) - return; - - ecx = cpuid_ecx(0x80000008); - - c->x86_max_cores = (ecx & 0xff) + 1; - - /* CPU telling us the core id bits shift? */ - bits = (ecx >> 12) & 0xF; - - /* Otherwise recompute */ - if (bits == 0) { - while ((1 << bits) < c->x86_max_cores) - bits++; - } - - c->x86_coreid_bits = bits; -#endif -} - static void bsp_init_hygon(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { @@ -212,18 +104,6 @@ static void bsp_init_hygon(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_MWAITX)) use_mwaitx_delay(); - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { - u32 ecx; - - ecx = cpuid_ecx(0x8000001e); - __max_die_per_package = nodes_per_socket = ((ecx >> 8) & 7) + 1; - } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { - u64 value; - - rdmsrl(MSR_FAM10H_NODE_ID, value); - __max_die_per_package = nodes_per_socket = ((value >> 3) & 7) + 1; - } - if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) && !boot_cpu_has(X86_FEATURE_VIRT_SSBD)) { /* @@ -242,8 +122,6 @@ static void early_init_hygon(struct cpuinfo_x86 *c) { u32 dummy; - early_init_hygon_mc(c); - set_cpu_cap(c, X86_FEATURE_K8); rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); @@ -284,8 +162,6 @@ static void early_init_hygon(struct cpuinfo_x86 *c) * we can set it unconditionally. */ set_cpu_cap(c, X86_FEATURE_VMMCALL); - - hygon_get_topology_early(c); } static void init_hygon(struct cpuinfo_x86 *c) @@ -302,9 +178,6 @@ static void init_hygon(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_REP_GOOD); - /* get apicid instead of initial apic id from cpuid */ - c->topo.apicid = read_apic_id(); - /* * XXX someone from Hygon needs to confirm this DTRT * @@ -316,8 +189,6 @@ static void init_hygon(struct cpuinfo_x86 *c) cpu_detect_cache_sizes(c); - hygon_detect_cmp(c); - hygon_get_topology(c); srat_detect_node(c); init_hygon_cacheinfo(c); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index a927a8fc9624..be30d7fa2e66 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -184,6 +184,90 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c) return false; } +#define MSR_IA32_TME_ACTIVATE 0x982 + +/* Helpers to access TME_ACTIVATE MSR */ +#define TME_ACTIVATE_LOCKED(x) (x & 0x1) +#define TME_ACTIVATE_ENABLED(x) (x & 0x2) + +#define TME_ACTIVATE_POLICY(x) ((x >> 4) & 0xf) /* Bits 7:4 */ +#define TME_ACTIVATE_POLICY_AES_XTS_128 0 + +#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */ + +#define TME_ACTIVATE_CRYPTO_ALGS(x) ((x >> 48) & 0xffff) /* Bits 63:48 */ +#define TME_ACTIVATE_CRYPTO_AES_XTS_128 1 + +/* Values for mktme_status (SW only construct) */ +#define MKTME_ENABLED 0 +#define MKTME_DISABLED 1 +#define MKTME_UNINITIALIZED 2 +static int mktme_status = MKTME_UNINITIALIZED; + +static void detect_tme_early(struct cpuinfo_x86 *c) +{ + u64 tme_activate, tme_policy, tme_crypto_algs; + int keyid_bits = 0, nr_keyids = 0; + static u64 tme_activate_cpu0 = 0; + + rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate); + + if (mktme_status != MKTME_UNINITIALIZED) { + if (tme_activate != tme_activate_cpu0) { + /* Broken BIOS? */ + pr_err_once("x86/tme: configuration is inconsistent between CPUs\n"); + pr_err_once("x86/tme: MKTME is not usable\n"); + mktme_status = MKTME_DISABLED; + + /* Proceed. We may need to exclude bits from x86_phys_bits. */ + } + } else { + tme_activate_cpu0 = tme_activate; + } + + if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) { + pr_info_once("x86/tme: not enabled by BIOS\n"); + mktme_status = MKTME_DISABLED; + return; + } + + if (mktme_status != MKTME_UNINITIALIZED) + goto detect_keyid_bits; + + pr_info("x86/tme: enabled by BIOS\n"); + + tme_policy = TME_ACTIVATE_POLICY(tme_activate); + if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128) + pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy); + + tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate); + if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) { + pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n", + tme_crypto_algs); + mktme_status = MKTME_DISABLED; + } +detect_keyid_bits: + keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate); + nr_keyids = (1UL << keyid_bits) - 1; + if (nr_keyids) { + pr_info_once("x86/mktme: enabled by BIOS\n"); + pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids); + } else { + pr_info_once("x86/mktme: disabled by BIOS\n"); + } + + if (mktme_status == MKTME_UNINITIALIZED) { + /* MKTME is usable */ + mktme_status = MKTME_ENABLED; + } + + /* + * KeyID bits effectively lower the number of physical address + * bits. Update cpuinfo_x86::x86_phys_bits accordingly. + */ + c->x86_phys_bits -= keyid_bits; +} + static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; @@ -317,11 +401,11 @@ static void early_init_intel(struct cpuinfo_x86 *c) check_memory_type_self_snoop_errata(c); /* - * Get the number of SMT siblings early from the extended topology - * leaf, if available. Otherwise try the legacy SMT detection. + * Adjust the number of physical bits early because it affects the + * valid bits of the MTRR mask registers. */ - if (detect_extended_topology_early(c) < 0) - detect_ht_early(c); + if (cpu_has(c, X86_FEATURE_TME)) + detect_tme_early(c); } static void bsp_init_intel(struct cpuinfo_x86 *c) @@ -482,90 +566,6 @@ static void srat_detect_node(struct cpuinfo_x86 *c) #endif } -#define MSR_IA32_TME_ACTIVATE 0x982 - -/* Helpers to access TME_ACTIVATE MSR */ -#define TME_ACTIVATE_LOCKED(x) (x & 0x1) -#define TME_ACTIVATE_ENABLED(x) (x & 0x2) - -#define TME_ACTIVATE_POLICY(x) ((x >> 4) & 0xf) /* Bits 7:4 */ -#define TME_ACTIVATE_POLICY_AES_XTS_128 0 - -#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */ - -#define TME_ACTIVATE_CRYPTO_ALGS(x) ((x >> 48) & 0xffff) /* Bits 63:48 */ -#define TME_ACTIVATE_CRYPTO_AES_XTS_128 1 - -/* Values for mktme_status (SW only construct) */ -#define MKTME_ENABLED 0 -#define MKTME_DISABLED 1 -#define MKTME_UNINITIALIZED 2 -static int mktme_status = MKTME_UNINITIALIZED; - -static void detect_tme(struct cpuinfo_x86 *c) -{ - u64 tme_activate, tme_policy, tme_crypto_algs; - int keyid_bits = 0, nr_keyids = 0; - static u64 tme_activate_cpu0 = 0; - - rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate); - - if (mktme_status != MKTME_UNINITIALIZED) { - if (tme_activate != tme_activate_cpu0) { - /* Broken BIOS? */ - pr_err_once("x86/tme: configuration is inconsistent between CPUs\n"); - pr_err_once("x86/tme: MKTME is not usable\n"); - mktme_status = MKTME_DISABLED; - - /* Proceed. We may need to exclude bits from x86_phys_bits. */ - } - } else { - tme_activate_cpu0 = tme_activate; - } - - if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) { - pr_info_once("x86/tme: not enabled by BIOS\n"); - mktme_status = MKTME_DISABLED; - return; - } - - if (mktme_status != MKTME_UNINITIALIZED) - goto detect_keyid_bits; - - pr_info("x86/tme: enabled by BIOS\n"); - - tme_policy = TME_ACTIVATE_POLICY(tme_activate); - if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128) - pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy); - - tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate); - if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) { - pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n", - tme_crypto_algs); - mktme_status = MKTME_DISABLED; - } -detect_keyid_bits: - keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate); - nr_keyids = (1UL << keyid_bits) - 1; - if (nr_keyids) { - pr_info_once("x86/mktme: enabled by BIOS\n"); - pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids); - } else { - pr_info_once("x86/mktme: disabled by BIOS\n"); - } - - if (mktme_status == MKTME_UNINITIALIZED) { - /* MKTME is usable */ - mktme_status = MKTME_ENABLED; - } - - /* - * KeyID bits effectively lower the number of physical address - * bits. Update cpuinfo_x86::x86_phys_bits accordingly. - */ - c->x86_phys_bits -= keyid_bits; -} - static void init_cpuid_fault(struct cpuinfo_x86 *c) { u64 msr; @@ -603,24 +603,6 @@ static void init_intel(struct cpuinfo_x86 *c) intel_workarounds(c); - /* - * Detect the extended topology information if available. This - * will reinitialise the initial_apicid which will be used - * in init_intel_cacheinfo() - */ - detect_extended_topology(c); - - if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { - /* - * let's use the legacy cpuid vector 0x1 and 0x4 for topology - * detection. - */ - detect_num_cpu_cores(c); -#ifdef CONFIG_X86_32 - detect_ht(c); -#endif - } - init_intel_cacheinfo(c); if (c->cpuid_level > 9) { @@ -702,9 +684,6 @@ static void init_intel(struct cpuinfo_x86 *c) init_ia32_feat_ctl(c); - if (cpu_has(c, X86_FEATURE_TME)) - detect_tme(c); - init_intel_misc_features(c); split_lock_init(); diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 2b46eb0fdf3a..9a0133ef7e20 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -1231,7 +1231,7 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, return -ENODEV; if (is_shared_bank(bank)) { - nb = node_to_amd_nb(topology_die_id(cpu)); + nb = node_to_amd_nb(topology_amd_node_id(cpu)); /* threshold descriptor already initialized on this node? */ if (nb && nb->bank4) { @@ -1335,7 +1335,7 @@ static void threshold_remove_bank(struct threshold_bank *bank) * The last CPU on this node using the shared bank is going * away, remove that bank now. */ - nb = node_to_amd_nb(topology_die_id(smp_processor_id())); + nb = node_to_amd_nb(topology_amd_node_id(smp_processor_id())); nb->bank4 = NULL; } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index bc39252bc54f..b5cc557cfc37 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -46,6 +46,7 @@ #include <linux/hardirq.h> #include <linux/kexec.h> +#include <asm/fred.h> #include <asm/intel-family.h> #include <asm/processor.h> #include <asm/traps.h> @@ -2166,6 +2167,31 @@ DEFINE_IDTENTRY_MCE_USER(exc_machine_check) exc_machine_check_user(regs); local_db_restore(dr7); } + +#ifdef CONFIG_X86_FRED +/* + * When occurred on different ring level, i.e., from user or kernel + * context, #MCE needs to be handled on different stack: User #MCE + * on current task stack, while kernel #MCE on a dedicated stack. + * + * This is exactly how FRED event delivery invokes an exception + * handler: ring 3 event on level 0 stack, i.e., current task stack; + * ring 0 event on the #MCE dedicated stack specified in the + * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED machine check entry + * stub doesn't do stack switch. + */ +DEFINE_FREDENTRY_MCE(exc_machine_check) +{ + unsigned long dr7; + + dr7 = local_db_save(); + if (user_mode(regs)) + exc_machine_check_user(regs); + else + exc_machine_check_kernel(regs); + local_db_restore(dr7); +} +#endif #else /* 32bit unified entry point */ DEFINE_IDTENTRY_RAW(exc_machine_check) @@ -2431,7 +2457,7 @@ static void mce_enable_ce(void *all) __mcheck_cpu_init_timer(); } -static struct bus_type mce_subsys = { +static const struct bus_type mce_subsys = { .name = "machinecheck", .dev_name = "machinecheck", }; diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 72f0695c3dc1..94953d749475 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -430,11 +430,9 @@ static void trigger_thr_int(void *info) static u32 get_nbc_for_node(int node_id) { - struct cpuinfo_x86 *c = &boot_cpu_data; u32 cores_per_node; - cores_per_node = (c->x86_max_cores * smp_num_siblings) / amd_get_nodes_per_socket(); - + cores_per_node = topology_num_threads_per_package() / topology_amd_nodes_per_pkg(); return cores_per_node * node_id; } @@ -543,8 +541,8 @@ static void do_inject(void) if (boot_cpu_has(X86_FEATURE_AMD_DCM) && b == 4 && boot_cpu_data.x86 < 0x17) { - toggle_nb_mca_mst_cpu(topology_die_id(cpu)); - cpu = get_nbc_for_node(topology_die_id(cpu)); + toggle_nb_mca_mst_cpu(topology_amd_node_id(cpu)); + cpu = get_nbc_for_node(topology_amd_node_id(cpu)); } cpus_read_lock(); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 857e608af641..5f0414452b67 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -641,7 +641,7 @@ static __init void calc_llc_size_per_core(struct cpuinfo_x86 *c) { u64 llc_size = c->x86_cache_size * 1024ULL; - do_div(llc_size, c->x86_max_cores); + do_div(llc_size, topology_num_cores_per_package()); llc_size_per_core = (unsigned int)llc_size; } diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 01fa06dd06b6..45e0e70e238c 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -539,19 +539,18 @@ static void __init ms_hyperv_init_platform(void) */ x86_platform.apic_post_init = hyperv_init; hyperv_setup_mmu_ops(); - /* Setup the IDT for hypervisor callback */ - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_hyperv_callback); - /* Setup the IDT for reenlightenment notifications */ + /* Install system interrupt handler for hypervisor callback */ + sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback); + + /* Install system interrupt handler for reenlightenment notifications */ if (ms_hyperv.features & HV_ACCESS_REENLIGHTENMENT) { - alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR, - asm_sysvec_hyperv_reenlightenment); + sysvec_install(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment); } - /* Setup the IDT for stimer0 */ + /* Install system interrupt handler for stimer0 */ if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) { - alloc_intr_gate(HYPERV_STIMER0_VECTOR, - asm_sysvec_hyperv_stimer0); + sysvec_install(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0); } # ifdef CONFIG_SMP diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index d3524778a545..422a4ddc2ab7 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -108,6 +108,9 @@ static inline void k8_check_syscfg_dram_mod_en(void) (boot_cpu_data.x86 >= 0x0f))) return; + if (cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + return; + rdmsr(MSR_AMD64_SYSCFG, lo, hi); if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) { pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]" diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 19e0681f0435..83e40341583e 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -16,6 +16,7 @@ #define pr_fmt(fmt) "resctrl: " fmt +#include <linux/cpu.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/cacheinfo.h> @@ -25,8 +26,15 @@ #include <asm/resctrl.h> #include "internal.h" -/* Mutex to protect rdtgroup access. */ -DEFINE_MUTEX(rdtgroup_mutex); +/* + * rdt_domain structures are kfree()d when their last CPU goes offline, + * and allocated when the first CPU in a new domain comes online. + * The rdt_resource's domain list is updated when this happens. Readers of + * the domain list must either take cpus_read_lock(), or rely on an RCU + * read-side critical section, to avoid observing concurrent modification. + * All writers take this mutex: + */ +static DEFINE_MUTEX(domain_list_lock); /* * The cached resctrl_pqr_state is strictly per CPU and can never be @@ -136,15 +144,15 @@ static inline void cache_alloc_hsw_probe(void) { struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_L3]; struct rdt_resource *r = &hw_res->r_resctrl; - u32 l, h, max_cbm = BIT_MASK(20) - 1; + u64 max_cbm = BIT_ULL_MASK(20) - 1, l3_cbm_0; - if (wrmsr_safe(MSR_IA32_L3_CBM_BASE, max_cbm, 0)) + if (wrmsrl_safe(MSR_IA32_L3_CBM_BASE, max_cbm)) return; - rdmsr(MSR_IA32_L3_CBM_BASE, l, h); + rdmsrl(MSR_IA32_L3_CBM_BASE, l3_cbm_0); /* If all the bits were set in MSR, return success */ - if (l != max_cbm) + if (l3_cbm_0 != max_cbm) return; hw_res->num_closid = 4; @@ -231,9 +239,7 @@ static bool __get_mem_config_intel(struct rdt_resource *r) static bool __rdt_get_mem_config_amd(struct rdt_resource *r) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - union cpuid_0x10_3_eax eax; - union cpuid_0x10_x_edx edx; - u32 ebx, ecx, subleaf; + u32 eax, ebx, ecx, edx, subleaf; /* * Query CPUID_Fn80000020_EDX_x01 for MBA and @@ -241,9 +247,9 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r) */ subleaf = (r->rid == RDT_RESOURCE_SMBA) ? 2 : 1; - cpuid_count(0x80000020, subleaf, &eax.full, &ebx, &ecx, &edx.full); - hw_res->num_closid = edx.split.cos_max + 1; - r->default_ctrl = MAX_MBA_BW_AMD; + cpuid_count(0x80000020, subleaf, &eax, &ebx, &ecx, &edx); + hw_res->num_closid = edx + 1; + r->default_ctrl = 1 << eax; /* AMD does not use delay */ r->membw.delay_linear = false; @@ -512,6 +518,8 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) struct rdt_domain *d; int err; + lockdep_assert_held(&domain_list_lock); + d = rdt_find_domain(r, id, &add_pos); if (IS_ERR(d)) { pr_warn("Couldn't find cache id for CPU %d\n", cpu); @@ -545,11 +553,12 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) return; } - list_add_tail(&d->list, add_pos); + list_add_tail_rcu(&d->list, add_pos); err = resctrl_online_domain(r, d); if (err) { - list_del(&d->list); + list_del_rcu(&d->list); + synchronize_rcu(); domain_free(hw_dom); } } @@ -560,6 +569,8 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) struct rdt_hw_domain *hw_dom; struct rdt_domain *d; + lockdep_assert_held(&domain_list_lock); + d = rdt_find_domain(r, id, NULL); if (IS_ERR_OR_NULL(d)) { pr_warn("Couldn't find cache id for CPU %d\n", cpu); @@ -570,7 +581,8 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) cpumask_clear_cpu(cpu, &d->cpu_mask); if (cpumask_empty(&d->cpu_mask)) { resctrl_offline_domain(r, d); - list_del(&d->list); + list_del_rcu(&d->list); + synchronize_rcu(); /* * rdt_domain "d" is going to be freed below, so clear @@ -582,73 +594,47 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) return; } - - if (r == &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl) { - if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { - cancel_delayed_work(&d->mbm_over); - mbm_setup_overflow_handler(d, 0); - } - if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu && - has_busy_rmid(r, d)) { - cancel_delayed_work(&d->cqm_limbo); - cqm_setup_limbo_handler(d, 0); - } - } } static void clear_closid_rmid(int cpu) { struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state); - state->default_closid = 0; - state->default_rmid = 0; - state->cur_closid = 0; - state->cur_rmid = 0; - wrmsr(MSR_IA32_PQR_ASSOC, 0, 0); + state->default_closid = RESCTRL_RESERVED_CLOSID; + state->default_rmid = RESCTRL_RESERVED_RMID; + state->cur_closid = RESCTRL_RESERVED_CLOSID; + state->cur_rmid = RESCTRL_RESERVED_RMID; + wrmsr(MSR_IA32_PQR_ASSOC, RESCTRL_RESERVED_RMID, + RESCTRL_RESERVED_CLOSID); } -static int resctrl_online_cpu(unsigned int cpu) +static int resctrl_arch_online_cpu(unsigned int cpu) { struct rdt_resource *r; - mutex_lock(&rdtgroup_mutex); + mutex_lock(&domain_list_lock); for_each_capable_rdt_resource(r) domain_add_cpu(cpu, r); - /* The cpu is set in default rdtgroup after online. */ - cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); + mutex_unlock(&domain_list_lock); + clear_closid_rmid(cpu); - mutex_unlock(&rdtgroup_mutex); + resctrl_online_cpu(cpu); return 0; } -static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) -{ - struct rdtgroup *cr; - - list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { - if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) { - break; - } - } -} - -static int resctrl_offline_cpu(unsigned int cpu) +static int resctrl_arch_offline_cpu(unsigned int cpu) { - struct rdtgroup *rdtgrp; struct rdt_resource *r; - mutex_lock(&rdtgroup_mutex); + resctrl_offline_cpu(cpu); + + mutex_lock(&domain_list_lock); for_each_capable_rdt_resource(r) domain_remove_cpu(cpu, r); - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { - clear_childcpus(rdtgrp, cpu); - break; - } - } + mutex_unlock(&domain_list_lock); + clear_closid_rmid(cpu); - mutex_unlock(&rdtgroup_mutex); return 0; } @@ -968,7 +954,8 @@ static int __init resctrl_late_init(void) state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/resctrl/cat:online:", - resctrl_online_cpu, resctrl_offline_cpu); + resctrl_arch_online_cpu, + resctrl_arch_offline_cpu); if (state < 0) return state; @@ -992,8 +979,14 @@ late_initcall(resctrl_late_init); static void __exit resctrl_exit(void) { + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + cpuhp_remove_state(rdt_online); + rdtgroup_exit(); + + if (r->mon_capable) + rdt_put_mon_l3_config(); } __exitcall(resctrl_exit); diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index beccb0e87ba7..7997b47743a2 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -19,6 +19,8 @@ #include <linux/kernfs.h> #include <linux/seq_file.h> #include <linux/slab.h> +#include <linux/tick.h> + #include "internal.h" /* @@ -210,6 +212,9 @@ static int parse_line(char *line, struct resctrl_schema *s, struct rdt_domain *d; unsigned long dom_id; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) { rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); @@ -314,6 +319,9 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) struct rdt_domain *d; u32 idx; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) return -ENOMEM; @@ -379,11 +387,9 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, return -EINVAL; buf[nbytes - 1] = '\0'; - cpus_read_lock(); rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { rdtgroup_kn_unlock(of->kn); - cpus_read_unlock(); return -ENOENT; } rdt_last_cmd_clear(); @@ -445,7 +451,6 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, out: rdt_staged_configs_clear(); rdtgroup_kn_unlock(of->kn); - cpus_read_unlock(); return ret ?: nbytes; } @@ -465,6 +470,9 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo bool sep = false; u32 ctrl_val; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + seq_printf(s, "%*s:", max_name_width, schema->name); list_for_each_entry(dom, &r->domains, list) { if (sep) @@ -522,12 +530,24 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, return ret; } +static int smp_mon_event_count(void *arg) +{ + mon_event_count(arg); + + return 0; +} + void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, struct rdt_domain *d, struct rdtgroup *rdtgrp, int evtid, int first) { + int cpu; + + /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + /* - * setup the parameters to send to the IPI to read the data. + * Setup the parameters to pass to mon_event_count() to read the data. */ rr->rgrp = rdtgrp; rr->evtid = evtid; @@ -535,8 +555,26 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, rr->d = d; rr->val = 0; rr->first = first; + rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); + if (IS_ERR(rr->arch_mon_ctx)) { + rr->err = -EINVAL; + return; + } + + cpu = cpumask_any_housekeeping(&d->cpu_mask, RESCTRL_PICK_ANY_CPU); + + /* + * cpumask_any_housekeeping() prefers housekeeping CPUs, but + * are all the CPUs nohz_full? If yes, pick a CPU to IPI. + * MPAM's resctrl_arch_rmid_read() is unable to read the + * counters on some platforms if its called in IRQ context. + */ + if (tick_nohz_full_cpu(cpu)) + smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1); + else + smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); - smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1); + resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); } int rdtgroup_mondata_show(struct seq_file *m, void *arg) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index a4f1aa15f0a2..c99f26ebe7a6 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -7,6 +7,9 @@ #include <linux/kernfs.h> #include <linux/fs_context.h> #include <linux/jump_label.h> +#include <linux/tick.h> + +#include <asm/resctrl.h> #define L3_QOS_CDP_ENABLE 0x01ULL @@ -18,7 +21,6 @@ #define MBM_OVERFLOW_INTERVAL 1000 #define MAX_MBA_BW 100u #define MBA_IS_LINEAR 0x4 -#define MAX_MBA_BW_AMD 0x800 #define MBM_CNTR_WIDTH_OFFSET_AMD 20 #define RMID_VAL_ERROR BIT_ULL(63) @@ -54,6 +56,46 @@ /* Max event bits supported */ #define MAX_EVT_CONFIG_BITS GENMASK(6, 0) +/** + * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that + * aren't marked nohz_full + * @mask: The mask to pick a CPU from. + * @exclude_cpu:The CPU to avoid picking. + * + * Returns a CPU from @mask, but not @exclude_cpu. If there are housekeeping + * CPUs that don't use nohz_full, these are preferred. Pass + * RESCTRL_PICK_ANY_CPU to avoid excluding any CPUs. + * + * When a CPU is excluded, returns >= nr_cpu_ids if no CPUs are available. + */ +static inline unsigned int +cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu) +{ + unsigned int cpu, hk_cpu; + + if (exclude_cpu == RESCTRL_PICK_ANY_CPU) + cpu = cpumask_any(mask); + else + cpu = cpumask_any_but(mask, exclude_cpu); + + if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) + return cpu; + + /* If the CPU picked isn't marked nohz_full nothing more needs doing. */ + if (cpu < nr_cpu_ids && !tick_nohz_full_cpu(cpu)) + return cpu; + + /* Try to find a CPU that isn't nohz_full to use in preference */ + hk_cpu = cpumask_nth_andnot(0, mask, tick_nohz_full_mask); + if (hk_cpu == exclude_cpu) + hk_cpu = cpumask_nth_andnot(1, mask, tick_nohz_full_mask); + + if (hk_cpu < nr_cpu_ids) + cpu = hk_cpu; + + return cpu; +} + struct rdt_fs_context { struct kernfs_fs_context kfc; bool enable_cdpl2; @@ -69,9 +111,6 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) return container_of(kfc, struct rdt_fs_context, kfc); } -DECLARE_STATIC_KEY_FALSE(rdt_enable_key); -DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); - /** * struct mon_evt - Entry in the event list of a resource * @evtid: event id @@ -112,12 +151,12 @@ struct rmid_read { bool first; int err; u64 val; + void *arch_mon_ctx; }; -extern bool rdt_alloc_capable; -extern bool rdt_mon_capable; extern unsigned int rdt_mon_features; extern struct list_head resctrl_schema_all; +extern bool resctrl_mounted; enum rdt_group_type { RDTCTRL_GROUP = 0, @@ -296,14 +335,10 @@ struct rftype { * struct mbm_state - status for each MBM counter in each domain * @prev_bw_bytes: Previous bytes value read for bandwidth calculation * @prev_bw: The most recent bandwidth in MBps - * @delta_bw: Difference between the current and previous bandwidth - * @delta_comp: Indicates whether to compute the delta_bw */ struct mbm_state { u64 prev_bw_bytes; u32 prev_bw; - u32 delta_bw; - bool delta_comp; }; /** @@ -395,6 +430,8 @@ struct rdt_parse_data { * @msr_update: Function pointer to update QOS MSRs * @mon_scale: cqm counter * mon_scale = occupancy in bytes * @mbm_width: Monitor width, to detect and correct for overflow. + * @mbm_cfg_mask: Bandwidth sources that can be tracked when Bandwidth + * Monitoring Event Configuration (BMEC) is supported. * @cdp_enabled: CDP state of this resource * * Members of this structure are either private to the architecture @@ -409,6 +446,7 @@ struct rdt_hw_resource { struct rdt_resource *r); unsigned int mon_scale; unsigned int mbm_width; + unsigned int mbm_cfg_mask; bool cdp_enabled; }; @@ -426,8 +464,6 @@ extern struct mutex rdtgroup_mutex; extern struct rdt_hw_resource rdt_resources_all[]; extern struct rdtgroup rdtgroup_default; -DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); - extern struct dentry *debugfs_resctrl; enum resctrl_res_level { @@ -543,9 +579,10 @@ void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); int closids_supported(void); void closid_free(int closid); -int alloc_rmid(void); -void free_rmid(u32 rmid); +int alloc_rmid(u32 closid); +void free_rmid(u32 closid, u32 rmid); int rdt_get_mon_l3_config(struct rdt_resource *r); +void __exit rdt_put_mon_l3_config(void); bool __init rdt_cpu_has(int flag); void mon_event_count(void *info); int rdtgroup_mondata_show(struct seq_file *m, void *arg); @@ -553,17 +590,21 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, struct rdt_domain *d, struct rdtgroup *rdtgrp, int evtid, int first); void mbm_setup_overflow_handler(struct rdt_domain *dom, - unsigned long delay_ms); + unsigned long delay_ms, + int exclude_cpu); void mbm_handle_overflow(struct work_struct *work); void __init intel_rdt_mbm_apply_quirk(void); bool is_mba_sc(struct rdt_resource *r); -void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms, + int exclude_cpu); void cqm_handle_limbo(struct work_struct *work); -bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); +bool has_busy_rmid(struct rdt_domain *d); void __check_limbo(struct rdt_domain *d, bool force_free); void rdt_domain_reconfigure_cdp(struct rdt_resource *r); void __init thread_throttle_mode_init(void); void __init mbm_config_rftype_init(const char *config); void rdt_staged_configs_clear(void); +bool closid_allocated(unsigned int closid); +int resctrl_find_cleanest_closid(void); #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index f136ac046851..c34a35ec0f03 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -15,6 +15,7 @@ * Software Developer Manual June 2016, volume 3, section 17.17. */ +#include <linux/cpu.h> #include <linux/module.h> #include <linux/sizes.h> #include <linux/slab.h> @@ -24,7 +25,20 @@ #include "internal.h" +/** + * struct rmid_entry - dirty tracking for all RMID. + * @closid: The CLOSID for this entry. + * @rmid: The RMID for this entry. + * @busy: The number of domains with cached data using this RMID. + * @list: Member of the rmid_free_lru list when busy == 0. + * + * Depending on the architecture the correct monitor is accessed using + * both @closid and @rmid, or @rmid only. + * + * Take the rdtgroup_mutex when accessing. + */ struct rmid_entry { + u32 closid; u32 rmid; int busy; struct list_head list; @@ -38,6 +52,13 @@ struct rmid_entry { static LIST_HEAD(rmid_free_lru); /* + * @closid_num_dirty_rmid The number of dirty RMID each CLOSID has. + * Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined. + * Indexed by CLOSID. Protected by rdtgroup_mutex. + */ +static u32 *closid_num_dirty_rmid; + +/* * @rmid_limbo_count - count of currently unused but (potentially) * dirty RMIDs. * This counts RMIDs that no one is currently using but that @@ -136,12 +157,29 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) return val; } -static inline struct rmid_entry *__rmid_entry(u32 rmid) +/* + * x86 and arm64 differ in their handling of monitoring. + * x86's RMID are independent numbers, there is only one source of traffic + * with an RMID value of '1'. + * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of + * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID + * value is no longer unique. + * To account for this, resctrl uses an index. On x86 this is just the RMID, + * on arm64 it encodes the CLOSID and RMID. This gives a unique number. + * + * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code + * must accept an attempt to read every index. + */ +static inline struct rmid_entry *__rmid_entry(u32 idx) { struct rmid_entry *entry; + u32 closid, rmid; + + entry = &rmid_ptrs[idx]; + resctrl_arch_rmid_idx_decode(idx, &closid, &rmid); - entry = &rmid_ptrs[rmid]; - WARN_ON(entry->rmid != rmid); + WARN_ON_ONCE(entry->closid != closid); + WARN_ON_ONCE(entry->rmid != rmid); return entry; } @@ -190,7 +228,8 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom, } void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, - u32 rmid, enum resctrl_event_id eventid) + u32 unused, u32 rmid, + enum resctrl_event_id eventid) { struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); struct arch_mbm_state *am; @@ -230,7 +269,8 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) } int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, - u32 rmid, enum resctrl_event_id eventid, u64 *val) + u32 unused, u32 rmid, enum resctrl_event_id eventid, + u64 *val, void *ignored) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); @@ -238,6 +278,8 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, u64 msr_val, chunks; int ret; + resctrl_arch_rmid_read_context_check(); + if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask)) return -EINVAL; @@ -260,6 +302,17 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, return 0; } +static void limbo_release_entry(struct rmid_entry *entry) +{ + lockdep_assert_held(&rdtgroup_mutex); + + rmid_limbo_count--; + list_add_tail(&entry->list, &rmid_free_lru); + + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + closid_num_dirty_rmid[entry->closid]--; +} + /* * Check the RMIDs that are marked as busy for this domain. If the * reported LLC occupancy is below the threshold clear the busy bit and @@ -269,11 +322,20 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, void __check_limbo(struct rdt_domain *d, bool force_free) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); struct rmid_entry *entry; - u32 crmid = 1, nrmid; + u32 idx, cur_idx = 1; + void *arch_mon_ctx; bool rmid_dirty; u64 val = 0; + arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID); + if (IS_ERR(arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(arch_mon_ctx)); + return; + } + /* * Skip RMID 0 and start from RMID 1 and check all the RMIDs that * are marked as busy for occupancy < threshold. If the occupancy @@ -281,53 +343,125 @@ void __check_limbo(struct rdt_domain *d, bool force_free) * RMID and move it to the free list when the counter reaches 0. */ for (;;) { - nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid); - if (nrmid >= r->num_rmid) + idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx); + if (idx >= idx_limit) break; - entry = __rmid_entry(nrmid); - - if (resctrl_arch_rmid_read(r, d, entry->rmid, - QOS_L3_OCCUP_EVENT_ID, &val)) { + entry = __rmid_entry(idx); + if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid, + QOS_L3_OCCUP_EVENT_ID, &val, + arch_mon_ctx)) { rmid_dirty = true; } else { rmid_dirty = (val >= resctrl_rmid_realloc_threshold); } if (force_free || !rmid_dirty) { - clear_bit(entry->rmid, d->rmid_busy_llc); - if (!--entry->busy) { - rmid_limbo_count--; - list_add_tail(&entry->list, &rmid_free_lru); - } + clear_bit(idx, d->rmid_busy_llc); + if (!--entry->busy) + limbo_release_entry(entry); } - crmid = nrmid + 1; + cur_idx = idx + 1; } + + resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx); } -bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d) +bool has_busy_rmid(struct rdt_domain *d) { - return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid; + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); + + return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit; +} + +static struct rmid_entry *resctrl_find_free_rmid(u32 closid) +{ + struct rmid_entry *itr; + u32 itr_idx, cmp_idx; + + if (list_empty(&rmid_free_lru)) + return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC); + + list_for_each_entry(itr, &rmid_free_lru, list) { + /* + * Get the index of this free RMID, and the index it would need + * to be if it were used with this CLOSID. + * If the CLOSID is irrelevant on this architecture, the two + * index values are always the same on every entry and thus the + * very first entry will be returned. + */ + itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid); + cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid); + + if (itr_idx == cmp_idx) + return itr; + } + + return ERR_PTR(-ENOSPC); +} + +/** + * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated + * RMID are clean, or the CLOSID that has + * the most clean RMID. + * + * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID + * may not be able to allocate clean RMID. To avoid this the allocator will + * choose the CLOSID with the most clean RMID. + * + * When the CLOSID and RMID are independent numbers, the first free CLOSID will + * be returned. + */ +int resctrl_find_cleanest_closid(void) +{ + u32 cleanest_closid = ~0; + int i = 0; + + lockdep_assert_held(&rdtgroup_mutex); + + if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + return -EIO; + + for (i = 0; i < closids_supported(); i++) { + int num_dirty; + + if (closid_allocated(i)) + continue; + + num_dirty = closid_num_dirty_rmid[i]; + if (num_dirty == 0) + return i; + + if (cleanest_closid == ~0) + cleanest_closid = i; + + if (num_dirty < closid_num_dirty_rmid[cleanest_closid]) + cleanest_closid = i; + } + + if (cleanest_closid == ~0) + return -ENOSPC; + + return cleanest_closid; } /* - * As of now the RMIDs allocation is global. - * However we keep track of which packages the RMIDs - * are used to optimize the limbo list management. + * For MPAM the RMID value is not unique, and has to be considered with + * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which + * allows all domains to be managed by a single free list. + * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler. */ -int alloc_rmid(void) +int alloc_rmid(u32 closid) { struct rmid_entry *entry; lockdep_assert_held(&rdtgroup_mutex); - if (list_empty(&rmid_free_lru)) - return rmid_limbo_count ? -EBUSY : -ENOSPC; + entry = resctrl_find_free_rmid(closid); + if (IS_ERR(entry)) + return PTR_ERR(entry); - entry = list_first_entry(&rmid_free_lru, - struct rmid_entry, list); list_del(&entry->list); - return entry->rmid; } @@ -335,47 +469,50 @@ static void add_rmid_to_limbo(struct rmid_entry *entry) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; struct rdt_domain *d; - int cpu, err; - u64 val = 0; + u32 idx; + + lockdep_assert_held(&rdtgroup_mutex); + + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid); entry->busy = 0; - cpu = get_cpu(); list_for_each_entry(d, &r->domains, list) { - if (cpumask_test_cpu(cpu, &d->cpu_mask)) { - err = resctrl_arch_rmid_read(r, d, entry->rmid, - QOS_L3_OCCUP_EVENT_ID, - &val); - if (err || val <= resctrl_rmid_realloc_threshold) - continue; - } - /* * For the first limbo RMID in the domain, * setup up the limbo worker. */ - if (!has_busy_rmid(r, d)) - cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL); - set_bit(entry->rmid, d->rmid_busy_llc); + if (!has_busy_rmid(d)) + cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL, + RESCTRL_PICK_ANY_CPU); + set_bit(idx, d->rmid_busy_llc); entry->busy++; } - put_cpu(); - if (entry->busy) - rmid_limbo_count++; - else - list_add_tail(&entry->list, &rmid_free_lru); + rmid_limbo_count++; + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + closid_num_dirty_rmid[entry->closid]++; } -void free_rmid(u32 rmid) +void free_rmid(u32 closid, u32 rmid) { + u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); struct rmid_entry *entry; - if (!rmid) - return; - lockdep_assert_held(&rdtgroup_mutex); - entry = __rmid_entry(rmid); + /* + * Do not allow the default rmid to be free'd. Comparing by index + * allows architectures that ignore the closid parameter to avoid an + * unnecessary check. + */ + if (idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, + RESCTRL_RESERVED_RMID)) + return; + + entry = __rmid_entry(idx); if (is_llc_occupancy_enabled()) add_rmid_to_limbo(entry); @@ -383,33 +520,36 @@ void free_rmid(u32 rmid) list_add_tail(&entry->list, &rmid_free_lru); } -static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 rmid, - enum resctrl_event_id evtid) +static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 closid, + u32 rmid, enum resctrl_event_id evtid) { + u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); + switch (evtid) { case QOS_L3_MBM_TOTAL_EVENT_ID: - return &d->mbm_total[rmid]; + return &d->mbm_total[idx]; case QOS_L3_MBM_LOCAL_EVENT_ID: - return &d->mbm_local[rmid]; + return &d->mbm_local[idx]; default: return NULL; } } -static int __mon_event_count(u32 rmid, struct rmid_read *rr) +static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) { struct mbm_state *m; u64 tval = 0; if (rr->first) { - resctrl_arch_reset_rmid(rr->r, rr->d, rmid, rr->evtid); - m = get_mbm_state(rr->d, rmid, rr->evtid); + resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); + m = get_mbm_state(rr->d, closid, rmid, rr->evtid); if (m) memset(m, 0, sizeof(struct mbm_state)); return 0; } - rr->err = resctrl_arch_rmid_read(rr->r, rr->d, rmid, rr->evtid, &tval); + rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, rr->evtid, + &tval, rr->arch_mon_ctx); if (rr->err) return rr->err; @@ -421,6 +561,7 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) /* * mbm_bw_count() - Update bw count from values previously read by * __mon_event_count(). + * @closid: The closid used to identify the cached mbm_state. * @rmid: The rmid used to identify the cached mbm_state. * @rr: The struct rmid_read populated by __mon_event_count(). * @@ -429,9 +570,10 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) * __mon_event_count() is compared with the chunks value from the previous * invocation. This must be called once per second to maintain values in MBps. */ -static void mbm_bw_count(u32 rmid, struct rmid_read *rr) +static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) { - struct mbm_state *m = &rr->d->mbm_local[rmid]; + u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); + struct mbm_state *m = &rr->d->mbm_local[idx]; u64 cur_bw, bytes, cur_bytes; cur_bytes = rr->val; @@ -440,14 +582,11 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr) cur_bw = bytes / SZ_1M; - if (m->delta_comp) - m->delta_bw = abs(cur_bw - m->prev_bw); - m->delta_comp = false; m->prev_bw = cur_bw; } /* - * This is called via IPI to read the CQM/MBM counters + * This is scheduled by mon_event_read() to read the CQM/MBM counters * on a domain. */ void mon_event_count(void *info) @@ -459,7 +598,7 @@ void mon_event_count(void *info) rdtgrp = rr->rgrp; - ret = __mon_event_count(rdtgrp->mon.rmid, rr); + ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr); /* * For Ctrl groups read data from child monitor groups and @@ -470,7 +609,8 @@ void mon_event_count(void *info) if (rdtgrp->type == RDTCTRL_GROUP) { list_for_each_entry(entry, head, mon.crdtgrp_list) { - if (__mon_event_count(entry->mon.rmid, rr) == 0) + if (__mon_event_count(entry->closid, entry->mon.rmid, + rr) == 0) ret = 0; } } @@ -520,9 +660,9 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) { u32 closid, rmid, cur_msr_val, new_msr_val; struct mbm_state *pmbm_data, *cmbm_data; - u32 cur_bw, delta_bw, user_bw; struct rdt_resource *r_mba; struct rdt_domain *dom_mba; + u32 cur_bw, user_bw, idx; struct list_head *head; struct rdtgroup *entry; @@ -533,7 +673,8 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) closid = rgrp->closid; rmid = rgrp->mon.rmid; - pmbm_data = &dom_mbm->mbm_local[rmid]; + idx = resctrl_arch_rmid_idx_encode(closid, rmid); + pmbm_data = &dom_mbm->mbm_local[idx]; dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba); if (!dom_mba) { @@ -543,7 +684,6 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) cur_bw = pmbm_data->prev_bw; user_bw = dom_mba->mbps_val[closid]; - delta_bw = pmbm_data->delta_bw; /* MBA resource doesn't support CDP */ cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE); @@ -555,52 +695,35 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) list_for_each_entry(entry, head, mon.crdtgrp_list) { cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid]; cur_bw += cmbm_data->prev_bw; - delta_bw += cmbm_data->delta_bw; } /* * Scale up/down the bandwidth linearly for the ctrl group. The * bandwidth step is the bandwidth granularity specified by the * hardware. - * - * The delta_bw is used when increasing the bandwidth so that we - * dont alternately increase and decrease the control values - * continuously. - * - * For ex: consider cur_bw = 90MBps, user_bw = 100MBps and if - * bandwidth step is 20MBps(> user_bw - cur_bw), we would keep - * switching between 90 and 110 continuously if we only check - * cur_bw < user_bw. + * Always increase throttling if current bandwidth is above the + * target set by user. + * But avoid thrashing up and down on every poll by checking + * whether a decrease in throttling is likely to push the group + * back over target. E.g. if currently throttling to 30% of bandwidth + * on a system with 10% granularity steps, check whether moving to + * 40% would go past the limit by multiplying current bandwidth by + * "(30 + 10) / 30". */ if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) { new_msr_val = cur_msr_val - r_mba->membw.bw_gran; } else if (cur_msr_val < MAX_MBA_BW && - (user_bw > (cur_bw + delta_bw))) { + (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) { new_msr_val = cur_msr_val + r_mba->membw.bw_gran; } else { return; } resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); - - /* - * Delta values are updated dynamically package wise for each - * rdtgrp every time the throttle MSR changes value. - * - * This is because (1)the increase in bandwidth is not perfectly - * linear and only "approximately" linear even when the hardware - * says it is linear.(2)Also since MBA is a core specific - * mechanism, the delta values vary based on number of cores used - * by the rdtgrp. - */ - pmbm_data->delta_comp = true; - list_for_each_entry(entry, head, mon.crdtgrp_list) { - cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid]; - cmbm_data->delta_comp = true; - } } -static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) +static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, + u32 closid, u32 rmid) { struct rmid_read rr; @@ -615,12 +738,28 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) if (is_mbm_total_enabled()) { rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; rr.val = 0; - __mon_event_count(rmid, &rr); + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); + if (IS_ERR(rr.arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(rr.arch_mon_ctx)); + return; + } + + __mon_event_count(closid, rmid, &rr); + + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); } if (is_mbm_local_enabled()) { rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; rr.val = 0; - __mon_event_count(rmid, &rr); + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); + if (IS_ERR(rr.arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(rr.arch_mon_ctx)); + return; + } + + __mon_event_count(closid, rmid, &rr); /* * Call the MBA software controller only for the @@ -628,7 +767,9 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) * the software controller explicitly. */ if (is_mba_sc(NULL)) - mbm_bw_count(rmid, &rr); + mbm_bw_count(closid, rmid, &rr); + + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); } } @@ -639,106 +780,193 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) void cqm_handle_limbo(struct work_struct *work) { unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); - int cpu = smp_processor_id(); - struct rdt_resource *r; struct rdt_domain *d; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); - r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; d = container_of(work, struct rdt_domain, cqm_limbo.work); __check_limbo(d, false); - if (has_busy_rmid(r, d)) - schedule_delayed_work_on(cpu, &d->cqm_limbo, delay); + if (has_busy_rmid(d)) { + d->cqm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask, + RESCTRL_PICK_ANY_CPU); + schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo, + delay); + } mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); } -void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms) +/** + * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this + * domain. + * @dom: The domain the limbo handler should run for. + * @delay_ms: How far in the future the handler should run. + * @exclude_cpu: Which CPU the handler should not run on, + * RESCTRL_PICK_ANY_CPU to pick any CPU. + */ +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms, + int exclude_cpu) { unsigned long delay = msecs_to_jiffies(delay_ms); int cpu; - cpu = cpumask_any(&dom->cpu_mask); + cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu); dom->cqm_work_cpu = cpu; - schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); + if (cpu < nr_cpu_ids) + schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); } void mbm_handle_overflow(struct work_struct *work) { unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); struct rdtgroup *prgrp, *crgrp; - int cpu = smp_processor_id(); struct list_head *head; struct rdt_resource *r; struct rdt_domain *d; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); - if (!static_branch_likely(&rdt_mon_enable_key)) + /* + * If the filesystem has been unmounted this work no longer needs to + * run. + */ + if (!resctrl_mounted || !resctrl_arch_mon_capable()) goto out_unlock; r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; d = container_of(work, struct rdt_domain, mbm_over.work); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - mbm_update(r, d, prgrp->mon.rmid); + mbm_update(r, d, prgrp->closid, prgrp->mon.rmid); head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) - mbm_update(r, d, crgrp->mon.rmid); + mbm_update(r, d, crgrp->closid, crgrp->mon.rmid); if (is_mba_sc(NULL)) update_mba_bw(prgrp, d); } - schedule_delayed_work_on(cpu, &d->mbm_over, delay); + /* + * Re-check for housekeeping CPUs. This allows the overflow handler to + * move off a nohz_full CPU quickly. + */ + d->mbm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask, + RESCTRL_PICK_ANY_CPU); + schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay); out_unlock: mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); } -void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms) +/** + * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this + * domain. + * @dom: The domain the overflow handler should run for. + * @delay_ms: How far in the future the handler should run. + * @exclude_cpu: Which CPU the handler should not run on, + * RESCTRL_PICK_ANY_CPU to pick any CPU. + */ +void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms, + int exclude_cpu) { unsigned long delay = msecs_to_jiffies(delay_ms); int cpu; - if (!static_branch_likely(&rdt_mon_enable_key)) + /* + * When a domain comes online there is no guarantee the filesystem is + * mounted. If not, there is no need to catch counter overflow. + */ + if (!resctrl_mounted || !resctrl_arch_mon_capable()) return; - cpu = cpumask_any(&dom->cpu_mask); + cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu); dom->mbm_work_cpu = cpu; - schedule_delayed_work_on(cpu, &dom->mbm_over, delay); + + if (cpu < nr_cpu_ids) + schedule_delayed_work_on(cpu, &dom->mbm_over, delay); } static int dom_data_init(struct rdt_resource *r) { + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); + u32 num_closid = resctrl_arch_get_num_closid(r); struct rmid_entry *entry = NULL; - int i, nr_rmids; + int err = 0, i; + u32 idx; + + mutex_lock(&rdtgroup_mutex); + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + u32 *tmp; + + /* + * If the architecture hasn't provided a sanitised value here, + * this may result in larger arrays than necessary. Resctrl will + * use a smaller system wide value based on the resources in + * use. + */ + tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL); + if (!tmp) { + err = -ENOMEM; + goto out_unlock; + } - nr_rmids = r->num_rmid; - rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL); - if (!rmid_ptrs) - return -ENOMEM; + closid_num_dirty_rmid = tmp; + } + + rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL); + if (!rmid_ptrs) { + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + kfree(closid_num_dirty_rmid); + closid_num_dirty_rmid = NULL; + } + err = -ENOMEM; + goto out_unlock; + } - for (i = 0; i < nr_rmids; i++) { + for (i = 0; i < idx_limit; i++) { entry = &rmid_ptrs[i]; INIT_LIST_HEAD(&entry->list); - entry->rmid = i; + resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid); list_add_tail(&entry->list, &rmid_free_lru); } /* - * RMID 0 is special and is always allocated. It's used for all - * tasks that are not monitored. + * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and + * are always allocated. These are used for the rdtgroup_default + * control group, which will be setup later in rdtgroup_init(). */ - entry = __rmid_entry(0); + idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, + RESCTRL_RESERVED_RMID); + entry = __rmid_entry(idx); list_del(&entry->list); - return 0; +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return err; +} + +static void __exit dom_data_exit(void) +{ + mutex_lock(&rdtgroup_mutex); + + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + kfree(closid_num_dirty_rmid); + closid_num_dirty_rmid = NULL; + } + + kfree(rmid_ptrs); + rmid_ptrs = NULL; + + mutex_unlock(&rdtgroup_mutex); } static struct mon_evt llc_occupancy_event = { @@ -813,6 +1041,12 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) return ret; if (rdt_cpu_has(X86_FEATURE_BMEC)) { + u32 eax, ebx, ecx, edx; + + /* Detect list of bandwidth sources that can be tracked */ + cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx); + hw_res->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { mbm_total_event.configurable = true; mbm_config_rftype_init("mbm_total_bytes_config"); @@ -830,6 +1064,11 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) return 0; } +void __exit rdt_put_mon_l3_config(void) +{ + dom_data_exit(); +} + void __init intel_rdt_mbm_apply_quirk(void) { int cf_index; diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 8f559eeae08e..884b88e25141 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -581,7 +581,7 @@ static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp) if (ret) goto err_cpus; - if (rdt_mon_capable) { + if (resctrl_arch_mon_capable()) { ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups"); if (ret) goto err_cpus_list; @@ -628,7 +628,7 @@ static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp) if (ret) goto err_cpus; - if (rdt_mon_capable) { + if (resctrl_arch_mon_capable()) { ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777); if (ret) goto err_cpus_list; @@ -752,7 +752,7 @@ int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) * anymore when this group would be used for pseudo-locking. This * is safe to call on platforms not capable of monitoring. */ - free_rmid(rdtgrp->mon.rmid); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); ret = 0; goto out; @@ -776,8 +776,8 @@ int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) { int ret; - if (rdt_mon_capable) { - ret = alloc_rmid(); + if (resctrl_arch_mon_capable()) { + ret = alloc_rmid(rdtgrp->closid); if (ret < 0) { rdt_last_cmd_puts("Out of RMIDs\n"); return ret; @@ -787,7 +787,7 @@ int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) ret = rdtgroup_locksetup_user_restore(rdtgrp); if (ret) { - free_rmid(rdtgrp->mon.rmid); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); return ret; } @@ -844,6 +844,9 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) struct rdt_domain *d_i; bool ret = false; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL)) return true; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 69a1de92384a..011e17efb1a6 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -35,6 +35,10 @@ DEFINE_STATIC_KEY_FALSE(rdt_enable_key); DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); + +/* Mutex to protect rdtgroup access. */ +DEFINE_MUTEX(rdtgroup_mutex); + static struct kernfs_root *rdt_root; struct rdtgroup rdtgroup_default; LIST_HEAD(rdt_all_groups); @@ -42,6 +46,9 @@ LIST_HEAD(rdt_all_groups); /* list of entries for the schemata file */ LIST_HEAD(resctrl_schema_all); +/* The filesystem can only be mounted once. */ +bool resctrl_mounted; + /* Kernel fs node for "info" directory under root */ static struct kernfs_node *kn_info; @@ -102,7 +109,7 @@ void rdt_staged_configs_clear(void) * * Using a global CLOSID across all resources has some advantages and * some drawbacks: - * + We can simply set "current->closid" to assign a task to a resource + * + We can simply set current's closid to assign a task to a resource * group. * + Context switch code can avoid extra memory references deciding which * CLOSID to load into the PQR_ASSOC MSR @@ -111,7 +118,7 @@ void rdt_staged_configs_clear(void) * - Our choices on how to configure each resource become progressively more * limited as the number of resources grows. */ -static int closid_free_map; +static unsigned long closid_free_map; static int closid_free_map_len; int closids_supported(void) @@ -130,26 +137,39 @@ static void closid_init(void) closid_free_map = BIT_MASK(rdt_min_closid) - 1; - /* CLOSID 0 is always reserved for the default group */ - closid_free_map &= ~1; + /* RESCTRL_RESERVED_CLOSID is always reserved for the default group */ + __clear_bit(RESCTRL_RESERVED_CLOSID, &closid_free_map); closid_free_map_len = rdt_min_closid; } static int closid_alloc(void) { - u32 closid = ffs(closid_free_map); + int cleanest_closid; + u32 closid; - if (closid == 0) - return -ENOSPC; - closid--; - closid_free_map &= ~(1 << closid); + lockdep_assert_held(&rdtgroup_mutex); + + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + cleanest_closid = resctrl_find_cleanest_closid(); + if (cleanest_closid < 0) + return cleanest_closid; + closid = cleanest_closid; + } else { + closid = ffs(closid_free_map); + if (closid == 0) + return -ENOSPC; + closid--; + } + __clear_bit(closid, &closid_free_map); return closid; } void closid_free(int closid) { - closid_free_map |= 1 << closid; + lockdep_assert_held(&rdtgroup_mutex); + + __set_bit(closid, &closid_free_map); } /** @@ -159,9 +179,11 @@ void closid_free(int closid) * Return: true if @closid is currently associated with a resource group, * false if @closid is free */ -static bool closid_allocated(unsigned int closid) +bool closid_allocated(unsigned int closid) { - return (closid_free_map & (1 << closid)) == 0; + lockdep_assert_held(&rdtgroup_mutex); + + return !test_bit(closid, &closid_free_map); } /** @@ -559,14 +581,26 @@ static void update_task_closid_rmid(struct task_struct *t) _update_task_closid_rmid(t); } +static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp) +{ + u32 closid, rmid = rdtgrp->mon.rmid; + + if (rdtgrp->type == RDTCTRL_GROUP) + closid = rdtgrp->closid; + else if (rdtgrp->type == RDTMON_GROUP) + closid = rdtgrp->mon.parent->closid; + else + return false; + + return resctrl_arch_match_closid(tsk, closid) && + resctrl_arch_match_rmid(tsk, closid, rmid); +} + static int __rdtgroup_move_task(struct task_struct *tsk, struct rdtgroup *rdtgrp) { /* If the task is already in rdtgrp, no need to move the task. */ - if ((rdtgrp->type == RDTCTRL_GROUP && tsk->closid == rdtgrp->closid && - tsk->rmid == rdtgrp->mon.rmid) || - (rdtgrp->type == RDTMON_GROUP && tsk->rmid == rdtgrp->mon.rmid && - tsk->closid == rdtgrp->mon.parent->closid)) + if (task_in_rdtgroup(tsk, rdtgrp)) return 0; /* @@ -577,19 +611,19 @@ static int __rdtgroup_move_task(struct task_struct *tsk, * For monitor groups, can move the tasks only from * their parent CTRL group. */ - - if (rdtgrp->type == RDTCTRL_GROUP) { - WRITE_ONCE(tsk->closid, rdtgrp->closid); - WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); - } else if (rdtgrp->type == RDTMON_GROUP) { - if (rdtgrp->mon.parent->closid == tsk->closid) { - WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); - } else { - rdt_last_cmd_puts("Can't move task to different control group\n"); - return -EINVAL; - } + if (rdtgrp->type == RDTMON_GROUP && + !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) { + rdt_last_cmd_puts("Can't move task to different control group\n"); + return -EINVAL; } + if (rdtgrp->type == RDTMON_GROUP) + resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid, + rdtgrp->mon.rmid); + else + resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid, + rdtgrp->mon.rmid); + /* * Ensure the task's closid and rmid are written before determining if * the task is current that will decide if it will be interrupted. @@ -611,14 +645,15 @@ static int __rdtgroup_move_task(struct task_struct *tsk, static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) { - return (rdt_alloc_capable && - (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); + return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) && + resctrl_arch_match_closid(t, r->closid)); } static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) { - return (rdt_mon_capable && - (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); + return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) && + resctrl_arch_match_rmid(t, r->mon.parent->closid, + r->mon.rmid)); } /** @@ -853,7 +888,7 @@ int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, mutex_lock(&rdtgroup_mutex); /* Return empty if resctrl has not been mounted. */ - if (!static_branch_unlikely(&rdt_enable_key)) { + if (!resctrl_mounted) { seq_puts(s, "res:\nmon:\n"); goto unlock; } @@ -869,7 +904,7 @@ int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, rdtg->mode != RDT_MODE_EXCLUSIVE) continue; - if (rdtg->closid != tsk->closid) + if (!resctrl_arch_match_closid(tsk, rdtg->closid)) continue; seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "", @@ -877,7 +912,8 @@ int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, seq_puts(s, "mon:"); list_for_each_entry(crg, &rdtg->mon.crdtgrp_list, mon.crdtgrp_list) { - if (tsk->rmid != crg->mon.rmid) + if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid, + crg->mon.rmid)) continue; seq_printf(s, "%s", crg->kn->name); break; @@ -982,6 +1018,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, bool sep = false; u32 ctrl_val; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); hw_shareable = r->cache.shareable_bits; list_for_each_entry(dom, &r->domains, list) { @@ -1042,6 +1079,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, } seq_putc(seq, '\n'); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); return 0; } @@ -1297,6 +1335,9 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) struct rdt_domain *d; u32 ctrl; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA) @@ -1561,6 +1602,7 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid struct rdt_domain *dom; bool sep = false; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); list_for_each_entry(dom, &r->domains, list) { @@ -1577,6 +1619,7 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid seq_puts(s, "\n"); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); return 0; } @@ -1614,17 +1657,10 @@ static void mon_event_config_write(void *info) wrmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, 0); } -static int mbm_config_write_domain(struct rdt_resource *r, - struct rdt_domain *d, u32 evtid, u32 val) +static void mbm_config_write_domain(struct rdt_resource *r, + struct rdt_domain *d, u32 evtid, u32 val) { struct mon_config_info mon_info = {0}; - int ret = 0; - - /* mon_config cannot be more than the supported set of events */ - if (val > MAX_EVT_CONFIG_BITS) { - rdt_last_cmd_puts("Invalid event configuration\n"); - return -EINVAL; - } /* * Read the current config value first. If both are the same then @@ -1633,7 +1669,7 @@ static int mbm_config_write_domain(struct rdt_resource *r, mon_info.evtid = evtid; mondata_config_read(d, &mon_info); if (mon_info.mon_config == val) - goto out; + return; mon_info.mon_config = val; @@ -1656,17 +1692,17 @@ static int mbm_config_write_domain(struct rdt_resource *r, * mbm_local and mbm_total counts for all the RMIDs. */ resctrl_arch_reset_rmid_all(r, d); - -out: - return ret; } static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) { + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); char *dom_str = NULL, *id_str; unsigned long dom_id, val; struct rdt_domain *d; - int ret = 0; + + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); next: if (!tok || tok[0] == '\0') @@ -1686,11 +1722,16 @@ next: return -EINVAL; } + /* Value from user cannot be more than the supported set of events */ + if ((val & hw_res->mbm_cfg_mask) != val) { + rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n", + hw_res->mbm_cfg_mask); + return -EINVAL; + } + list_for_each_entry(d, &r->domains, list) { if (d->id == dom_id) { - ret = mbm_config_write_domain(r, d, evtid, val); - if (ret) - return -EINVAL; + mbm_config_write_domain(r, d, evtid, val); goto next; } } @@ -1709,6 +1750,7 @@ static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, if (nbytes == 0 || buf[nbytes - 1] != '\n') return -EINVAL; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); rdt_last_cmd_clear(); @@ -1718,6 +1760,7 @@ static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); return ret ?: nbytes; } @@ -1733,6 +1776,7 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, if (nbytes == 0 || buf[nbytes - 1] != '\n') return -EINVAL; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); rdt_last_cmd_clear(); @@ -1742,6 +1786,7 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); return ret ?: nbytes; } @@ -2218,6 +2263,9 @@ static int set_cache_qos_cfg(int level, bool enable) struct rdt_domain *d; int cpu; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + if (level == RDT_RESOURCE_L3) update = l3_qos_cfg_update; else if (level == RDT_RESOURCE_L2) @@ -2417,6 +2465,7 @@ struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) rdtgroup_kn_get(rdtgrp, kn); + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); /* Was this group deleted while we waited? */ @@ -2434,6 +2483,8 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) return; mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + rdtgroup_kn_put(rdtgrp, kn); } @@ -2584,7 +2635,7 @@ static int rdt_get_tree(struct fs_context *fc) /* * resctrl file system can only be mounted once. */ - if (static_branch_unlikely(&rdt_enable_key)) { + if (resctrl_mounted) { ret = -EBUSY; goto out; } @@ -2605,7 +2656,7 @@ static int rdt_get_tree(struct fs_context *fc) closid_init(); - if (rdt_mon_capable) + if (resctrl_arch_mon_capable()) flags |= RFTYPE_MON; ret = rdtgroup_add_files(rdtgroup_default.kn, flags); @@ -2618,7 +2669,7 @@ static int rdt_get_tree(struct fs_context *fc) if (ret < 0) goto out_schemata_free; - if (rdt_mon_capable) { + if (resctrl_arch_mon_capable()) { ret = mongroup_create_dir(rdtgroup_default.kn, &rdtgroup_default, "mon_groups", &kn_mongrp); @@ -2640,18 +2691,19 @@ static int rdt_get_tree(struct fs_context *fc) if (ret < 0) goto out_psl; - if (rdt_alloc_capable) - static_branch_enable_cpuslocked(&rdt_alloc_enable_key); - if (rdt_mon_capable) - static_branch_enable_cpuslocked(&rdt_mon_enable_key); + if (resctrl_arch_alloc_capable()) + resctrl_arch_enable_alloc(); + if (resctrl_arch_mon_capable()) + resctrl_arch_enable_mon(); - if (rdt_alloc_capable || rdt_mon_capable) - static_branch_enable_cpuslocked(&rdt_enable_key); + if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable()) + resctrl_mounted = true; if (is_mbm_enabled()) { r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; list_for_each_entry(dom, &r->domains, list) - mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL); + mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL, + RESCTRL_PICK_ANY_CPU); } goto out; @@ -2659,10 +2711,10 @@ static int rdt_get_tree(struct fs_context *fc) out_psl: rdt_pseudo_lock_release(); out_mondata: - if (rdt_mon_capable) + if (resctrl_arch_mon_capable()) kernfs_remove(kn_mondata); out_mongrp: - if (rdt_mon_capable) + if (resctrl_arch_mon_capable()) kernfs_remove(kn_mongrp); out_info: kernfs_remove(kn_info); @@ -2765,6 +2817,9 @@ static int reset_all_ctrls(struct rdt_resource *r) struct rdt_domain *d; int i; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) return -ENOMEM; @@ -2810,8 +2865,8 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, for_each_process_thread(p, t) { if (!from || is_closid_match(t, from) || is_rmid_match(t, from)) { - WRITE_ONCE(t->closid, to->closid); - WRITE_ONCE(t->rmid, to->mon.rmid); + resctrl_arch_set_closid_rmid(t, to->closid, + to->mon.rmid); /* * Order the closid/rmid stores above before the loads @@ -2842,7 +2897,7 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) head = &rdtgrp->mon.crdtgrp_list; list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { - free_rmid(sentry->mon.rmid); + free_rmid(sentry->closid, sentry->mon.rmid); list_del(&sentry->mon.crdtgrp_list); if (atomic_read(&sentry->waitcount) != 0) @@ -2882,7 +2937,7 @@ static void rmdir_all_sub(void) cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); - free_rmid(rdtgrp->mon.rmid); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); kernfs_remove(rdtgrp->kn); list_del(&rdtgrp->rdtgroup_list); @@ -2917,9 +2972,11 @@ static void rdt_kill_sb(struct super_block *sb) rdtgroup_default.mode = RDT_MODE_SHAREABLE; schemata_list_destroy(); rdtgroup_destroy_root(); - static_branch_disable_cpuslocked(&rdt_alloc_enable_key); - static_branch_disable_cpuslocked(&rdt_mon_enable_key); - static_branch_disable_cpuslocked(&rdt_enable_key); + if (resctrl_arch_alloc_capable()) + resctrl_arch_disable_alloc(); + if (resctrl_arch_mon_capable()) + resctrl_arch_disable_mon(); + resctrl_mounted = false; kernfs_kill_sb(sb); mutex_unlock(&rdtgroup_mutex); cpus_read_unlock(); @@ -3047,6 +3104,9 @@ static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, struct rdt_domain *dom; int ret; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + list_for_each_entry(dom, &r->domains, list) { ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); if (ret) @@ -3293,6 +3353,36 @@ out: return ret; } +static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) +{ + int ret; + + if (!resctrl_arch_mon_capable()) + return 0; + + ret = alloc_rmid(rdtgrp->closid); + if (ret < 0) { + rdt_last_cmd_puts("Out of RMIDs\n"); + return ret; + } + rdtgrp->mon.rmid = ret; + + ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn); + if (ret) { + rdt_last_cmd_puts("kernfs subdir error\n"); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); + return ret; + } + + return 0; +} + +static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp) +{ + if (resctrl_arch_mon_capable()) + free_rmid(rgrp->closid, rgrp->mon.rmid); +} + static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, const char *name, umode_t mode, enum rdt_group_type rtype, struct rdtgroup **r) @@ -3353,7 +3443,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, if (rtype == RDTCTRL_GROUP) { files = RFTYPE_BASE | RFTYPE_CTRL; - if (rdt_mon_capable) + if (resctrl_arch_mon_capable()) files |= RFTYPE_MON; } else { files = RFTYPE_BASE | RFTYPE_MON; @@ -3365,29 +3455,11 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, goto out_destroy; } - if (rdt_mon_capable) { - ret = alloc_rmid(); - if (ret < 0) { - rdt_last_cmd_puts("Out of RMIDs\n"); - goto out_destroy; - } - rdtgrp->mon.rmid = ret; - - ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn); - if (ret) { - rdt_last_cmd_puts("kernfs subdir error\n"); - goto out_idfree; - } - } - kernfs_activate(kn); - /* * The caller unlocks the parent_kn upon success. */ return 0; -out_idfree: - free_rmid(rdtgrp->mon.rmid); out_destroy: kernfs_put(rdtgrp->kn); kernfs_remove(rdtgrp->kn); @@ -3401,7 +3473,6 @@ out_unlock: static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) { kernfs_remove(rgrp->kn); - free_rmid(rgrp->mon.rmid); rdtgroup_remove(rgrp); } @@ -3423,12 +3494,21 @@ static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, prgrp = rdtgrp->mon.parent; rdtgrp->closid = prgrp->closid; + ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); + if (ret) { + mkdir_rdt_prepare_clean(rdtgrp); + goto out_unlock; + } + + kernfs_activate(rdtgrp->kn); + /* * Add the rdtgrp to the list of rdtgrps the parent * ctrl_mon group has to track. */ list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); +out_unlock: rdtgroup_kn_unlock(parent_kn); return ret; } @@ -3459,13 +3539,20 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, ret = 0; rdtgrp->closid = closid; + + ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); + if (ret) + goto out_closid_free; + + kernfs_activate(rdtgrp->kn); + ret = rdtgroup_init_alloc(rdtgrp); if (ret < 0) - goto out_id_free; + goto out_rmid_free; list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); - if (rdt_mon_capable) { + if (resctrl_arch_mon_capable()) { /* * Create an empty mon_groups directory to hold the subset * of tasks and cpus to monitor. @@ -3481,7 +3568,9 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, out_del_list: list_del(&rdtgrp->rdtgroup_list); -out_id_free: +out_rmid_free: + mkdir_rdt_prepare_rmid_free(rdtgrp); +out_closid_free: closid_free(closid); out_common_fail: mkdir_rdt_prepare_clean(rdtgrp); @@ -3518,14 +3607,14 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, * allocation is supported, add a control and monitoring * subdirectory */ - if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn) + if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn) return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode); /* * If RDT monitoring is supported and the parent directory is a valid * "mon_groups" directory, add a monitoring subdirectory. */ - if (rdt_mon_capable && is_mon_groups(parent_kn, name)) + if (resctrl_arch_mon_capable() && is_mon_groups(parent_kn, name)) return rdtgroup_mkdir_mon(parent_kn, name, mode); return -EPERM; @@ -3550,7 +3639,7 @@ static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) update_closid_rmid(tmpmask, NULL); rdtgrp->flags = RDT_DELETED; - free_rmid(rdtgrp->mon.rmid); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); /* * Remove the rdtgrp from the parent ctrl_mon group's list @@ -3596,8 +3685,8 @@ static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); update_closid_rmid(tmpmask, NULL); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); closid_free(rdtgrp->closid); - free_rmid(rdtgrp->mon.rmid); rdtgroup_ctrl_remove(rdtgrp); @@ -3829,8 +3918,8 @@ static void __init rdtgroup_setup_default(void) { mutex_lock(&rdtgroup_mutex); - rdtgroup_default.closid = 0; - rdtgroup_default.mon.rmid = 0; + rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID; + rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID; rdtgroup_default.type = RDTCTRL_GROUP; INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); @@ -3848,24 +3937,24 @@ static void domain_destroy_mon_state(struct rdt_domain *d) void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d) { - lockdep_assert_held(&rdtgroup_mutex); + mutex_lock(&rdtgroup_mutex); if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) mba_sc_domain_destroy(r, d); if (!r->mon_capable) - return; + goto out_unlock; /* * If resctrl is mounted, remove all the * per domain monitor data directories. */ - if (static_branch_unlikely(&rdt_mon_enable_key)) + if (resctrl_mounted && resctrl_arch_mon_capable()) rmdir_mondata_subdir_allrdtgrp(r, d->id); if (is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); - if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) { + if (is_llc_occupancy_enabled() && has_busy_rmid(d)) { /* * When a package is going down, forcefully * decrement rmid->ebusy. There is no way to know @@ -3879,20 +3968,24 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d) } domain_destroy_mon_state(d); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); } static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) { + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); size_t tsize; if (is_llc_occupancy_enabled()) { - d->rmid_busy_llc = bitmap_zalloc(r->num_rmid, GFP_KERNEL); + d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); if (!d->rmid_busy_llc) return -ENOMEM; } if (is_mbm_total_enabled()) { tsize = sizeof(*d->mbm_total); - d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); if (!d->mbm_total) { bitmap_free(d->rmid_busy_llc); return -ENOMEM; @@ -3900,7 +3993,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) } if (is_mbm_local_enabled()) { tsize = sizeof(*d->mbm_local); - d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); if (!d->mbm_local) { bitmap_free(d->rmid_busy_llc); kfree(d->mbm_total); @@ -3913,34 +4006,97 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d) { - int err; + int err = 0; - lockdep_assert_held(&rdtgroup_mutex); + mutex_lock(&rdtgroup_mutex); - if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) + if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) { /* RDT_RESOURCE_MBA is never mon_capable */ - return mba_sc_domain_allocate(r, d); + err = mba_sc_domain_allocate(r, d); + goto out_unlock; + } if (!r->mon_capable) - return 0; + goto out_unlock; err = domain_setup_mon_state(r, d); if (err) - return err; + goto out_unlock; if (is_mbm_enabled()) { INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); - mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL); + mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL, + RESCTRL_PICK_ANY_CPU); } if (is_llc_occupancy_enabled()) INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); - /* If resctrl is mounted, add per domain monitor data directories. */ - if (static_branch_unlikely(&rdt_mon_enable_key)) + /* + * If the filesystem is not mounted then only the default resource group + * exists. Creation of its directories is deferred until mount time + * by rdt_get_tree() calling mkdir_mondata_all(). + * If resctrl is mounted, add per domain monitor data directories. + */ + if (resctrl_mounted && resctrl_arch_mon_capable()) mkdir_mondata_subdir_allrdtgrp(r, d); - return 0; +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return err; +} + +void resctrl_online_cpu(unsigned int cpu) +{ + mutex_lock(&rdtgroup_mutex); + /* The CPU is set in default rdtgroup after online. */ + cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); + mutex_unlock(&rdtgroup_mutex); +} + +static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) +{ + struct rdtgroup *cr; + + list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { + if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) + break; + } +} + +void resctrl_offline_cpu(unsigned int cpu) +{ + struct rdt_resource *l3 = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + struct rdtgroup *rdtgrp; + struct rdt_domain *d; + + mutex_lock(&rdtgroup_mutex); + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { + if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { + clear_childcpus(rdtgrp, cpu); + break; + } + } + + if (!l3->mon_capable) + goto out_unlock; + + d = get_domain_from_cpu(cpu, l3); + if (d) { + if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { + cancel_delayed_work(&d->mbm_over); + mbm_setup_overflow_handler(d, 0, cpu); + } + if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu && + has_busy_rmid(d)) { + cancel_delayed_work(&d->cqm_limbo); + cqm_setup_limbo_handler(d, 0, cpu); + } + } + +out_unlock: + mutex_unlock(&rdtgroup_mutex); } /* diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index dc136703566f..3259b1d4fefe 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -1,167 +1,510 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: GPL-2.0-only /* - * Check for extended topology enumeration cpuid leaf 0xb and if it - * exists, use it for populating initial_apicid and cpu topology - * detection. + * CPU/APIC topology + * + * The APIC IDs describe the system topology in multiple domain levels. + * The CPUID topology parser provides the information which part of the + * APIC ID is associated to the individual levels: + * + * [PACKAGE][DIEGRP][DIE][TILE][MODULE][CORE][THREAD] + * + * The root space contains the package (socket) IDs. + * + * Not enumerated levels consume 0 bits space, but conceptually they are + * always represented. If e.g. only CORE and THREAD levels are enumerated + * then the DIE, MODULE and TILE have the same physical ID as the PACKAGE. + * + * If SMT is not supported, then the THREAD domain is still used. It then + * has the same physical ID as the CORE domain and is the only child of + * the core domain. + * + * This allows a unified view on the system independent of the enumerated + * domain levels without requiring any conditionals in the code. */ - +#define pr_fmt(fmt) "CPU topo: " fmt #include <linux/cpu.h> + +#include <xen/xen.h> + #include <asm/apic.h> -#include <asm/memtype.h> -#include <asm/processor.h> +#include <asm/hypervisor.h> +#include <asm/io_apic.h> +#include <asm/mpspec.h> +#include <asm/smp.h> #include "cpu.h" -/* leaf 0xb SMT level */ -#define SMT_LEVEL 0 +/* + * Map cpu index to physical APIC ID + */ +DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid, BAD_APICID); +DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, CPU_ACPIID_INVALID); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid); -/* extended topology sub-leaf types */ -#define INVALID_TYPE 0 -#define SMT_TYPE 1 -#define CORE_TYPE 2 -#define DIE_TYPE 5 +/* Bitmap of physically present CPUs. */ +DECLARE_BITMAP(phys_cpu_present_map, MAX_LOCAL_APIC) __read_mostly; -#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) -#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) -#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) +/* Used for CPU number allocation and parallel CPU bringup */ +u32 cpuid_to_apicid[] __ro_after_init = { [0 ... NR_CPUS - 1] = BAD_APICID, }; + +/* Bitmaps to mark registered APICs at each topology domain */ +static struct { DECLARE_BITMAP(map, MAX_LOCAL_APIC); } apic_maps[TOPO_MAX_DOMAIN] __ro_after_init; + +/* + * Keep track of assigned, disabled and rejected CPUs. Present assigned + * with 1 as CPU #0 is reserved for the boot CPU. + */ +static struct { + unsigned int nr_assigned_cpus; + unsigned int nr_disabled_cpus; + unsigned int nr_rejected_cpus; + u32 boot_cpu_apic_id; + u32 real_bsp_apic_id; +} topo_info __ro_after_init = { + .nr_assigned_cpus = 1, + .boot_cpu_apic_id = BAD_APICID, + .real_bsp_apic_id = BAD_APICID, +}; -unsigned int __max_die_per_package __read_mostly = 1; -EXPORT_SYMBOL(__max_die_per_package); +#define domain_weight(_dom) bitmap_weight(apic_maps[_dom].map, MAX_LOCAL_APIC) + +bool arch_match_cpu_phys_id(int cpu, u64 phys_id) +{ + return phys_id == (u64)cpuid_to_apicid[cpu]; +} #ifdef CONFIG_SMP +static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) +{ + if (!(apicid & (__max_threads_per_core - 1))) + cpumask_set_cpu(cpu, &__cpu_primary_thread_mask); +} +#else +static inline void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { } +#endif + /* - * Check if given CPUID extended topology "leaf" is implemented + * Convert the APIC ID to a domain level ID by masking out the low bits + * below the domain level @dom. */ -static int check_extended_topology_leaf(int leaf) +static inline u32 topo_apicid(u32 apicid, enum x86_topology_domains dom) +{ + if (dom == TOPO_SMT_DOMAIN) + return apicid; + return apicid & (UINT_MAX << x86_topo_system.dom_shifts[dom - 1]); +} + +static int topo_lookup_cpuid(u32 apic_id) { - unsigned int eax, ebx, ecx, edx; + int i; - cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx); + /* CPU# to APICID mapping is persistent once it is established */ + for (i = 0; i < topo_info.nr_assigned_cpus; i++) { + if (cpuid_to_apicid[i] == apic_id) + return i; + } + return -ENODEV; +} - if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) - return -1; +static __init int topo_get_cpunr(u32 apic_id) +{ + int cpu = topo_lookup_cpuid(apic_id); - return 0; + if (cpu >= 0) + return cpu; + + return topo_info.nr_assigned_cpus++; } -/* - * Return best CPUID Extended Topology Leaf supported + +static void topo_set_cpuids(unsigned int cpu, u32 apic_id, u32 acpi_id) +{ +#if defined(CONFIG_SMP) || defined(CONFIG_X86_64) + early_per_cpu(x86_cpu_to_apicid, cpu) = apic_id; + early_per_cpu(x86_cpu_to_acpiid, cpu) = acpi_id; +#endif + set_cpu_possible(cpu, true); + set_cpu_present(cpu, true); +} + +static __init bool check_for_real_bsp(u32 apic_id) +{ + /* + * There is no real good way to detect whether this a kdump() + * kernel, but except on the Voyager SMP monstrosity which is not + * longer supported, the real BSP APIC ID is the first one which is + * enumerated by firmware. That allows to detect whether the boot + * CPU is the real BSP. If it is not, then do not register the APIC + * because sending INIT to the real BSP would reset the whole + * system. + * + * The first APIC ID which is enumerated by firmware is detectable + * because the boot CPU APIC ID is registered before that without + * invoking this code. + */ + if (topo_info.real_bsp_apic_id != BAD_APICID) + return false; + + if (apic_id == topo_info.boot_cpu_apic_id) { + topo_info.real_bsp_apic_id = apic_id; + return false; + } + + pr_warn("Boot CPU APIC ID not the first enumerated APIC ID: %x > %x\n", + topo_info.boot_cpu_apic_id, apic_id); + pr_warn("Crash kernel detected. Disabling real BSP to prevent machine INIT\n"); + + topo_info.real_bsp_apic_id = apic_id; + return true; +} + +static unsigned int topo_unit_count(u32 lvlid, enum x86_topology_domains at_level, + unsigned long *map) +{ + unsigned int id, end, cnt = 0; + + /* Calculate the exclusive end */ + end = lvlid + (1U << x86_topo_system.dom_shifts[at_level]); + + /* Unfortunately there is no bitmap_weight_range() */ + for (id = find_next_bit(map, end, lvlid); id < end; id = find_next_bit(map, end, ++id)) + cnt++; + return cnt; +} + +static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present) +{ + int cpu, dom; + + if (present) { + set_bit(apic_id, phys_cpu_present_map); + + /* + * Double registration is valid in case of the boot CPU + * APIC because that is registered before the enumeration + * of the APICs via firmware parsers or VM guest + * mechanisms. + */ + if (apic_id == topo_info.boot_cpu_apic_id) + cpu = 0; + else + cpu = topo_get_cpunr(apic_id); + + cpuid_to_apicid[cpu] = apic_id; + topo_set_cpuids(cpu, apic_id, acpi_id); + } else { + u32 pkgid = topo_apicid(apic_id, TOPO_PKG_DOMAIN); + + /* + * Check for present APICs in the same package when running + * on bare metal. Allow the bogosity in a guest. + */ + if (hypervisor_is_type(X86_HYPER_NATIVE) && + topo_unit_count(pkgid, TOPO_PKG_DOMAIN, phys_cpu_present_map)) { + pr_info_once("Ignoring hot-pluggable APIC ID %x in present package.\n", + apic_id); + topo_info.nr_rejected_cpus++; + return; + } + + topo_info.nr_disabled_cpus++; + } + + /* Register present and possible CPUs in the domain maps */ + for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++) + set_bit(topo_apicid(apic_id, dom), apic_maps[dom].map); +} + +/** + * topology_register_apic - Register an APIC in early topology maps + * @apic_id: The APIC ID to set up + * @acpi_id: The ACPI ID associated to the APIC + * @present: True if the corresponding CPU is present */ -static int detect_extended_topology_leaf(struct cpuinfo_x86 *c) +void __init topology_register_apic(u32 apic_id, u32 acpi_id, bool present) { - if (c->cpuid_level >= 0x1f) { - if (check_extended_topology_leaf(0x1f) == 0) - return 0x1f; + if (apic_id >= MAX_LOCAL_APIC) { + pr_err_once("APIC ID %x exceeds kernel limit of: %x\n", apic_id, MAX_LOCAL_APIC - 1); + topo_info.nr_rejected_cpus++; + return; + } + + if (check_for_real_bsp(apic_id)) { + topo_info.nr_rejected_cpus++; + return; } - if (c->cpuid_level >= 0xb) { - if (check_extended_topology_leaf(0xb) == 0) - return 0xb; + /* CPU numbers exhausted? */ + if (apic_id != topo_info.boot_cpu_apic_id && topo_info.nr_assigned_cpus >= nr_cpu_ids) { + pr_warn_once("CPU limit of %d reached. Ignoring further CPUs\n", nr_cpu_ids); + topo_info.nr_rejected_cpus++; + return; } - return -1; + topo_register_apic(apic_id, acpi_id, present); +} + +/** + * topology_register_boot_apic - Register the boot CPU APIC + * @apic_id: The APIC ID to set up + * + * Separate so CPU #0 can be assigned + */ +void __init topology_register_boot_apic(u32 apic_id) +{ + WARN_ON_ONCE(topo_info.boot_cpu_apic_id != BAD_APICID); + + topo_info.boot_cpu_apic_id = apic_id; + topo_register_apic(apic_id, CPU_ACPIID_INVALID, true); +} + +/** + * topology_get_logical_id - Retrieve the logical ID at a given topology domain level + * @apicid: The APIC ID for which to lookup the logical ID + * @at_level: The topology domain level to use + * + * @apicid must be a full APIC ID, not the normalized variant. It's valid to have + * all bits below the domain level specified by @at_level to be clear. So both + * real APIC IDs and backshifted normalized APIC IDs work correctly. + * + * Returns: + * - >= 0: The requested logical ID + * - -ERANGE: @apicid is out of range + * - -ENODEV: @apicid is not registered + */ +int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level) +{ + /* Remove the bits below @at_level to get the proper level ID of @apicid */ + unsigned int lvlid = topo_apicid(apicid, at_level); + + if (lvlid >= MAX_LOCAL_APIC) + return -ERANGE; + if (!test_bit(lvlid, apic_maps[at_level].map)) + return -ENODEV; + /* Get the number of set bits before @lvlid. */ + return bitmap_weight(apic_maps[at_level].map, lvlid); +} +EXPORT_SYMBOL_GPL(topology_get_logical_id); + +/** + * topology_unit_count - Retrieve the count of specified units at a given topology domain level + * @apicid: The APIC ID which specifies the search range + * @which_units: The domain level specifying the units to count + * @at_level: The domain level at which @which_units have to be counted + * + * This returns the number of possible units according to the enumerated + * information. + * + * E.g. topology_count_units(apicid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN) + * counts the number of possible cores in the package to which @apicid + * belongs. + * + * @at_level must obviously be greater than @which_level to produce useful + * results. If @at_level is equal to @which_units the result is + * unsurprisingly 1. If @at_level is less than @which_units the results + * is by definition undefined and the function returns 0. + */ +unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units, + enum x86_topology_domains at_level) +{ + /* Remove the bits below @at_level to get the proper level ID of @apicid */ + unsigned int lvlid = topo_apicid(apicid, at_level); + + if (lvlid >= MAX_LOCAL_APIC) + return 0; + if (!test_bit(lvlid, apic_maps[at_level].map)) + return 0; + if (which_units > at_level) + return 0; + if (which_units == at_level) + return 1; + return topo_unit_count(lvlid, at_level, apic_maps[which_units].map); +} + +#ifdef CONFIG_ACPI_HOTPLUG_CPU +/** + * topology_hotplug_apic - Handle a physical hotplugged APIC after boot + * @apic_id: The APIC ID to set up + * @acpi_id: The ACPI ID associated to the APIC + */ +int topology_hotplug_apic(u32 apic_id, u32 acpi_id) +{ + int cpu; + + if (apic_id >= MAX_LOCAL_APIC) + return -EINVAL; + + /* Reject if the APIC ID was not registered during enumeration. */ + if (!test_bit(apic_id, apic_maps[TOPO_SMT_DOMAIN].map)) + return -ENODEV; + + cpu = topo_lookup_cpuid(apic_id); + if (cpu < 0) + return -ENOSPC; + + set_bit(apic_id, phys_cpu_present_map); + topo_set_cpuids(cpu, apic_id, acpi_id); + cpu_mark_primary_thread(cpu, apic_id); + return cpu; +} + +/** + * topology_hotunplug_apic - Remove a physical hotplugged APIC after boot + * @cpu: The CPU number for which the APIC ID is removed + */ +void topology_hotunplug_apic(unsigned int cpu) +{ + u32 apic_id = cpuid_to_apicid[cpu]; + + if (apic_id == BAD_APICID) + return; + + per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID; + clear_bit(apic_id, phys_cpu_present_map); + set_cpu_present(cpu, false); } #endif -int detect_extended_topology_early(struct cpuinfo_x86 *c) +#ifdef CONFIG_X86_LOCAL_APIC +static unsigned int max_possible_cpus __initdata = NR_CPUS; + +/** + * topology_apply_cmdline_limits_early - Apply topology command line limits early + * + * Ensure that command line limits are in effect before firmware parsing + * takes place. + */ +void __init topology_apply_cmdline_limits_early(void) { -#ifdef CONFIG_SMP - unsigned int eax, ebx, ecx, edx; - int leaf; + unsigned int possible = nr_cpu_ids; - leaf = detect_extended_topology_leaf(c); - if (leaf < 0) - return -1; + /* 'maxcpus=0' 'nosmp' 'nolapic' 'disableapic' 'noapic' */ + if (!setup_max_cpus || ioapic_is_disabled || apic_is_disabled) + possible = 1; - set_cpu_cap(c, X86_FEATURE_XTOPOLOGY); + /* 'possible_cpus=N' */ + possible = min_t(unsigned int, max_possible_cpus, possible); + + if (possible < nr_cpu_ids) { + pr_info("Limiting to %u possible CPUs\n", possible); + set_nr_cpu_ids(possible); + } +} - cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx); +static __init bool restrict_to_up(void) +{ + if (!smp_found_config || ioapic_is_disabled) + return true; /* - * initial apic id, which also represents 32-bit extended x2apic id. + * XEN PV is special as it does not advertise the local APIC + * properly, but provides a fake topology for it so that the + * infrastructure works. So don't apply the restrictions vs. APIC + * here. */ - c->topo.initial_apicid = edx; - smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx)); -#endif - return 0; + if (xen_pv_domain()) + return false; + + return apic_is_disabled; } -/* - * Check for extended topology enumeration cpuid leaf, and if it - * exists, use it for populating initial_apicid and cpu topology - * detection. - */ -int detect_extended_topology(struct cpuinfo_x86 *c) +void __init topology_init_possible_cpus(void) { -#ifdef CONFIG_SMP - unsigned int eax, ebx, ecx, edx, sub_index; - unsigned int ht_mask_width, core_plus_mask_width, die_plus_mask_width; - unsigned int core_select_mask, core_level_siblings; - unsigned int die_select_mask, die_level_siblings; - unsigned int pkg_mask_width; - bool die_level_present = false; - int leaf; - - leaf = detect_extended_topology_leaf(c); - if (leaf < 0) - return -1; + unsigned int assigned = topo_info.nr_assigned_cpus; + unsigned int disabled = topo_info.nr_disabled_cpus; + unsigned int cnta, cntb, cpu, allowed = 1; + unsigned int total = assigned + disabled; + u32 apicid, firstid; + + if (!restrict_to_up()) { + if (WARN_ON_ONCE(assigned > nr_cpu_ids)) { + disabled += assigned - nr_cpu_ids; + assigned = nr_cpu_ids; + } + allowed = min_t(unsigned int, total, nr_cpu_ids); + } + + if (total > allowed) + pr_warn("%u possible CPUs exceed the limit of %u\n", total, allowed); + + assigned = min_t(unsigned int, allowed, assigned); + disabled = allowed - assigned; + topo_info.nr_assigned_cpus = assigned; + topo_info.nr_disabled_cpus = disabled; + + total_cpus = allowed; + set_nr_cpu_ids(allowed); + + cnta = domain_weight(TOPO_PKG_DOMAIN); + cntb = domain_weight(TOPO_DIE_DOMAIN); + __max_logical_packages = cnta; + __max_dies_per_package = 1U << (get_count_order(cntb) - get_count_order(cnta)); + + pr_info("Max. logical packages: %3u\n", cnta); + pr_info("Max. logical dies: %3u\n", cntb); + pr_info("Max. dies per package: %3u\n", __max_dies_per_package); + + cnta = domain_weight(TOPO_CORE_DOMAIN); + cntb = domain_weight(TOPO_SMT_DOMAIN); /* - * Populate HT related information from sub-leaf level 0. + * Can't use order delta here as order(cnta) can be equal + * order(cntb) even if cnta != cntb. */ - cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx); - c->topo.initial_apicid = edx; - core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); - smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx)); - core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - die_level_siblings = LEVEL_MAX_SIBLINGS(ebx); - pkg_mask_width = die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - - sub_index = 1; - while (true) { - cpuid_count(leaf, sub_index, &eax, &ebx, &ecx, &edx); + __max_threads_per_core = DIV_ROUND_UP(cntb, cnta); + pr_info("Max. threads per core: %3u\n", __max_threads_per_core); - /* - * Check for the Core type in the implemented sub leaves. - */ - if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) { - core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); - core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - die_level_siblings = core_level_siblings; - die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - } - if (LEAFB_SUBTYPE(ecx) == DIE_TYPE) { - die_level_present = true; - die_level_siblings = LEVEL_MAX_SIBLINGS(ebx); - die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - } + firstid = find_first_bit(apic_maps[TOPO_SMT_DOMAIN].map, MAX_LOCAL_APIC); + __num_cores_per_package = topology_unit_count(firstid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN); + pr_info("Num. cores per package: %3u\n", __num_cores_per_package); + __num_threads_per_package = topology_unit_count(firstid, TOPO_SMT_DOMAIN, TOPO_PKG_DOMAIN); + pr_info("Num. threads per package: %3u\n", __num_threads_per_package); - if (LEAFB_SUBTYPE(ecx) != INVALID_TYPE) - pkg_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - else - break; + pr_info("Allowing %u present CPUs plus %u hotplug CPUs\n", assigned, disabled); + if (topo_info.nr_rejected_cpus) + pr_info("Rejected CPUs %u\n", topo_info.nr_rejected_cpus); - sub_index++; + init_cpu_present(cpumask_of(0)); + init_cpu_possible(cpumask_of(0)); + + /* Assign CPU numbers to non-present CPUs */ + for (apicid = 0; disabled; disabled--, apicid++) { + apicid = find_next_andnot_bit(apic_maps[TOPO_SMT_DOMAIN].map, phys_cpu_present_map, + MAX_LOCAL_APIC, apicid); + if (apicid >= MAX_LOCAL_APIC) + break; + cpuid_to_apicid[topo_info.nr_assigned_cpus++] = apicid; } - core_select_mask = (~(-1 << pkg_mask_width)) >> ht_mask_width; - die_select_mask = (~(-1 << die_plus_mask_width)) >> - core_plus_mask_width; + for (cpu = 0; cpu < allowed; cpu++) { + apicid = cpuid_to_apicid[cpu]; - c->topo.core_id = apic->phys_pkg_id(c->topo.initial_apicid, - ht_mask_width) & core_select_mask; + set_cpu_possible(cpu, true); - if (die_level_present) { - c->topo.die_id = apic->phys_pkg_id(c->topo.initial_apicid, - core_plus_mask_width) & die_select_mask; + if (apicid == BAD_APICID) + continue; + + cpu_mark_primary_thread(cpu, apicid); + set_cpu_present(cpu, test_bit(apicid, phys_cpu_present_map)); } +} - c->topo.pkg_id = apic->phys_pkg_id(c->topo.initial_apicid, pkg_mask_width); - /* - * Reinit the apicid, now that we have extended initial_apicid. - */ - c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0); +/* + * Late SMP disable after sizing CPU masks when APIC/IOAPIC setup failed. + */ +void __init topology_reset_possible_cpus_up(void) +{ + init_cpu_present(cpumask_of(0)); + init_cpu_possible(cpumask_of(0)); - c->x86_max_cores = (core_level_siblings / smp_num_siblings); - __max_die_per_package = (die_level_siblings / core_level_siblings); -#endif + bitmap_zero(phys_cpu_present_map, MAX_LOCAL_APIC); + if (topo_info.boot_cpu_apic_id != BAD_APICID) + set_bit(topo_info.boot_cpu_apic_id, phys_cpu_present_map); +} + +static int __init setup_possible_cpus(char *str) +{ + get_option(&str, &max_possible_cpus); return 0; } +early_param("possible_cpus", setup_possible_cpus); +#endif diff --git a/arch/x86/kernel/cpu/topology.h b/arch/x86/kernel/cpu/topology.h new file mode 100644 index 000000000000..37326297f80c --- /dev/null +++ b/arch/x86/kernel/cpu/topology.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ARCH_X86_TOPOLOGY_H +#define ARCH_X86_TOPOLOGY_H + +struct topo_scan { + struct cpuinfo_x86 *c; + unsigned int dom_shifts[TOPO_MAX_DOMAIN]; + unsigned int dom_ncpus[TOPO_MAX_DOMAIN]; + + /* Legacy CPUID[1]:EBX[23:16] number of logical processors */ + unsigned int ebx1_nproc_shift; + + /* AMD specific node ID which cannot be mapped into APIC space. */ + u16 amd_nodes_per_pkg; + u16 amd_node_id; +}; + +void cpu_init_topology(struct cpuinfo_x86 *c); +void cpu_parse_topology(struct cpuinfo_x86 *c); +void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom, + unsigned int shift, unsigned int ncpus); +bool cpu_parse_topology_ext(struct topo_scan *tscan); +void cpu_parse_topology_amd(struct topo_scan *tscan); +void cpu_topology_fixup_amd(struct topo_scan *tscan); + +static inline u32 topo_shift_apicid(u32 apicid, enum x86_topology_domains dom) +{ + if (dom == TOPO_SMT_DOMAIN) + return apicid; + return apicid >> x86_topo_system.dom_shifts[dom - 1]; +} + +static inline u32 topo_relative_domain_id(u32 apicid, enum x86_topology_domains dom) +{ + if (dom != TOPO_SMT_DOMAIN) + apicid >>= x86_topo_system.dom_shifts[dom - 1]; + return apicid & (x86_topo_system.dom_size[dom] - 1); +} + +static inline u32 topo_domain_mask(enum x86_topology_domains dom) +{ + return (1U << x86_topo_system.dom_shifts[dom]) - 1; +} + +/* + * Update a domain level after the fact without propagating. Used to fixup + * broken CPUID enumerations. + */ +static inline void topology_update_dom(struct topo_scan *tscan, enum x86_topology_domains dom, + unsigned int shift, unsigned int ncpus) +{ + tscan->dom_shifts[dom] = shift; + tscan->dom_ncpus[dom] = ncpus; +} + +#ifdef CONFIG_X86_LOCAL_APIC +unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units, + enum x86_topology_domains at_level); +#else +static inline unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units, + enum x86_topology_domains at_level) +{ + return 1; +} +#endif + +#endif /* ARCH_X86_TOPOLOGY_H */ diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c new file mode 100644 index 000000000000..1a8b3ad493af --- /dev/null +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -0,0 +1,183 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/cpu.h> + +#include <asm/apic.h> +#include <asm/memtype.h> +#include <asm/processor.h> + +#include "cpu.h" + +static bool parse_8000_0008(struct topo_scan *tscan) +{ + struct { + // ecx + u32 cpu_nthreads : 8, // Number of physical threads - 1 + : 4, // Reserved + apicid_coreid_len : 4, // Number of thread core ID bits (shift) in APIC ID + perf_tsc_len : 2, // Performance time-stamp counter size + : 14; // Reserved + } ecx; + unsigned int sft; + + if (tscan->c->extended_cpuid_level < 0x80000008) + return false; + + cpuid_leaf_reg(0x80000008, CPUID_ECX, &ecx); + + /* If the thread bits are 0, then get the shift value from ecx.cpu_nthreads */ + sft = ecx.apicid_coreid_len; + if (!sft) + sft = get_count_order(ecx.cpu_nthreads + 1); + + topology_set_dom(tscan, TOPO_SMT_DOMAIN, sft, ecx.cpu_nthreads + 1); + return true; +} + +static void store_node(struct topo_scan *tscan, unsigned int nr_nodes, u16 node_id) +{ + /* + * Starting with Fam 17h the DIE domain could probably be used to + * retrieve the node info on AMD/HYGON. Analysis of CPUID dumps + * suggests it's the topmost bit(s) of the CPU cores area, but + * that's guess work and neither enumerated nor documented. + * + * Up to Fam 16h this does not work at all and the legacy node ID + * has to be used. + */ + tscan->amd_nodes_per_pkg = nr_nodes; + tscan->amd_node_id = node_id; +} + +static bool parse_8000_001e(struct topo_scan *tscan, bool has_0xb) +{ + struct { + // eax + u32 ext_apic_id : 32; // Extended APIC ID + // ebx + u32 core_id : 8, // Unique per-socket logical core unit ID + core_nthreads : 8, // #Threads per core (zero-based) + : 16; // Reserved + // ecx + u32 node_id : 8, // Node (die) ID of invoking logical CPU + nnodes_per_socket : 3, // #nodes in invoking logical CPU's package/socket + : 21; // Reserved + // edx + u32 : 32; // Reserved + } leaf; + + if (!boot_cpu_has(X86_FEATURE_TOPOEXT)) + return false; + + cpuid_leaf(0x8000001e, &leaf); + + tscan->c->topo.initial_apicid = leaf.ext_apic_id; + + /* + * If leaf 0xb is available, then SMT shift is set already. If not + * take it from ecx.threads_per_core and use topo_update_dom() - + * topology_set_dom() would propagate and overwrite the already + * propagated CORE level. + */ + if (!has_0xb) { + unsigned int nthreads = leaf.core_nthreads + 1; + + topology_update_dom(tscan, TOPO_SMT_DOMAIN, get_count_order(nthreads), nthreads); + } + + store_node(tscan, leaf.nnodes_per_socket + 1, leaf.node_id); + + if (tscan->c->x86_vendor == X86_VENDOR_AMD) { + if (tscan->c->x86 == 0x15) + tscan->c->topo.cu_id = leaf.core_id; + + cacheinfo_amd_init_llc_id(tscan->c, leaf.node_id); + } else { + /* + * Package ID is ApicId[6..] on certain Hygon CPUs. See + * commit e0ceeae708ce for explanation. The topology info + * is screwed up: The package shift is always 6 and the + * node ID is bit [4:5]. + */ + if (!boot_cpu_has(X86_FEATURE_HYPERVISOR) && tscan->c->x86_model <= 0x3) { + topology_set_dom(tscan, TOPO_CORE_DOMAIN, 6, + tscan->dom_ncpus[TOPO_CORE_DOMAIN]); + } + cacheinfo_hygon_init_llc_id(tscan->c); + } + return true; +} + +static bool parse_fam10h_node_id(struct topo_scan *tscan) +{ + struct { + union { + u64 node_id : 3, + nodes_per_pkg : 3, + unused : 58; + u64 msr; + }; + } nid; + + if (!boot_cpu_has(X86_FEATURE_NODEID_MSR)) + return false; + + rdmsrl(MSR_FAM10H_NODE_ID, nid.msr); + store_node(tscan, nid.nodes_per_pkg + 1, nid.node_id); + tscan->c->topo.llc_id = nid.node_id; + return true; +} + +static void legacy_set_llc(struct topo_scan *tscan) +{ + unsigned int apicid = tscan->c->topo.initial_apicid; + + /* parse_8000_0008() set everything up except llc_id */ + tscan->c->topo.llc_id = apicid >> tscan->dom_shifts[TOPO_CORE_DOMAIN]; +} + +static void parse_topology_amd(struct topo_scan *tscan) +{ + bool has_0xb = false; + + /* + * If the extended topology leaf 0x8000_001e is available + * try to get SMT and CORE shift from leaf 0xb first, then + * try to get the CORE shift from leaf 0x8000_0008. + */ + if (cpu_feature_enabled(X86_FEATURE_TOPOEXT)) + has_0xb = cpu_parse_topology_ext(tscan); + + if (!has_0xb && !parse_8000_0008(tscan)) + return; + + /* Prefer leaf 0x8000001e if available */ + if (parse_8000_001e(tscan, has_0xb)) + return; + + /* Try the NODEID MSR */ + if (parse_fam10h_node_id(tscan)) + return; + + legacy_set_llc(tscan); +} + +void cpu_parse_topology_amd(struct topo_scan *tscan) +{ + tscan->amd_nodes_per_pkg = 1; + parse_topology_amd(tscan); + + if (tscan->amd_nodes_per_pkg > 1) + set_cpu_cap(tscan->c, X86_FEATURE_AMD_DCM); +} + +void cpu_topology_fixup_amd(struct topo_scan *tscan) +{ + struct cpuinfo_x86 *c = tscan->c; + + /* + * Adjust the core_id relative to the node when there is more than + * one node. + */ + if (tscan->c->x86 < 0x17 && tscan->amd_nodes_per_pkg > 1) + c->topo.core_id %= tscan->dom_ncpus[TOPO_CORE_DOMAIN] / tscan->amd_nodes_per_pkg; +} diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c new file mode 100644 index 000000000000..a50ae8d63d1c --- /dev/null +++ b/arch/x86/kernel/cpu/topology_common.c @@ -0,0 +1,218 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/cpu.h> + +#include <xen/xen.h> + +#include <asm/apic.h> +#include <asm/processor.h> +#include <asm/smp.h> + +#include "cpu.h" + +struct x86_topology_system x86_topo_system __ro_after_init; +EXPORT_SYMBOL_GPL(x86_topo_system); + +unsigned int __amd_nodes_per_pkg __ro_after_init; +EXPORT_SYMBOL_GPL(__amd_nodes_per_pkg); + +void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom, + unsigned int shift, unsigned int ncpus) +{ + topology_update_dom(tscan, dom, shift, ncpus); + + /* Propagate to the upper levels */ + for (dom++; dom < TOPO_MAX_DOMAIN; dom++) { + tscan->dom_shifts[dom] = tscan->dom_shifts[dom - 1]; + tscan->dom_ncpus[dom] = tscan->dom_ncpus[dom - 1]; + } +} + +static unsigned int __maybe_unused parse_num_cores_legacy(struct cpuinfo_x86 *c) +{ + struct { + u32 cache_type : 5, + unused : 21, + ncores : 6; + } eax; + + if (c->cpuid_level < 4) + return 1; + + cpuid_subleaf_reg(4, 0, CPUID_EAX, &eax); + if (!eax.cache_type) + return 1; + + return eax.ncores + 1; +} + +static void parse_legacy(struct topo_scan *tscan) +{ + unsigned int cores, core_shift, smt_shift = 0; + struct cpuinfo_x86 *c = tscan->c; + + cores = parse_num_cores_legacy(c); + core_shift = get_count_order(cores); + + if (cpu_has(c, X86_FEATURE_HT)) { + if (!WARN_ON_ONCE(tscan->ebx1_nproc_shift < core_shift)) + smt_shift = tscan->ebx1_nproc_shift - core_shift; + /* + * The parser expects leaf 0xb/0x1f format, which means + * the number of logical processors at core level is + * counting threads. + */ + core_shift += smt_shift; + cores <<= smt_shift; + } + + topology_set_dom(tscan, TOPO_SMT_DOMAIN, smt_shift, 1U << smt_shift); + topology_set_dom(tscan, TOPO_CORE_DOMAIN, core_shift, cores); +} + +static bool fake_topology(struct topo_scan *tscan) +{ + /* + * Preset the CORE level shift for CPUID less systems and XEN_PV, + * which has useless CPUID information. + */ + topology_set_dom(tscan, TOPO_SMT_DOMAIN, 0, 1); + topology_set_dom(tscan, TOPO_CORE_DOMAIN, 0, 1); + + return tscan->c->cpuid_level < 1; +} + +static void parse_topology(struct topo_scan *tscan, bool early) +{ + const struct cpuinfo_topology topo_defaults = { + .cu_id = 0xff, + .llc_id = BAD_APICID, + .l2c_id = BAD_APICID, + }; + struct cpuinfo_x86 *c = tscan->c; + struct { + u32 unused0 : 16, + nproc : 8, + apicid : 8; + } ebx; + + c->topo = topo_defaults; + + if (fake_topology(tscan)) + return; + + /* Preset Initial APIC ID from CPUID leaf 1 */ + cpuid_leaf_reg(1, CPUID_EBX, &ebx); + c->topo.initial_apicid = ebx.apicid; + + /* + * The initial invocation from early_identify_cpu() happens before + * the APIC is mapped or X2APIC enabled. For establishing the + * topology, that's not required. Use the initial APIC ID. + */ + if (early) + c->topo.apicid = c->topo.initial_apicid; + else + c->topo.apicid = read_apic_id(); + + /* The above is sufficient for UP */ + if (!IS_ENABLED(CONFIG_SMP)) + return; + + tscan->ebx1_nproc_shift = get_count_order(ebx.nproc); + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + if (IS_ENABLED(CONFIG_CPU_SUP_AMD)) + cpu_parse_topology_amd(tscan); + break; + case X86_VENDOR_CENTAUR: + case X86_VENDOR_ZHAOXIN: + parse_legacy(tscan); + break; + case X86_VENDOR_INTEL: + if (!IS_ENABLED(CONFIG_CPU_SUP_INTEL) || !cpu_parse_topology_ext(tscan)) + parse_legacy(tscan); + break; + case X86_VENDOR_HYGON: + if (IS_ENABLED(CONFIG_CPU_SUP_HYGON)) + cpu_parse_topology_amd(tscan); + break; + } +} + +static void topo_set_ids(struct topo_scan *tscan) +{ + struct cpuinfo_x86 *c = tscan->c; + u32 apicid = c->topo.apicid; + + c->topo.pkg_id = topo_shift_apicid(apicid, TOPO_PKG_DOMAIN); + c->topo.die_id = topo_shift_apicid(apicid, TOPO_DIE_DOMAIN); + + c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN); + c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN); + + /* Package relative core ID */ + c->topo.core_id = (apicid & topo_domain_mask(TOPO_PKG_DOMAIN)) >> + x86_topo_system.dom_shifts[TOPO_SMT_DOMAIN]; + + c->topo.amd_node_id = tscan->amd_node_id; + + if (c->x86_vendor == X86_VENDOR_AMD) + cpu_topology_fixup_amd(tscan); +} + +void cpu_parse_topology(struct cpuinfo_x86 *c) +{ + unsigned int dom, cpu = smp_processor_id(); + struct topo_scan tscan = { .c = c, }; + + parse_topology(&tscan, false); + + if (IS_ENABLED(CONFIG_X86_LOCAL_APIC)) { + if (c->topo.initial_apicid != c->topo.apicid) { + pr_err(FW_BUG "CPU%4u: APIC ID mismatch. CPUID: 0x%04x APIC: 0x%04x\n", + cpu, c->topo.initial_apicid, c->topo.apicid); + } + + if (c->topo.apicid != cpuid_to_apicid[cpu]) { + pr_err(FW_BUG "CPU%4u: APIC ID mismatch. Firmware: 0x%04x APIC: 0x%04x\n", + cpu, cpuid_to_apicid[cpu], c->topo.apicid); + } + } + + for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++) { + if (tscan.dom_shifts[dom] == x86_topo_system.dom_shifts[dom]) + continue; + pr_err(FW_BUG "CPU%d: Topology domain %u shift %u != %u\n", cpu, dom, + tscan.dom_shifts[dom], x86_topo_system.dom_shifts[dom]); + } + + topo_set_ids(&tscan); +} + +void __init cpu_init_topology(struct cpuinfo_x86 *c) +{ + struct topo_scan tscan = { .c = c, }; + unsigned int dom, sft; + + parse_topology(&tscan, true); + + /* Copy the shift values and calculate the unit sizes. */ + memcpy(x86_topo_system.dom_shifts, tscan.dom_shifts, sizeof(x86_topo_system.dom_shifts)); + + dom = TOPO_SMT_DOMAIN; + x86_topo_system.dom_size[dom] = 1U << x86_topo_system.dom_shifts[dom]; + + for (dom++; dom < TOPO_MAX_DOMAIN; dom++) { + sft = x86_topo_system.dom_shifts[dom] - x86_topo_system.dom_shifts[dom - 1]; + x86_topo_system.dom_size[dom] = 1U << sft; + } + + topo_set_ids(&tscan); + + /* + * AMD systems have Nodes per package which cannot be mapped to + * APIC ID. + */ + __amd_nodes_per_pkg = tscan.amd_nodes_per_pkg; +} diff --git a/arch/x86/kernel/cpu/topology_ext.c b/arch/x86/kernel/cpu/topology_ext.c new file mode 100644 index 000000000000..e477228cd5b2 --- /dev/null +++ b/arch/x86/kernel/cpu/topology_ext.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/cpu.h> + +#include <asm/apic.h> +#include <asm/memtype.h> +#include <asm/processor.h> + +#include "cpu.h" + +enum topo_types { + INVALID_TYPE = 0, + SMT_TYPE = 1, + CORE_TYPE = 2, + MAX_TYPE_0B = 3, + MODULE_TYPE = 3, + TILE_TYPE = 4, + DIE_TYPE = 5, + DIEGRP_TYPE = 6, + MAX_TYPE_1F = 7, +}; + +/* + * Use a lookup table for the case that there are future types > 6 which + * describe an intermediate domain level which does not exist today. + */ +static const unsigned int topo_domain_map_0b_1f[MAX_TYPE_1F] = { + [SMT_TYPE] = TOPO_SMT_DOMAIN, + [CORE_TYPE] = TOPO_CORE_DOMAIN, + [MODULE_TYPE] = TOPO_MODULE_DOMAIN, + [TILE_TYPE] = TOPO_TILE_DOMAIN, + [DIE_TYPE] = TOPO_DIE_DOMAIN, + [DIEGRP_TYPE] = TOPO_DIEGRP_DOMAIN, +}; + +static inline bool topo_subleaf(struct topo_scan *tscan, u32 leaf, u32 subleaf, + unsigned int *last_dom) +{ + unsigned int dom, maxtype; + const unsigned int *map; + struct { + // eax + u32 x2apic_shift : 5, // Number of bits to shift APIC ID right + // for the topology ID at the next level + : 27; // Reserved + // ebx + u32 num_processors : 16, // Number of processors at current level + : 16; // Reserved + // ecx + u32 level : 8, // Current topology level. Same as sub leaf number + type : 8, // Level type. If 0, invalid + : 16; // Reserved + // edx + u32 x2apic_id : 32; // X2APIC ID of the current logical processor + } sl; + + switch (leaf) { + case 0x0b: maxtype = MAX_TYPE_0B; map = topo_domain_map_0b_1f; break; + case 0x1f: maxtype = MAX_TYPE_1F; map = topo_domain_map_0b_1f; break; + default: return false; + } + + cpuid_subleaf(leaf, subleaf, &sl); + + if (!sl.num_processors || sl.type == INVALID_TYPE) + return false; + + if (sl.type >= maxtype) { + pr_err_once("Topology: leaf 0x%x:%d Unknown domain type %u\n", + leaf, subleaf, sl.type); + /* + * It really would have been too obvious to make the domain + * type space sparse and leave a few reserved types between + * the points which might change instead of following the + * usual "this can be fixed in software" principle. + */ + dom = *last_dom + 1; + } else { + dom = map[sl.type]; + *last_dom = dom; + } + + if (!dom) { + tscan->c->topo.initial_apicid = sl.x2apic_id; + } else if (tscan->c->topo.initial_apicid != sl.x2apic_id) { + pr_warn_once(FW_BUG "CPUID leaf 0x%x subleaf %d APIC ID mismatch %x != %x\n", + leaf, subleaf, tscan->c->topo.initial_apicid, sl.x2apic_id); + } + + topology_set_dom(tscan, dom, sl.x2apic_shift, sl.num_processors); + return true; +} + +static bool parse_topology_leaf(struct topo_scan *tscan, u32 leaf) +{ + unsigned int last_dom; + u32 subleaf; + + /* Read all available subleafs and populate the levels */ + for (subleaf = 0, last_dom = 0; topo_subleaf(tscan, leaf, subleaf, &last_dom); subleaf++); + + /* If subleaf 0 failed to parse, give up */ + if (!subleaf) + return false; + + /* + * There are machines in the wild which have shift 0 in the subleaf + * 0, but advertise 2 logical processors at that level. They are + * truly SMT. + */ + if (!tscan->dom_shifts[TOPO_SMT_DOMAIN] && tscan->dom_ncpus[TOPO_SMT_DOMAIN] > 1) { + unsigned int sft = get_count_order(tscan->dom_ncpus[TOPO_SMT_DOMAIN]); + + pr_warn_once(FW_BUG "CPUID leaf 0x%x subleaf 0 has shift level 0 but %u CPUs. Fixing it up.\n", + leaf, tscan->dom_ncpus[TOPO_SMT_DOMAIN]); + topology_update_dom(tscan, TOPO_SMT_DOMAIN, sft, tscan->dom_ncpus[TOPO_SMT_DOMAIN]); + } + + set_cpu_cap(tscan->c, X86_FEATURE_XTOPOLOGY); + return true; +} + +bool cpu_parse_topology_ext(struct topo_scan *tscan) +{ + /* Intel: Try leaf 0x1F first. */ + if (tscan->c->cpuid_level >= 0x1f && parse_topology_leaf(tscan, 0x1f)) + return true; + + /* Intel/AMD: Fall back to leaf 0xB if available */ + return tscan->c->cpuid_level >= 0x0b && parse_topology_leaf(tscan, 0x0b); +} diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c index 415564a6523b..90eba7eb5335 100644 --- a/arch/x86/kernel/cpu/zhaoxin.c +++ b/arch/x86/kernel/cpu/zhaoxin.c @@ -71,10 +71,6 @@ static void init_zhaoxin(struct cpuinfo_x86 *c) { early_init_zhaoxin(c); init_intel_cacheinfo(c); - detect_num_cpu_cores(c); -#ifdef CONFIG_X86_32 - detect_ht(c); -#endif if (c->cpuid_level > 9) { unsigned int eax = cpuid_eax(10); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index b6b044356f1b..d184c29398db 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -40,6 +40,7 @@ #include <asm/intel_pt.h> #include <asm/crash.h> #include <asm/cmdline.h> +#include <asm/sev.h> /* Used while preparing memory map entries for second kernel */ struct crash_memmap_data { @@ -59,6 +60,8 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) */ cpu_emergency_stop_pt(); + kdump_sev_callback(); + disable_local_APIC(); } diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index afd09924094e..4aeafe63521b 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -136,7 +136,7 @@ static void __init dtb_cpu_setup(void) pr_warn("%pOF: missing local APIC ID\n", dn); continue; } - generic_processor_info(apic_id); + topology_register_apic(apic_id, CPU_ACPIID_INVALID, true); } } @@ -302,7 +302,7 @@ void __init x86_flattree_get_config(void) } #endif -void __init x86_dtb_init(void) +void __init x86_dtb_parse_smp_config(void) { if (!of_have_populated_dt()) return; diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index fb8cf953380d..b66f540de054 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1017,10 +1017,12 @@ void __init e820__reserve_setup_data(void) e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); /* - * SETUP_EFI and SETUP_IMA are supplied by kexec and do not need - * to be reserved. + * SETUP_EFI, SETUP_IMA and SETUP_RNG_SEED are supplied by + * kexec and do not need to be reserved. */ - if (data->type != SETUP_EFI && data->type != SETUP_IMA) + if (data->type != SETUP_EFI && + data->type != SETUP_IMA && + data->type != SETUP_RNG_SEED) e820__range_update_kexec(pa_data, sizeof(*data) + data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 16f9814c9be0..6726e0473d0b 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -106,6 +106,10 @@ void __init init_espfix_bsp(void) pgd_t *pgd; p4d_t *p4d; + /* FRED systems always restore the full value of %rsp */ + if (cpu_feature_enabled(X86_FEATURE_FRED)) + return; + /* Install the espfix pud into the kernel page directory */ pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)]; p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR); @@ -129,6 +133,10 @@ void init_espfix_ap(int cpu) void *stack_page; pteval_t ptemask; + /* FRED systems always restore the full value of %rsp */ + if (cpu_feature_enabled(X86_FEATURE_FRED)) + return; + /* We only have to do this once... */ if (likely(per_cpu(espfix_stack, cpu))) return; /* Already initialized */ diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 558076dbde5b..247f2225aa9f 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -274,12 +274,13 @@ static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures, * Attempt to restore the FPU registers directly from user memory. * Pagefaults are handled and any errors returned are fatal. */ -static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, - bool fx_only, unsigned int size) +static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only) { struct fpu *fpu = ¤t->thread.fpu; int ret; + /* Restore enabled features only. */ + xrestore &= fpu->fpstate->user_xfeatures; retry: fpregs_lock(); /* Ensure that XFD is up to date */ @@ -309,7 +310,7 @@ retry: if (ret != X86_TRAP_PF) return false; - if (!fault_in_readable(buf, size)) + if (!fault_in_readable(buf, fpu->fpstate->user_size)) goto retry; return false; } @@ -339,7 +340,6 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, struct user_i387_ia32_struct env; bool success, fx_only = false; union fpregs_state *fpregs; - unsigned int state_size; u64 user_xfeatures = 0; if (use_xsave()) { @@ -349,17 +349,14 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, return false; fx_only = !fx_sw_user.magic1; - state_size = fx_sw_user.xstate_size; user_xfeatures = fx_sw_user.xfeatures; } else { user_xfeatures = XFEATURE_MASK_FPSSE; - state_size = fpu->fpstate->user_size; } if (likely(!ia32_fxstate)) { /* Restore the FPU registers directly from user memory. */ - return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only, - state_size); + return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only); } /* diff --git a/arch/x86/kernel/fred.c b/arch/x86/kernel/fred.c new file mode 100644 index 000000000000..4bcd8791ad96 --- /dev/null +++ b/arch/x86/kernel/fred.c @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/kernel.h> + +#include <asm/desc.h> +#include <asm/fred.h> +#include <asm/tlbflush.h> +#include <asm/traps.h> + +/* #DB in the kernel would imply the use of a kernel debugger. */ +#define FRED_DB_STACK_LEVEL 1UL +#define FRED_NMI_STACK_LEVEL 2UL +#define FRED_MC_STACK_LEVEL 2UL +/* + * #DF is the highest level because a #DF means "something went wrong + * *while delivering an exception*." The number of cases for which that + * can happen with FRED is drastically reduced and basically amounts to + * "the stack you pointed me to is broken." Thus, always change stacks + * on #DF, which means it should be at the highest level. + */ +#define FRED_DF_STACK_LEVEL 3UL + +#define FRED_STKLVL(vector, lvl) ((lvl) << (2 * (vector))) + +void cpu_init_fred_exceptions(void) +{ + /* When FRED is enabled by default, remove this log message */ + pr_info("Initialize FRED on CPU%d\n", smp_processor_id()); + + wrmsrl(MSR_IA32_FRED_CONFIG, + /* Reserve for CALL emulation */ + FRED_CONFIG_REDZONE | + FRED_CONFIG_INT_STKLVL(0) | + FRED_CONFIG_ENTRYPOINT(asm_fred_entrypoint_user)); + + /* + * The purpose of separate stacks for NMI, #DB and #MC *in the kernel* + * (remember that user space faults are always taken on stack level 0) + * is to avoid overflowing the kernel stack. + */ + wrmsrl(MSR_IA32_FRED_STKLVLS, + FRED_STKLVL(X86_TRAP_DB, FRED_DB_STACK_LEVEL) | + FRED_STKLVL(X86_TRAP_NMI, FRED_NMI_STACK_LEVEL) | + FRED_STKLVL(X86_TRAP_MC, FRED_MC_STACK_LEVEL) | + FRED_STKLVL(X86_TRAP_DF, FRED_DF_STACK_LEVEL)); + + /* The FRED equivalents to IST stacks... */ + wrmsrl(MSR_IA32_FRED_RSP1, __this_cpu_ist_top_va(DB)); + wrmsrl(MSR_IA32_FRED_RSP2, __this_cpu_ist_top_va(NMI)); + wrmsrl(MSR_IA32_FRED_RSP3, __this_cpu_ist_top_va(DF)); + + /* Enable FRED */ + cr4_set_bits(X86_CR4_FRED); + /* Any further IDT use is a bug */ + idt_invalidate(); + + /* Use int $0x80 for 32-bit system calls in FRED mode */ + setup_clear_cpu_cap(X86_FEATURE_SYSENTER32); + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); +} diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index d4918d03efb4..c38e43589046 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -26,6 +26,7 @@ #include <asm/apicdef.h> #include <asm/fixmap.h> #include <asm/smp.h> +#include <asm/thread_info.h> /* * We are not able to switch in one step to the final KERNEL ADDRESS SPACE @@ -66,7 +67,7 @@ SYM_CODE_START_NOALIGN(startup_64) mov %rsi, %r15 /* Set up the stack for verify_cpu() */ - leaq (__end_init_task - PTREGS_SIZE)(%rip), %rsp + leaq (__end_init_task - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE)(%rip), %rsp leaq _text(%rip), %rdi diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index a38d0c93a66e..c96ae8fee95e 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -568,7 +568,7 @@ static struct irq_domain *hpet_create_irq_domain(int hpet_id) fwspec.param_count = 1; fwspec.param[0] = hpet_id; - parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY); + parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_GENERIC_MSI); if (!parent) { irq_domain_free_fwnode(fn); kfree(domain_info); diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 660b601f1d6c..0cd53fa8c65d 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -337,7 +337,7 @@ void idt_invalidate(void) load_idt(&idt); } -void __init alloc_intr_gate(unsigned int n, const void *addr) +void __init idt_install_sysvec(unsigned int n, const void *function) { if (WARN_ON(n < FIRST_SYSTEM_VECTOR)) return; @@ -346,5 +346,5 @@ void __init alloc_intr_gate(unsigned int n, const void *addr) return; if (!WARN_ON(test_and_set_bit(n, system_vectors))) - set_intr_gate(n, addr); + set_intr_gate(n, function); } diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index c683666876f1..f79c5edc0b89 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -28,6 +28,7 @@ #include <asm/setup.h> #include <asm/i8259.h> #include <asm/traps.h> +#include <asm/fred.h> #include <asm/prom.h> /* @@ -96,7 +97,11 @@ void __init native_init_IRQ(void) /* Execute any quirks before the call gates are initialised: */ x86_init.irqs.pre_vector_init(); - idt_setup_apic_and_irq_gates(); + if (cpu_feature_enabled(X86_FEATURE_FRED)) + fred_complete_exception_setup(); + else + idt_setup_apic_and_irq_gates(); + lapic_assign_system_vectors(); if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) { diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index 578d16fc040f..df337860612d 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -89,7 +89,7 @@ static void __init jailhouse_x2apic_init(void) #endif } -static void __init jailhouse_get_smp_config(unsigned int early) +static void __init jailhouse_parse_smp_config(void) { struct ioapic_domain_cfg ioapic_cfg = { .type = IOAPIC_DOMAIN_STRICT, @@ -102,7 +102,7 @@ static void __init jailhouse_get_smp_config(unsigned int early) register_lapic_address(0xfee00000); for (cpu = 0; cpu < setup_data.v1.num_cpus; cpu++) - generic_processor_info(setup_data.v1.cpu_ids[cpu]); + topology_register_apic(setup_data.v1.cpu_ids[cpu], CPU_ACPIID_INVALID, true); smp_found_config = 1; @@ -201,21 +201,23 @@ static void __init jailhouse_init_platform(void) struct setup_data header; void *mapping; - x86_init.irqs.pre_vector_init = x86_init_noop; - x86_init.timers.timer_init = jailhouse_timer_init; - x86_init.mpparse.get_smp_config = jailhouse_get_smp_config; - x86_init.pci.arch_init = jailhouse_pci_arch_init; + x86_init.irqs.pre_vector_init = x86_init_noop; + x86_init.timers.timer_init = jailhouse_timer_init; + x86_init.mpparse.find_mptable = x86_init_noop; + x86_init.mpparse.early_parse_smp_cfg = x86_init_noop; + x86_init.mpparse.parse_smp_cfg = jailhouse_parse_smp_config; + x86_init.pci.arch_init = jailhouse_pci_arch_init; - x86_platform.calibrate_cpu = jailhouse_get_tsc; - x86_platform.calibrate_tsc = jailhouse_get_tsc; - x86_platform.get_wallclock = jailhouse_get_wallclock; - x86_platform.legacy.rtc = 0; - x86_platform.legacy.warm_reset = 0; - x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT; + x86_platform.calibrate_cpu = jailhouse_get_tsc; + x86_platform.calibrate_tsc = jailhouse_get_tsc; + x86_platform.get_wallclock = jailhouse_get_wallclock; + x86_platform.legacy.rtc = 0; + x86_platform.legacy.warm_reset = 0; + x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT; - legacy_pic = &null_legacy_pic; + legacy_pic = &null_legacy_pic; - machine_ops.emergency_restart = jailhouse_no_restart; + machine_ops.emergency_restart = jailhouse_no_restart; while (pa_data) { mapping = early_memremap(pa_data, sizeof(header)); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index dfe9945b9bec..101a7c1bf200 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -434,7 +434,8 @@ static void __init sev_map_percpu_data(void) { int cpu; - if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) + if (cc_vendor != CC_VENDOR_AMD || + !cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) return; for_each_possible_cpu(cpu) { @@ -829,7 +830,7 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { static_branch_enable(&kvm_async_pf_enabled); - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_kvm_asyncpf_interrupt); + sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_asyncpf_interrupt); } #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 5bb395551c44..5b2c15214a6b 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -154,15 +154,15 @@ static int kvm_cs_enable(struct clocksource *cs) return 0; } -struct clocksource kvm_clock = { +static struct clocksource kvm_clock = { .name = "kvm-clock", .read = kvm_clock_get_cycles, .rating = 400, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .id = CSID_X86_KVM_CLK, .enable = kvm_cs_enable, }; -EXPORT_SYMBOL_GPL(kvm_clock); static void kvm_register_clock(char *txt) { diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index b223922248e9..1ccd30c8246f 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -36,6 +36,8 @@ * Checksum an MP configuration block. */ +static unsigned int num_procs __initdata; + static int __init mpf_checksum(unsigned char *mp, int len) { int sum = 0; @@ -50,16 +52,15 @@ static void __init MP_processor_info(struct mpc_cpu *m) { char *bootup_cpu = ""; - if (!(m->cpuflag & CPU_ENABLED)) { - disabled_cpus++; + topology_register_apic(m->apicid, CPU_ACPIID_INVALID, m->cpuflag & CPU_ENABLED); + if (!(m->cpuflag & CPU_ENABLED)) return; - } if (m->cpuflag & CPU_BOOTPROCESSOR) bootup_cpu = " (Bootup-CPU)"; pr_info("Processor #%d%s\n", m->apicid, bootup_cpu); - generic_processor_info(m->apicid); + num_procs++; } #ifdef CONFIG_X86_IO_APIC @@ -236,9 +237,9 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) } } - if (!num_processors) + if (!num_procs && !acpi_lapic) pr_err("MPTABLE: no processors registered!\n"); - return num_processors; + return num_procs || acpi_lapic; } #ifdef CONFIG_X86_IO_APIC @@ -473,7 +474,7 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) /* * Scan the memory blocks for an SMP configuration block. */ -void __init default_get_smp_config(unsigned int early) +static __init void mpparse_get_smp_config(unsigned int early) { struct mpf_intel *mpf; @@ -529,8 +530,8 @@ void __init default_get_smp_config(unsigned int early) } else BUG(); - if (!early) - pr_info("Processors: %d\n", num_processors); + if (!early && !acpi_lapic) + pr_info("Processors: %d\n", num_procs); /* * Only use the first configuration found. */ @@ -538,6 +539,16 @@ out: early_memunmap(mpf, sizeof(*mpf)); } +void __init mpparse_parse_early_smp_config(void) +{ + mpparse_get_smp_config(true); +} + +void __init mpparse_parse_smp_config(void) +{ + mpparse_get_smp_config(false); +} + static void __init smp_reserve_memory(struct mpf_intel *mpf) { memblock_reserve(mpf->physptr, get_mpc_size(mpf->physptr)); @@ -587,7 +598,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) return ret; } -void __init default_find_smp_config(void) +void __init mpparse_find_mptable(void) { unsigned int address; diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 17e955ab69fe..56e7a9e2737a 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -35,6 +35,7 @@ #include <asm/nospec-branch.h> #include <asm/microcode.h> #include <asm/sev.h> +#include <asm/fred.h> #define CREATE_TRACE_POINTS #include <trace/events/nmi.h> @@ -303,13 +304,13 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) __this_cpu_add(nmi_stats.unknown, 1); - pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", - reason, smp_processor_id()); + pr_emerg_ratelimited("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", + reason, smp_processor_id()); if (unknown_nmi_panic || panic_on_unrecovered_nmi) nmi_panic(regs, "NMI: Not continuing"); - pr_emerg("Dazed and confused, but trying to continue\n"); + pr_emerg_ratelimited("Dazed and confused, but trying to continue\n"); } NOKPROBE_SYMBOL(unknown_nmi_error); @@ -563,9 +564,6 @@ nmi_restart: } if (this_cpu_dec_return(nmi_state)) goto nmi_restart; - - if (user_mode(regs)) - mds_user_clear_cpu_buffers(); } #if IS_ENABLED(CONFIG_KVM_INTEL) @@ -639,7 +637,7 @@ void nmi_backtrace_stall_check(const struct cpumask *btp) msgp = nmi_check_stall_msg[idx]; if (nsp->idt_ignored_snap != READ_ONCE(nsp->idt_ignored) && (idx & 0x1)) modp = ", but OK because ignore_nmis was set"; - if (nmi_seq & ~0x1) + if (nmi_seq & 0x1) msghp = " (CPU currently in NMI handler function)"; else if (nsp->idt_nmi_seq_snap + 1 == nmi_seq) msghp = " (CPU exited one NMI handler function)"; @@ -651,6 +649,47 @@ void nmi_backtrace_stall_check(const struct cpumask *btp) #endif +#ifdef CONFIG_X86_FRED +/* + * With FRED, CR2/DR6 is pushed to #PF/#DB stack frame during FRED + * event delivery, i.e., there is no problem of transient states. + * And NMI unblocking only happens when the stack frame indicates + * that so should happen. + * + * Thus, the NMI entry stub for FRED is really straightforward and + * as simple as most exception handlers. As such, #DB is allowed + * during NMI handling. + */ +DEFINE_FREDENTRY_NMI(exc_nmi) +{ + irqentry_state_t irq_state; + + if (arch_cpu_is_offline(smp_processor_id())) { + if (microcode_nmi_handler_enabled()) + microcode_offline_nmi_handler(); + return; + } + + /* + * Save CR2 for eventual restore to cover the case where the NMI + * hits the VMENTER/VMEXIT region where guest CR2 is life. This + * prevents guest state corruption in case that the NMI handler + * takes a page fault. + */ + this_cpu_write(nmi_cr2, read_cr2()); + + irq_state = irqentry_nmi_enter(regs); + + inc_irq_stat(__nmi_count); + default_do_nmi(regs); + + irqentry_nmi_exit(regs, irq_state); + + if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) + write_cr2(this_cpu_read(nmi_cr2)); +} +#endif + void stop_nmi(void) { ignore_nmis++; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ab49ade31b0d..6121c2b42ecf 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -936,7 +936,7 @@ static __cpuidle void mwait_idle(void) void select_idle_routine(const struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP - if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1) + if (boot_option_idle_override == IDLE_POLL && __max_threads_per_core > 1) pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n"); #endif if (x86_idle_set() || boot_option_idle_override == IDLE_POLL) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 33b268747bb7..c075591b7b46 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -56,6 +56,7 @@ #include <asm/resctrl.h> #include <asm/unistd.h> #include <asm/fsgsbase.h> +#include <asm/fred.h> #ifdef CONFIG_IA32_EMULATION /* Not included via unistd.h */ #include <asm/unistd_32_ia32.h> @@ -117,7 +118,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, printk("%sFS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", log_lvl, fs, fsindex, gs, gsindex, shadowgs); - printk("%sCS: %04lx DS: %04x ES: %04x CR0: %016lx\n", + printk("%sCS: %04x DS: %04x ES: %04x CR0: %016lx\n", log_lvl, regs->cs, ds, es, cr0); printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n", log_lvl, cr2, cr3, cr4); @@ -166,7 +167,29 @@ static noinstr unsigned long __rdgsbase_inactive(void) lockdep_assert_irqs_disabled(); - if (!cpu_feature_enabled(X86_FEATURE_XENPV)) { + /* + * SWAPGS is no longer needed thus NOT allowed with FRED because + * FRED transitions ensure that an operating system can _always_ + * operate with its own GS base address: + * - For events that occur in ring 3, FRED event delivery swaps + * the GS base address with the IA32_KERNEL_GS_BASE MSR. + * - ERETU (the FRED transition that returns to ring 3) also swaps + * the GS base address with the IA32_KERNEL_GS_BASE MSR. + * + * And the operating system can still setup the GS segment for a + * user thread without the need of loading a user thread GS with: + * - Using LKGS, available with FRED, to modify other attributes + * of the GS segment without compromising its ability always to + * operate with its own GS base address. + * - Accessing the GS segment base address for a user thread as + * before using RDMSR or WRMSR on the IA32_KERNEL_GS_BASE MSR. + * + * Note, LKGS loads the GS base address into the IA32_KERNEL_GS_BASE + * MSR instead of the GS segment’s descriptor cache. As such, the + * operating system never changes its runtime GS base address. + */ + if (!cpu_feature_enabled(X86_FEATURE_FRED) && + !cpu_feature_enabled(X86_FEATURE_XENPV)) { native_swapgs(); gsbase = rdgsbase(); native_swapgs(); @@ -191,7 +214,8 @@ static noinstr void __wrgsbase_inactive(unsigned long gsbase) { lockdep_assert_irqs_disabled(); - if (!cpu_feature_enabled(X86_FEATURE_XENPV)) { + if (!cpu_feature_enabled(X86_FEATURE_FRED) && + !cpu_feature_enabled(X86_FEATURE_XENPV)) { native_swapgs(); wrgsbase(gsbase); native_swapgs(); @@ -505,7 +529,7 @@ void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase) static void start_thread_common(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp, - unsigned int _cs, unsigned int _ss, unsigned int _ds) + u16 _cs, u16 _ss, u16 _ds) { WARN_ON_ONCE(regs != current_pt_regs()); @@ -522,11 +546,36 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, loadsegment(ds, _ds); load_gs_index(0); - regs->ip = new_ip; - regs->sp = new_sp; - regs->cs = _cs; - regs->ss = _ss; - regs->flags = X86_EFLAGS_IF; + regs->ip = new_ip; + regs->sp = new_sp; + regs->csx = _cs; + regs->ssx = _ss; + /* + * Allow single-step trap and NMI when starting a new task, thus + * once the new task enters user space, single-step trap and NMI + * are both enabled immediately. + * + * Entering a new task is logically speaking a return from a + * system call (exec, fork, clone, etc.). As such, if ptrace + * enables single stepping a single step exception should be + * allowed to trigger immediately upon entering user space. + * This is not optional. + * + * NMI should *never* be disabled in user space. As such, this + * is an optional, opportunistic way to catch errors. + * + * Paranoia: High-order 48 bits above the lowest 16 bit SS are + * discarded by the legacy IRET instruction on all Intel, AMD, + * and Cyrix/Centaur/VIA CPUs, thus can be set unconditionally, + * even when FRED is not enabled. But we choose the safer side + * to use these bits only when FRED is enabled. + */ + if (cpu_feature_enabled(X86_FEATURE_FRED)) { + regs->fred_ss.swevent = true; + regs->fred_ss.nmi = true; + } + + regs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; } void diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 84201071dfac..4e320d4d3898 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -970,10 +970,8 @@ void __init setup_arch(char **cmdline_p) high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; #endif - /* - * Find and reserve possible boot-time SMP configuration: - */ - find_smp_config(); + /* Find and reserve MPTABLE area */ + x86_init.mpparse.find_mptable(); early_alloc_pgt_buf(); @@ -1090,7 +1088,9 @@ void __init setup_arch(char **cmdline_p) early_platform_quirks(); + /* Some platforms need the APIC registered for NUMA configuration */ early_acpi_boot_init(); + x86_init.mpparse.early_parse_smp_cfg(); x86_flattree_get_config(); @@ -1131,24 +1131,19 @@ void __init setup_arch(char **cmdline_p) early_quirks(); - /* - * Read APIC and some other early information from ACPI tables. - */ - acpi_boot_init(); - x86_dtb_init(); + topology_apply_cmdline_limits_early(); /* - * get boot-time SMP configuration: + * Parse SMP configuration. Try ACPI first and then the platform + * specific parser. */ - get_smp_config(); + acpi_boot_init(); + x86_init.mpparse.parse_smp_cfg(); - /* - * Systems w/o ACPI and mptables might not have it mapped the local - * APIC yet, but prefill_possible_map() might need to access it. - */ + /* Last opportunity to detect and map the local APIC */ init_apic_mappings(); - prefill_possible_map(); + topology_init_possible_cpus(); init_cpu_to_node(); init_gi_nodes(); diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 1d24ec679915..ae79f9505298 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -10,11 +10,15 @@ */ #ifndef __BOOT_COMPRESSED -#define error(v) pr_err(v) -#define has_cpuflag(f) boot_cpu_has(f) +#define error(v) pr_err(v) +#define has_cpuflag(f) boot_cpu_has(f) +#define sev_printk(fmt, ...) printk(fmt, ##__VA_ARGS__) +#define sev_printk_rtl(fmt, ...) printk_ratelimited(fmt, ##__VA_ARGS__) #else #undef WARN #define WARN(condition, format...) (!!(condition)) +#define sev_printk(fmt, ...) +#define sev_printk_rtl(fmt, ...) #endif /* I/O parameters for CPUID-related helpers */ @@ -556,9 +560,9 @@ static int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_le leaf->eax = leaf->ebx = leaf->ecx = leaf->edx = 0; /* Skip post-processing for out-of-range zero leafs. */ - if (!(leaf->fn <= cpuid_std_range_max || - (leaf->fn >= 0x40000000 && leaf->fn <= cpuid_hyp_range_max) || - (leaf->fn >= 0x80000000 && leaf->fn <= cpuid_ext_range_max))) + if (!(leaf->fn <= RIP_REL_REF(cpuid_std_range_max) || + (leaf->fn >= 0x40000000 && leaf->fn <= RIP_REL_REF(cpuid_hyp_range_max)) || + (leaf->fn >= 0x80000000 && leaf->fn <= RIP_REL_REF(cpuid_ext_range_max)))) return 0; } @@ -574,6 +578,7 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) { unsigned int subfn = lower_bits(regs->cx, 32); unsigned int fn = lower_bits(regs->ax, 32); + u16 opcode = *(unsigned short *)regs->ip; struct cpuid_leaf leaf; int ret; @@ -581,6 +586,10 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) if (exit_code != SVM_EXIT_CPUID) goto fail; + /* Is it really a CPUID insn? */ + if (opcode != 0xa20f) + goto fail; + leaf.fn = fn; leaf.subfn = subfn; @@ -1063,11 +1072,11 @@ static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info) const struct snp_cpuid_fn *fn = &cpuid_table->fn[i]; if (fn->eax_in == 0x0) - cpuid_std_range_max = fn->eax; + RIP_REL_REF(cpuid_std_range_max) = fn->eax; else if (fn->eax_in == 0x40000000) - cpuid_hyp_range_max = fn->eax; + RIP_REL_REF(cpuid_hyp_range_max) = fn->eax; else if (fn->eax_in == 0x80000000) - cpuid_ext_range_max = fn->eax; + RIP_REL_REF(cpuid_ext_range_max) = fn->eax; } } @@ -1170,3 +1179,92 @@ static int vmgexit_psc(struct ghcb *ghcb, struct snp_psc_desc *desc) out: return ret; } + +static enum es_result vc_check_opcode_bytes(struct es_em_ctxt *ctxt, + unsigned long exit_code) +{ + unsigned int opcode = (unsigned int)ctxt->insn.opcode.value; + u8 modrm = ctxt->insn.modrm.value; + + switch (exit_code) { + + case SVM_EXIT_IOIO: + case SVM_EXIT_NPF: + /* handled separately */ + return ES_OK; + + case SVM_EXIT_CPUID: + if (opcode == 0xa20f) + return ES_OK; + break; + + case SVM_EXIT_INVD: + if (opcode == 0x080f) + return ES_OK; + break; + + case SVM_EXIT_MONITOR: + if (opcode == 0x010f && modrm == 0xc8) + return ES_OK; + break; + + case SVM_EXIT_MWAIT: + if (opcode == 0x010f && modrm == 0xc9) + return ES_OK; + break; + + case SVM_EXIT_MSR: + /* RDMSR */ + if (opcode == 0x320f || + /* WRMSR */ + opcode == 0x300f) + return ES_OK; + break; + + case SVM_EXIT_RDPMC: + if (opcode == 0x330f) + return ES_OK; + break; + + case SVM_EXIT_RDTSC: + if (opcode == 0x310f) + return ES_OK; + break; + + case SVM_EXIT_RDTSCP: + if (opcode == 0x010f && modrm == 0xf9) + return ES_OK; + break; + + case SVM_EXIT_READ_DR7: + if (opcode == 0x210f && + X86_MODRM_REG(ctxt->insn.modrm.value) == 7) + return ES_OK; + break; + + case SVM_EXIT_VMMCALL: + if (opcode == 0x010f && modrm == 0xd9) + return ES_OK; + + break; + + case SVM_EXIT_WRITE_DR7: + if (opcode == 0x230f && + X86_MODRM_REG(ctxt->insn.modrm.value) == 7) + return ES_OK; + break; + + case SVM_EXIT_WBINVD: + if (opcode == 0x90f) + return ES_OK; + break; + + default: + break; + } + + sev_printk(KERN_ERR "Wrong/unhandled opcode bytes: 0x%x, exit_code: 0x%lx, rIP: 0x%lx\n", + opcode, exit_code, ctxt->regs->ip); + + return ES_UNSUPPORTED; +} diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index c67285824e82..7d242898852f 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -59,6 +59,25 @@ #define AP_INIT_CR0_DEFAULT 0x60000010 #define AP_INIT_MXCSR_DEFAULT 0x1f80 +static const char * const sev_status_feat_names[] = { + [MSR_AMD64_SEV_ENABLED_BIT] = "SEV", + [MSR_AMD64_SEV_ES_ENABLED_BIT] = "SEV-ES", + [MSR_AMD64_SEV_SNP_ENABLED_BIT] = "SEV-SNP", + [MSR_AMD64_SNP_VTOM_BIT] = "vTom", + [MSR_AMD64_SNP_REFLECT_VC_BIT] = "ReflectVC", + [MSR_AMD64_SNP_RESTRICTED_INJ_BIT] = "RI", + [MSR_AMD64_SNP_ALT_INJ_BIT] = "AI", + [MSR_AMD64_SNP_DEBUG_SWAP_BIT] = "DebugSwap", + [MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT] = "NoHostIBS", + [MSR_AMD64_SNP_BTB_ISOLATION_BIT] = "BTBIsol", + [MSR_AMD64_SNP_VMPL_SSS_BIT] = "VmplSSS", + [MSR_AMD64_SNP_SECURE_TSC_BIT] = "SecureTSC", + [MSR_AMD64_SNP_VMGEXIT_PARAM_BIT] = "VMGExitParam", + [MSR_AMD64_SNP_IBS_VIRT_BIT] = "IBSVirt", + [MSR_AMD64_SNP_VMSA_REG_PROT_BIT] = "VMSARegProt", + [MSR_AMD64_SNP_SMT_PROT_BIT] = "SMTProt", +}; + /* For early boot hypervisor communication in SEV-ES enabled guests */ static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); @@ -748,7 +767,7 @@ void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long padd * This eliminates worries about jump tables or checking boot_cpu_data * in the cc_platform_has() function. */ - if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) + if (!(RIP_REL_REF(sev_status) & MSR_AMD64_SEV_SNP_ENABLED)) return; /* @@ -767,7 +786,7 @@ void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr * This eliminates worries about jump tables or checking boot_cpu_data * in the cc_platform_has() function. */ - if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) + if (!(RIP_REL_REF(sev_status) & MSR_AMD64_SEV_SNP_ENABLED)) return; /* Ask hypervisor to mark the memory pages shared in the RMP table. */ @@ -1752,7 +1771,10 @@ static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt, struct ghcb *ghcb, unsigned long exit_code) { - enum es_result result; + enum es_result result = vc_check_opcode_bytes(ctxt, exit_code); + + if (result != ES_OK) + return result; switch (exit_code) { case SVM_EXIT_READ_DR7: @@ -2262,3 +2284,29 @@ static int __init snp_init_platform_device(void) return 0; } device_initcall(snp_init_platform_device); + +void kdump_sev_callback(void) +{ + /* + * Do wbinvd() on remote CPUs when SNP is enabled in order to + * safely do SNP_SHUTDOWN on the local CPU. + */ + if (cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + wbinvd(); +} + +void sev_show_status(void) +{ + int i; + + pr_info("Status: "); + for (i = 0; i < MSR_AMD64_SNP_RESV_BIT; i++) { + if (sev_status & BIT_ULL(i)) { + if (!sev_status_feat_names[i]) + continue; + + pr_cont("%s ", sev_status_feat_names[i]); + } + } + pr_cont("\n"); +} diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3f57ce68a3f1..9c1e1219c28f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -125,25 +125,6 @@ struct mwait_cpu_dead { */ static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead); -/* Logical package management. */ -struct logical_maps { - u32 phys_pkg_id; - u32 phys_die_id; - u32 logical_pkg_id; - u32 logical_die_id; -}; - -/* Temporary workaround until the full topology mechanics is in place */ -static DEFINE_PER_CPU_READ_MOSTLY(struct logical_maps, logical_maps) = { - .phys_pkg_id = U32_MAX, - .phys_die_id = U32_MAX, -}; - -unsigned int __max_logical_packages __read_mostly; -EXPORT_SYMBOL(__max_logical_packages); -static unsigned int logical_packages __read_mostly; -static unsigned int logical_die __read_mostly; - /* Maximum number of SMT threads on any online core */ int __read_mostly __max_smt_threads = 1; @@ -336,103 +317,11 @@ static void notrace start_secondary(void *unused) cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } -/** - * topology_phys_to_logical_pkg - Map a physical package id to a logical - * @phys_pkg: The physical package id to map - * - * Returns logical package id or -1 if not found - */ -int topology_phys_to_logical_pkg(unsigned int phys_pkg) -{ - int cpu; - - for_each_possible_cpu(cpu) { - if (per_cpu(logical_maps.phys_pkg_id, cpu) == phys_pkg) - return per_cpu(logical_maps.logical_pkg_id, cpu); - } - return -1; -} -EXPORT_SYMBOL(topology_phys_to_logical_pkg); - -/** - * topology_phys_to_logical_die - Map a physical die id to logical - * @die_id: The physical die id to map - * @cur_cpu: The CPU for which the mapping is done - * - * Returns logical die id or -1 if not found - */ -static int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu) -{ - int cpu, proc_id = cpu_data(cur_cpu).topo.pkg_id; - - for_each_possible_cpu(cpu) { - if (per_cpu(logical_maps.phys_pkg_id, cpu) == proc_id && - per_cpu(logical_maps.phys_die_id, cpu) == die_id) - return per_cpu(logical_maps.logical_die_id, cpu); - } - return -1; -} - -/** - * topology_update_package_map - Update the physical to logical package map - * @pkg: The physical package id as retrieved via CPUID - * @cpu: The cpu for which this is updated - */ -int topology_update_package_map(unsigned int pkg, unsigned int cpu) -{ - int new; - - /* Already available somewhere? */ - new = topology_phys_to_logical_pkg(pkg); - if (new >= 0) - goto found; - - new = logical_packages++; - if (new != pkg) { - pr_info("CPU %u Converting physical %u to logical package %u\n", - cpu, pkg, new); - } -found: - per_cpu(logical_maps.phys_pkg_id, cpu) = pkg; - per_cpu(logical_maps.logical_pkg_id, cpu) = new; - cpu_data(cpu).topo.logical_pkg_id = new; - return 0; -} -/** - * topology_update_die_map - Update the physical to logical die map - * @die: The die id as retrieved via CPUID - * @cpu: The cpu for which this is updated - */ -int topology_update_die_map(unsigned int die, unsigned int cpu) -{ - int new; - - /* Already available somewhere? */ - new = topology_phys_to_logical_die(die, cpu); - if (new >= 0) - goto found; - - new = logical_die++; - if (new != die) { - pr_info("CPU %u Converting physical %u to logical die %u\n", - cpu, die, new); - } -found: - per_cpu(logical_maps.phys_die_id, cpu) = die; - per_cpu(logical_maps.logical_die_id, cpu) = new; - cpu_data(cpu).topo.logical_die_id = new; - return 0; -} - static void __init smp_store_boot_cpu_info(void) { - int id = 0; /* CPU 0 */ - struct cpuinfo_x86 *c = &cpu_data(id); + struct cpuinfo_x86 *c = &cpu_data(0); *c = boot_cpu_data; - c->cpu_index = id; - topology_update_package_map(c->topo.pkg_id, id); - topology_update_die_map(c->topo.die_id, id); c->initialized = true; } @@ -488,6 +377,7 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) if (c->topo.pkg_id == o->topo.pkg_id && c->topo.die_id == o->topo.die_id && + c->topo.amd_node_id == o->topo.amd_node_id && per_cpu_llc_id(cpu1) == per_cpu_llc_id(cpu2)) { if (c->topo.core_id == o->topo.core_id) return topology_sane(c, o, "smt"); @@ -509,10 +399,13 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { - if (c->topo.pkg_id == o->topo.pkg_id && - c->topo.die_id == o->topo.die_id) - return true; - return false; + if (c->topo.pkg_id != o->topo.pkg_id || c->topo.die_id != o->topo.die_id) + return false; + + if (cpu_feature_enabled(X86_FEATURE_TOPOEXT) && topology_amd_nodes_per_pkg() > 1) + return c->topo.amd_node_id == o->topo.amd_node_id; + + return true; } static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) @@ -670,8 +563,8 @@ static void __init build_sched_topology(void) void set_cpu_sibling_map(int cpu) { - bool has_smt = smp_num_siblings > 1; - bool has_mp = has_smt || boot_cpu_data.x86_max_cores > 1; + bool has_smt = __max_threads_per_core > 1; + bool has_mp = has_smt || topology_num_cores_per_package() > 1; struct cpuinfo_x86 *c = &cpu_data(cpu); struct cpuinfo_x86 *o; int i, threads; @@ -1068,9 +961,13 @@ int native_kick_ap(unsigned int cpu, struct task_struct *tidle) pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); - if (apicid == BAD_APICID || !physid_isset(apicid, phys_cpu_present_map) || - !apic_id_valid(apicid)) { - pr_err("%s: bad cpu %d\n", __func__, cpu); + if (apicid == BAD_APICID || !apic_id_valid(apicid)) { + pr_err("CPU %u has invalid APIC ID %x. Aborting bringup\n", cpu, apicid); + return -EINVAL; + } + + if (!test_bit(apicid, phys_cpu_present_map)) { + pr_err("CPU %u APIC ID %x is not present. Aborting bringup\n", cpu, apicid); return -EINVAL; } @@ -1139,14 +1036,8 @@ static __init void disable_smp(void) pr_info("SMP disabled\n"); disable_ioapic_support(); + topology_reset_possible_cpus_up(); - init_cpu_present(cpumask_of(0)); - init_cpu_possible(cpumask_of(0)); - - if (smp_found_config) - physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); - else - physid_set_mask_of_physid(0, &phys_cpu_present_map); cpumask_set_cpu(0, topology_sibling_cpumask(0)); cpumask_set_cpu(0, topology_core_cpumask(0)); cpumask_set_cpu(0, topology_die_cpumask(0)); @@ -1265,102 +1156,16 @@ void __init native_smp_prepare_boot_cpu(void) native_pv_lock_init(); } -void __init calculate_max_logical_packages(void) -{ - int ncpus; - - /* - * Today neither Intel nor AMD support heterogeneous systems so - * extrapolate the boot cpu's data to all packages. - */ - ncpus = cpu_data(0).booted_cores * topology_max_smt_threads(); - __max_logical_packages = DIV_ROUND_UP(total_cpus, ncpus); - pr_info("Max logical packages: %u\n", __max_logical_packages); -} - void __init native_smp_cpus_done(unsigned int max_cpus) { pr_debug("Boot done\n"); - calculate_max_logical_packages(); build_sched_topology(); nmi_selftest(); impress_friends(); cache_aps_init(); } -static int __initdata setup_possible_cpus = -1; -static int __init _setup_possible_cpus(char *str) -{ - get_option(&str, &setup_possible_cpus); - return 0; -} -early_param("possible_cpus", _setup_possible_cpus); - - -/* - * cpu_possible_mask should be static, it cannot change as cpu's - * are onlined, or offlined. The reason is per-cpu data-structures - * are allocated by some modules at init time, and don't expect to - * do this dynamically on cpu arrival/departure. - * cpu_present_mask on the other hand can change dynamically. - * In case when cpu_hotplug is not compiled, then we resort to current - * behaviour, which is cpu_possible == cpu_present. - * - Ashok Raj - * - * Three ways to find out the number of additional hotplug CPUs: - * - If the BIOS specified disabled CPUs in ACPI/mptables use that. - * - The user can overwrite it with possible_cpus=NUM - * - Otherwise don't reserve additional CPUs. - * We do this because additional CPUs waste a lot of memory. - * -AK - */ -__init void prefill_possible_map(void) -{ - int i, possible; - - i = setup_max_cpus ?: 1; - if (setup_possible_cpus == -1) { - possible = num_processors; -#ifdef CONFIG_HOTPLUG_CPU - if (setup_max_cpus) - possible += disabled_cpus; -#else - if (possible > i) - possible = i; -#endif - } else - possible = setup_possible_cpus; - - total_cpus = max_t(int, possible, num_processors + disabled_cpus); - - /* nr_cpu_ids could be reduced via nr_cpus= */ - if (possible > nr_cpu_ids) { - pr_warn("%d Processors exceeds NR_CPUS limit of %u\n", - possible, nr_cpu_ids); - possible = nr_cpu_ids; - } - -#ifdef CONFIG_HOTPLUG_CPU - if (!setup_max_cpus) -#endif - if (possible > i) { - pr_warn("%d Processors exceeds max_cpus limit of %u\n", - possible, setup_max_cpus); - possible = i; - } - - set_nr_cpu_ids(possible); - - pr_info("Allowing %d CPUs, %d hotplug CPUs\n", - possible, max_t(int, possible - num_processors, 0)); - - reset_cpu_possible_mask(); - - for (i = 0; i < possible; i++) - set_cpu_possible(i, true); -} - /* correctly size the local cpu masks */ void __init setup_cpu_local_masks(void) { diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index c3b2f863acf0..6cb31df3d5ff 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -51,6 +51,7 @@ #include <asm/ftrace.h> #include <asm/traps.h> #include <asm/desc.h> +#include <asm/fred.h> #include <asm/fpu/api.h> #include <asm/cpu.h> #include <asm/cpu_entry_area.h> @@ -935,8 +936,7 @@ static bool notify_debug(struct pt_regs *regs, unsigned long *dr6) return false; } -static __always_inline void exc_debug_kernel(struct pt_regs *regs, - unsigned long dr6) +static noinstr void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6) { /* * Disable breakpoints during exception handling; recursive exceptions @@ -948,6 +948,11 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, * * Entry text is excluded for HW_BP_X and cpu_entry_area, which * includes the entry stack is excluded for everything. + * + * For FRED, nested #DB should just work fine. But when a watchpoint or + * breakpoint is set in the code path which is executed by #DB handler, + * it results in an endless recursion and stack overflow. Thus we stay + * with the IDT approach, i.e., save DR7 and disable #DB. */ unsigned long dr7 = local_db_save(); irqentry_state_t irq_state = irqentry_nmi_enter(regs); @@ -977,7 +982,8 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, * Catch SYSENTER with TF set and clear DR_STEP. If this hit a * watchpoint at the same time then that will still be handled. */ - if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs)) + if (!cpu_feature_enabled(X86_FEATURE_FRED) && + (dr6 & DR_STEP) && is_sysenter_singlestep(regs)) dr6 &= ~DR_STEP; /* @@ -1009,8 +1015,7 @@ out: local_db_restore(dr7); } -static __always_inline void exc_debug_user(struct pt_regs *regs, - unsigned long dr6) +static noinstr void exc_debug_user(struct pt_regs *regs, unsigned long dr6) { bool icebp; @@ -1094,6 +1099,34 @@ DEFINE_IDTENTRY_DEBUG_USER(exc_debug) { exc_debug_user(regs, debug_read_clear_dr6()); } + +#ifdef CONFIG_X86_FRED +/* + * When occurred on different ring level, i.e., from user or kernel + * context, #DB needs to be handled on different stack: User #DB on + * current task stack, while kernel #DB on a dedicated stack. + * + * This is exactly how FRED event delivery invokes an exception + * handler: ring 3 event on level 0 stack, i.e., current task stack; + * ring 0 event on the #DB dedicated stack specified in the + * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED debug exception + * entry stub doesn't do stack switch. + */ +DEFINE_FREDENTRY_DEBUG(exc_debug) +{ + /* + * FRED #DB stores DR6 on the stack in the format which + * debug_read_clear_dr6() returns for the IDT entry points. + */ + unsigned long dr6 = fred_event_data(regs); + + if (user_mode(regs)) + exc_debug_user(regs, dr6); + else + exc_debug_kernel(regs, dr6); +} +#endif /* CONFIG_X86_FRED */ + #else /* 32 bit does not have separate entry points. */ DEFINE_IDTENTRY_RAW(exc_debug) @@ -1369,8 +1402,34 @@ DEFINE_IDTENTRY_SW(iret_error) } #endif +/* Do not enable FRED by default yet. */ +static bool enable_fred __ro_after_init = false; + +#ifdef CONFIG_X86_FRED +static int __init fred_setup(char *str) +{ + if (!str) + return -EINVAL; + + if (!cpu_feature_enabled(X86_FEATURE_FRED)) + return 0; + + if (!strcmp(str, "on")) + enable_fred = true; + else if (!strcmp(str, "off")) + enable_fred = false; + else + pr_warn("invalid FRED option: 'fred=%s'\n", str); + return 0; +} +early_param("fred", fred_setup); +#endif + void __init trap_init(void) { + if (cpu_feature_enabled(X86_FEATURE_FRED) && !enable_fred) + setup_clear_cpu_cap(X86_FEATURE_FRED); + /* Init cpu_entry_area before IST entries are set up */ setup_cpu_entry_areas(); @@ -1379,7 +1438,10 @@ void __init trap_init(void) /* Initialize TSS before setting up traps so ISTs work */ cpu_init_exception_handling(); + /* Setup traps as cpu_init() might #GP */ - idt_setup_traps(); + if (!cpu_feature_enabled(X86_FEATURE_FRED)) + idt_setup_traps(); + cpu_init(); } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 15f97c0abc9d..5a69a49acc96 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -53,7 +53,7 @@ static int __read_mostly tsc_force_recalibrate; static u32 art_to_tsc_numerator; static u32 art_to_tsc_denominator; static u64 art_to_tsc_offset; -static struct clocksource *art_related_clocksource; +static bool have_art; struct cyc2ns { struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */ @@ -652,7 +652,7 @@ success: } /** - * native_calibrate_tsc + * native_calibrate_tsc - determine TSC frequency * Determine TSC frequency via CPUID, else return 0. */ unsigned long native_calibrate_tsc(void) @@ -1168,6 +1168,7 @@ static struct clocksource clocksource_tsc_early = { .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, + .id = CSID_X86_TSC_EARLY, .vdso_clock_mode = VDSO_CLOCKMODE_TSC, .enable = tsc_cs_enable, .resume = tsc_resume, @@ -1190,6 +1191,7 @@ static struct clocksource clocksource_tsc = { CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_MUST_VERIFY | CLOCK_SOURCE_VERIFY_PERCPU, + .id = CSID_X86_TSC, .vdso_clock_mode = VDSO_CLOCKMODE_TSC, .enable = tsc_cs_enable, .resume = tsc_resume, @@ -1309,8 +1311,10 @@ struct system_counterval_t convert_art_to_tsc(u64 art) do_div(tmp, art_to_tsc_denominator); res += tmp + art_to_tsc_offset; - return (struct system_counterval_t) {.cs = art_related_clocksource, - .cycles = res}; + return (struct system_counterval_t) { + .cs_id = have_art ? CSID_X86_TSC : CSID_GENERIC, + .cycles = res, + }; } EXPORT_SYMBOL(convert_art_to_tsc); @@ -1327,12 +1331,10 @@ EXPORT_SYMBOL(convert_art_to_tsc); * that this flag is set before conversion to TSC is attempted. * * Return: - * struct system_counterval_t - system counter value with the pointer to the - * corresponding clocksource - * @cycles: System counter value - * @cs: Clocksource corresponding to system counter value. Used - * by timekeeping code to verify comparability of two cycle - * values. + * struct system_counterval_t - system counter value with the ID of the + * corresponding clocksource: + * cycles: System counter value + * cs_id: The clocksource ID for validating comparability */ struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns) @@ -1347,8 +1349,10 @@ struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns) do_div(tmp, USEC_PER_SEC); res += tmp; - return (struct system_counterval_t) { .cs = art_related_clocksource, - .cycles = res}; + return (struct system_counterval_t) { + .cs_id = have_art ? CSID_X86_TSC : CSID_GENERIC, + .cycles = res, + }; } EXPORT_SYMBOL(convert_art_ns_to_tsc); @@ -1357,7 +1361,7 @@ static void tsc_refine_calibration_work(struct work_struct *work); static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); /** * tsc_refine_calibration_work - Further refine tsc freq calibration - * @work - ignored. + * @work: ignored. * * This functions uses delayed work over a period of a * second to further refine the TSC freq value. Since this is @@ -1455,7 +1459,7 @@ out: goto unreg; if (boot_cpu_has(X86_FEATURE_ART)) - art_related_clocksource = &clocksource_tsc; + have_art = true; clocksource_register_khz(&clocksource_tsc, tsc_khz); unreg: clocksource_unregister(&clocksource_tsc_early); @@ -1481,7 +1485,7 @@ static int __init init_tsc_clocksource(void) */ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { if (boot_cpu_has(X86_FEATURE_ART)) - art_related_clocksource = &clocksource_tsc; + have_art = true; clocksource_register_khz(&clocksource_tsc, tsc_khz); clocksource_unregister(&clocksource_tsc_early); diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index d3fc01770558..73511332bb67 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -127,25 +127,12 @@ static void __init vsmp_cap_cpus(void) #endif } -static u32 apicid_phys_pkg_id(u32 initial_apic_id, int index_msb) -{ - return read_apic_id() >> index_msb; -} - -static void vsmp_apic_post_init(void) -{ - /* need to update phys_pkg_id */ - apic->phys_pkg_id = apicid_phys_pkg_id; -} - void __init vsmp_init(void) { detect_vsmp_box(); if (!is_vsmp_box()) return; - x86_platform.apic_post_init = vsmp_apic_post_init; - vsmp_cap_cpus(); set_vsmp_ctl(); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index a37ebd3b4773..a42830dc151b 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -70,8 +70,9 @@ struct x86_init_ops x86_init __initdata = { .mpparse = { .setup_ioapic_ids = x86_init_noop, - .find_smp_config = default_find_smp_config, - .get_smp_config = default_get_smp_config, + .find_mptable = mpparse_find_mptable, + .early_parse_smp_cfg = mpparse_parse_early_smp_config, + .parse_smp_cfg = mpparse_parse_smp_config, }, .irqs = { diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 87e3da7b0439..65ed14b6540b 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -80,9 +80,10 @@ config KVM_SW_PROTECTED_VM depends on KVM && X86_64 select KVM_GENERIC_PRIVATE_MEM help - Enable support for KVM software-protected VMs. Currently "protected" - means the VM can be backed with memory provided by - KVM_CREATE_GUEST_MEMFD. + Enable support for KVM software-protected VMs. Currently, software- + protected VMs are purely a development and testing vehicle for + KVM_CREATE_GUEST_MEMFD. Attempting to run a "real" VM workload as a + software-protected VM will fail miserably. If unsure, say "N". diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 4943f6b2bbee..8a47f8541eab 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1322,6 +1322,56 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) return false; } +#define KVM_HV_WIN2016_GUEST_ID 0x1040a00003839 +#define KVM_HV_WIN2016_GUEST_ID_MASK (~GENMASK_ULL(23, 16)) /* mask out the service version */ + +/* + * Hyper-V enabled Windows Server 2016 SMP VMs fail to boot in !XSAVES && XSAVEC + * configuration. + * Such configuration can result from, for example, AMD Erratum 1386 workaround. + * + * Print a notice so users aren't left wondering what's suddenly gone wrong. + */ +static void __kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_hv *hv = to_kvm_hv(kvm); + + /* Check again under the hv_lock. */ + if (hv->xsaves_xsavec_checked) + return; + + if ((hv->hv_guest_os_id & KVM_HV_WIN2016_GUEST_ID_MASK) != + KVM_HV_WIN2016_GUEST_ID) + return; + + hv->xsaves_xsavec_checked = true; + + /* UP configurations aren't affected */ + if (atomic_read(&kvm->online_vcpus) < 2) + return; + + if (guest_cpuid_has(vcpu, X86_FEATURE_XSAVES) || + !guest_cpuid_has(vcpu, X86_FEATURE_XSAVEC)) + return; + + pr_notice_ratelimited("Booting SMP Windows KVM VM with !XSAVES && XSAVEC. " + "If it fails to boot try disabling XSAVEC in the VM config.\n"); +} + +void kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu) +{ + struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); + + if (!vcpu->arch.hyperv_enabled || + hv->xsaves_xsavec_checked) + return; + + mutex_lock(&hv->hv_lock); + __kvm_hv_xsaves_xsavec_maybe_warn(vcpu); + mutex_unlock(&hv->hv_lock); +} + static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 1dc0b6604526..923e64903da9 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -182,6 +182,8 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, struct pvclock_vcpu_time_info *hv_clock); void kvm_hv_request_tsc_page_update(struct kvm *kvm); +void kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu); + void kvm_hv_init_vm(struct kvm *kvm); void kvm_hv_destroy_vm(struct kvm *kvm); int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu); @@ -267,6 +269,7 @@ int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu); static inline void kvm_hv_setup_tsc_page(struct kvm *kvm, struct pvclock_vcpu_time_info *hv_clock) {} static inline void kvm_hv_request_tsc_page_update(struct kvm *kvm) {} +static inline void kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu) {} static inline void kvm_hv_init_vm(struct kvm *kvm) {} static inline void kvm_hv_destroy_vm(struct kvm *kvm) {} static inline int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3242f3da2457..1edf93ee3395 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2815,7 +2815,10 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) vcpu->arch.apic = apic; - apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (kvm_x86_ops.alloc_apic_backing_page) + apic->regs = static_call(kvm_x86_alloc_apic_backing_page)(vcpu); + else + apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!apic->regs) { printk(KERN_ERR "malloc apic regs error for vcpu %x\n", vcpu->vcpu_id); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2d6cdeab1f8a..0544700ca50b 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4405,6 +4405,31 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq; smp_rmb(); + /* + * Check for a relevant mmu_notifier invalidation event before getting + * the pfn from the primary MMU, and before acquiring mmu_lock. + * + * For mmu_lock, if there is an in-progress invalidation and the kernel + * allows preemption, the invalidation task may drop mmu_lock and yield + * in response to mmu_lock being contended, which is *very* counter- + * productive as this vCPU can't actually make forward progress until + * the invalidation completes. + * + * Retrying now can also avoid unnessary lock contention in the primary + * MMU, as the primary MMU doesn't necessarily hold a single lock for + * the duration of the invalidation, i.e. faulting in a conflicting pfn + * can cause the invalidation to take longer by holding locks that are + * needed to complete the invalidation. + * + * Do the pre-check even for non-preemtible kernels, i.e. even if KVM + * will never yield mmu_lock in response to contention, as this vCPU is + * *guaranteed* to need to retry, i.e. waiting until mmu_lock is held + * to detect retry guarantees the worst case latency for the vCPU. + */ + if (fault->slot && + mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) + return RET_PF_RETRY; + ret = __kvm_faultin_pfn(vcpu, fault); if (ret != RET_PF_CONTINUE) return ret; @@ -4415,6 +4440,18 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (unlikely(!fault->slot)) return kvm_handle_noslot_fault(vcpu, fault, access); + /* + * Check again for a relevant mmu_notifier invalidation event purely to + * avoid contending mmu_lock. Most invalidations will be detected by + * the previous check, but checking is extremely cheap relative to the + * overall cost of failing to detect the invalidation until after + * mmu_lock is acquired. + */ + if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) { + kvm_release_pfn_clean(fault->pfn); + return RET_PF_RETRY; + } + return RET_PF_CONTINUE; } @@ -4442,6 +4479,11 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu)) return true; + /* + * Check for a relevant mmu_notifier invalidation event one last time + * now that mmu_lock is held, as the "unsafe" checks performed without + * holding mmu_lock can get false negatives. + */ return fault->slot && mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn); } diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index dee62362a360..55b9a6d96bcf 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1181,7 +1181,7 @@ int svm_allocate_nested(struct vcpu_svm *svm) if (svm->nested.initialized) return 0; - vmcb02_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + vmcb02_page = snp_safe_alloc_page(&svm->vcpu); if (!vmcb02_page) return -ENOMEM; svm->nested.vmcb02.ptr = page_address(vmcb02_page); diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index f760106c31f8..ae0ac12382b9 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -57,7 +57,7 @@ static bool sev_es_enabled = true; module_param_named(sev_es, sev_es_enabled, bool, 0444); /* enable/disable SEV-ES DebugSwap support */ -static bool sev_es_debug_swap_enabled = true; +static bool sev_es_debug_swap_enabled = false; module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444); #else #define sev_enabled false @@ -246,6 +246,7 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_platform_init_args init_args = {0}; int asid, ret; if (kvm->created_vcpus) @@ -262,7 +263,8 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) goto e_no_asid; sev->asid = asid; - ret = sev_platform_init(&argp->error); + init_args.probe = false; + ret = sev_platform_init(&init_args); if (ret) goto e_free; @@ -274,6 +276,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) return 0; e_free: + argp->error = init_args.error; sev_asid_free(sev); sev->asid = 0; e_no_asid: @@ -612,8 +615,11 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) save->xss = svm->vcpu.arch.ia32_xss; save->dr6 = svm->vcpu.arch.dr6; - if (sev_es_debug_swap_enabled) + if (sev_es_debug_swap_enabled) { save->sev_features |= SVM_SEV_FEAT_DEBUG_SWAP; + pr_warn_once("Enabling DebugSwap with KVM_SEV_ES_INIT. " + "This will not work starting with Linux 6.10\n"); + } pr_debug("Virtual Machine Save Area (VMSA):\n"); print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false); @@ -1975,20 +1981,22 @@ int sev_mem_enc_register_region(struct kvm *kvm, goto e_free; } - region->uaddr = range->addr; - region->size = range->size; - - list_add_tail(®ion->list, &sev->regions_list); - mutex_unlock(&kvm->lock); - /* * The guest may change the memory encryption attribute from C=0 -> C=1 * or vice versa for this memory range. Lets make sure caches are * flushed to ensure that guest data gets written into memory with - * correct C-bit. + * correct C-bit. Note, this must be done before dropping kvm->lock, + * as region and its array of pages can be freed by a different task + * once kvm->lock is released. */ sev_clflush_pages(region->pages, region->npages); + region->uaddr = range->addr; + region->size = range->size; + + list_add_tail(®ion->list, &sev->regions_list); + mutex_unlock(&kvm->lock); + return ret; e_free: @@ -3160,3 +3168,35 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1); } + +struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) +{ + unsigned long pfn; + struct page *p; + + if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + + /* + * Allocate an SNP-safe page to workaround the SNP erratum where + * the CPU will incorrectly signal an RMP violation #PF if a + * hugepage (2MB or 1GB) collides with the RMP entry of a + * 2MB-aligned VMCB, VMSA, or AVIC backing page. + * + * Allocate one extra page, choose a page which is not + * 2MB-aligned, and free the other. + */ + p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1); + if (!p) + return NULL; + + split_page(p, 1); + + pfn = page_to_pfn(p); + if (IS_ALIGNED(pfn, PTRS_PER_PMD)) + __free_page(p++); + else + __free_page(p + 1); + + return p; +} diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e90b429c84f1..8284105bf704 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -703,7 +703,7 @@ static int svm_cpu_init(int cpu) int ret = -ENOMEM; memset(sd, 0, sizeof(struct svm_cpu_data)); - sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO); + sd->save_area = snp_safe_alloc_page(NULL); if (!sd->save_area) return ret; @@ -1421,7 +1421,7 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) svm = to_svm(vcpu); err = -ENOMEM; - vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + vmcb01_page = snp_safe_alloc_page(vcpu); if (!vmcb01_page) goto out; @@ -1430,7 +1430,7 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) * SEV-ES guests require a separate VMSA page used to contain * the encrypted register state of the guest. */ - vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + vmsa_page = snp_safe_alloc_page(vcpu); if (!vmsa_page) goto error_free_vmcb_page; @@ -4900,6 +4900,16 @@ static int svm_vm_init(struct kvm *kvm) return 0; } +static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu) +{ + struct page *page = snp_safe_alloc_page(vcpu); + + if (!page) + return NULL; + + return page_address(page); +} + static struct kvm_x86_ops svm_x86_ops __initdata = { .name = KBUILD_MODNAME, @@ -5031,6 +5041,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons, + .alloc_apic_backing_page = svm_alloc_apic_backing_page, }; /* diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 8ef95139cd24..7f1fbd874c45 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -694,6 +694,7 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm); void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa); void sev_es_unmap_ghcb(struct vcpu_svm *svm); +struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu); /* vmenter.S */ diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h index 36c8af87a707..4e725854c63a 100644 --- a/arch/x86/kvm/svm/svm_ops.h +++ b/arch/x86/kvm/svm/svm_ops.h @@ -8,7 +8,7 @@ #define svm_asm(insn, clobber...) \ do { \ - asm_volatile_goto("1: " __stringify(insn) "\n\t" \ + asm goto("1: " __stringify(insn) "\n\t" \ _ASM_EXTABLE(1b, %l[fault]) \ ::: clobber : fault); \ return; \ @@ -18,7 +18,7 @@ fault: \ #define svm_asm1(insn, op1, clobber...) \ do { \ - asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \ + asm goto("1: " __stringify(insn) " %0\n\t" \ _ASM_EXTABLE(1b, %l[fault]) \ :: op1 : clobber : fault); \ return; \ @@ -28,7 +28,7 @@ fault: \ #define svm_asm2(insn, op1, op2, clobber...) \ do { \ - asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \ + asm goto("1: " __stringify(insn) " %1, %0\n\t" \ _ASM_EXTABLE(1b, %l[fault]) \ :: op1, op2 : clobber : fault); \ return; \ diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index a6216c874729..315c7c2ba89b 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -71,7 +71,7 @@ static int fixed_pmc_events[] = { static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) { struct kvm_pmc *pmc; - u8 old_fixed_ctr_ctrl = pmu->fixed_ctr_ctrl; + u64 old_fixed_ctr_ctrl = pmu->fixed_ctr_ctrl; int i; pmu->fixed_ctr_ctrl = data; diff --git a/arch/x86/kvm/vmx/run_flags.h b/arch/x86/kvm/vmx/run_flags.h index edc3f16cc189..6a9bfdfbb6e5 100644 --- a/arch/x86/kvm/vmx/run_flags.h +++ b/arch/x86/kvm/vmx/run_flags.h @@ -2,7 +2,10 @@ #ifndef __KVM_X86_VMX_RUN_FLAGS_H #define __KVM_X86_VMX_RUN_FLAGS_H -#define VMX_RUN_VMRESUME (1 << 0) -#define VMX_RUN_SAVE_SPEC_CTRL (1 << 1) +#define VMX_RUN_VMRESUME_SHIFT 0 +#define VMX_RUN_SAVE_SPEC_CTRL_SHIFT 1 + +#define VMX_RUN_VMRESUME BIT(VMX_RUN_VMRESUME_SHIFT) +#define VMX_RUN_SAVE_SPEC_CTRL BIT(VMX_RUN_SAVE_SPEC_CTRL_SHIFT) #endif /* __KVM_X86_VMX_RUN_FLAGS_H */ diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 906ecd001511..2bfbf758d061 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -139,7 +139,7 @@ SYM_FUNC_START(__vmx_vcpu_run) mov (%_ASM_SP), %_ASM_AX /* Check if vmlaunch or vmresume is needed */ - test $VMX_RUN_VMRESUME, %ebx + bt $VMX_RUN_VMRESUME_SHIFT, %ebx /* Load guest registers. Don't clobber flags. */ mov VCPU_RCX(%_ASM_AX), %_ASM_CX @@ -161,8 +161,11 @@ SYM_FUNC_START(__vmx_vcpu_run) /* Load guest RAX. This kills the @regs pointer! */ mov VCPU_RAX(%_ASM_AX), %_ASM_AX - /* Check EFLAGS.ZF from 'test VMX_RUN_VMRESUME' above */ - jz .Lvmlaunch + /* Clobbers EFLAGS.ZF */ + CLEAR_CPU_BUFFERS + + /* Check EFLAGS.CF from the VMX_RUN_VMRESUME bit test above. */ + jnc .Lvmlaunch /* * After a successful VMRESUME/VMLAUNCH, control flow "magically" diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e262bc2ba4e5..5b8fae9c1f82 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -38,6 +38,7 @@ #include <asm/desc.h> #include <asm/fpu/api.h> #include <asm/fpu/xstate.h> +#include <asm/fred.h> #include <asm/idtentry.h> #include <asm/io.h> #include <asm/irq_remapping.h> @@ -388,7 +389,16 @@ static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) { - vmx->disable_fb_clear = (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) && + /* + * Disable VERW's behavior of clearing CPU buffers for the guest if the + * CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled + * the mitigation. Disabling the clearing behavior provides a + * performance boost for guests that aren't aware that manually clearing + * CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry + * and VM-Exit. + */ + vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) && + (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) && !boot_cpu_has_bug(X86_BUG_MDS) && !boot_cpu_has_bug(X86_BUG_TAA); @@ -738,7 +748,7 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, */ static int kvm_cpu_vmxoff(void) { - asm_volatile_goto("1: vmxoff\n\t" + asm goto("1: vmxoff\n\t" _ASM_EXTABLE(1b, %l[fault]) ::: "cc", "memory" : fault); @@ -2784,7 +2794,7 @@ static int kvm_cpu_vmxon(u64 vmxon_pointer) cr4_set_bits(X86_CR4_VMXE); - asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t" + asm goto("1: vmxon %[vmxon_pointer]\n\t" _ASM_EXTABLE(1b, %l[fault]) : : [vmxon_pointer] "m"(vmxon_pointer) : : fault); @@ -6960,14 +6970,16 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) { u32 intr_info = vmx_get_intr_info(vcpu); unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; - gate_desc *desc = (gate_desc *)host_idt_base + vector; if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, "unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ); - vmx_do_interrupt_irqoff(gate_offset(desc)); + if (cpu_feature_enabled(X86_FEATURE_FRED)) + fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector); + else + vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector)); kvm_after_interrupt(vcpu); vcpu->arch.at_instruction_boundary = true; @@ -7224,11 +7236,14 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, guest_state_enter_irqoff(); - /* L1D Flush includes CPU buffer clear to mitigate MDS */ + /* + * L1D Flush includes CPU buffer clear to mitigate MDS, but VERW + * mitigation for MDS is done late in VMentry and is still + * executed in spite of L1D Flush. This is because an extra VERW + * should not matter much after the big hammer L1D Flush. + */ if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); - else if (static_branch_unlikely(&mds_user_clear)) - mds_clear_cpu_buffers(); else if (static_branch_unlikely(&mmio_stale_data_clear) && kvm_arch_has_assigned_device(vcpu->kvm)) mds_clear_cpu_buffers(); @@ -7260,7 +7275,10 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI && is_nmi(vmx_get_intr_info(vcpu))) { kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); - vmx_do_nmi_irqoff(); + if (cpu_feature_enabled(X86_FEATURE_FRED)) + fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR); + else + vmx_do_nmi_irqoff(); kvm_after_interrupt(vcpu); } diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index f41ce3c24123..8060e5fc6dbd 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -94,7 +94,7 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT - asm_volatile_goto("1: vmread %[field], %[output]\n\t" + asm_goto_output("1: vmread %[field], %[output]\n\t" "jna %l[do_fail]\n\t" _ASM_EXTABLE(1b, %l[do_exception]) @@ -188,7 +188,7 @@ static __always_inline unsigned long vmcs_readl(unsigned long field) #define vmx_asm1(insn, op1, error_args...) \ do { \ - asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \ + asm goto("1: " __stringify(insn) " %0\n\t" \ ".byte 0x2e\n\t" /* branch not taken hint */ \ "jna %l[error]\n\t" \ _ASM_EXTABLE(1b, %l[fault]) \ @@ -205,7 +205,7 @@ fault: \ #define vmx_asm2(insn, op1, op2, error_args...) \ do { \ - asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \ + asm goto("1: " __stringify(insn) " %1, %0\n\t" \ ".byte 0x2e\n\t" /* branch not taken hint */ \ "jna %l[error]\n\t" \ _ASM_EXTABLE(1b, %l[fault]) \ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 363b1c080205..e02cc710f56d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1704,22 +1704,17 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) struct kvm_msr_entry msr; int r; + /* Unconditionally clear the output for simplicity */ + msr.data = 0; msr.index = index; r = kvm_get_msr_feature(&msr); - if (r == KVM_MSR_RET_INVALID) { - /* Unconditionally clear the output for simplicity */ - *data = 0; - if (kvm_msr_ignored_check(index, 0, false)) - r = 0; - } - - if (r) - return r; + if (r == KVM_MSR_RET_INVALID && kvm_msr_ignored_check(index, 0, false)) + r = 0; *data = msr.data; - return 0; + return r; } static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) @@ -1782,6 +1777,10 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS) kvm_mmu_reset_context(vcpu); + if (!static_cpu_has(X86_FEATURE_XSAVES) && + (efer & EFER_SVME)) + kvm_hv_xsaves_xsavec_maybe_warn(vcpu); + return 0; } @@ -2507,7 +2506,7 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) } #ifdef CONFIG_X86_64 -static inline int gtod_is_based_on_tsc(int mode) +static inline bool gtod_is_based_on_tsc(int mode) { return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK; } @@ -4581,7 +4580,7 @@ static bool kvm_is_vm_type_supported(unsigned long type) { return type == KVM_X86_DEFAULT_VM || (type == KVM_X86_SW_PROTECTED_VM && - IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_enabled); + IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled); } int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) @@ -5454,7 +5453,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) { vcpu->arch.nmi_pending = 0; atomic_set(&vcpu->arch.nmi_queued, events->nmi.pending); - kvm_make_request(KVM_REQ_NMI, vcpu); + if (events->nmi.pending) + kvm_make_request(KVM_REQ_NMI, vcpu); } static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked); @@ -7016,6 +7016,9 @@ set_identity_unlock: r = -EEXIST; if (kvm->arch.vpit) goto create_pit_unlock; + r = -ENOENT; + if (!pic_in_kernel(kvm)) + goto create_pit_unlock; r = -ENOMEM; kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags); if (kvm->arch.vpit) @@ -8004,6 +8007,16 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, if (r < 0) return X86EMUL_UNHANDLEABLE; + + /* + * Mark the page dirty _before_ checking whether or not the CMPXCHG was + * successful, as the old value is written back on failure. Note, for + * live migration, this is unnecessarily conservative as CMPXCHG writes + * back the original value and the access is atomic, but KVM's ABI is + * that all writes are dirty logged, regardless of the value written. + */ + kvm_vcpu_mark_page_dirty(vcpu, gpa_to_gfn(gpa)); + if (r) return X86EMUL_CMPXCHG_FAILED; diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index 20ef350a60fb..10d5ed8b5990 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -163,23 +163,23 @@ SYM_CODE_END(__get_user_8_handle_exception) #endif /* get_user */ - _ASM_EXTABLE(1b, __get_user_handle_exception) - _ASM_EXTABLE(2b, __get_user_handle_exception) - _ASM_EXTABLE(3b, __get_user_handle_exception) + _ASM_EXTABLE_UA(1b, __get_user_handle_exception) + _ASM_EXTABLE_UA(2b, __get_user_handle_exception) + _ASM_EXTABLE_UA(3b, __get_user_handle_exception) #ifdef CONFIG_X86_64 - _ASM_EXTABLE(4b, __get_user_handle_exception) + _ASM_EXTABLE_UA(4b, __get_user_handle_exception) #else - _ASM_EXTABLE(4b, __get_user_8_handle_exception) - _ASM_EXTABLE(5b, __get_user_8_handle_exception) + _ASM_EXTABLE_UA(4b, __get_user_8_handle_exception) + _ASM_EXTABLE_UA(5b, __get_user_8_handle_exception) #endif /* __get_user */ - _ASM_EXTABLE(6b, __get_user_handle_exception) - _ASM_EXTABLE(7b, __get_user_handle_exception) - _ASM_EXTABLE(8b, __get_user_handle_exception) + _ASM_EXTABLE_UA(6b, __get_user_handle_exception) + _ASM_EXTABLE_UA(7b, __get_user_handle_exception) + _ASM_EXTABLE_UA(8b, __get_user_handle_exception) #ifdef CONFIG_X86_64 - _ASM_EXTABLE(9b, __get_user_handle_exception) + _ASM_EXTABLE_UA(9b, __get_user_handle_exception) #else - _ASM_EXTABLE(9b, __get_user_8_handle_exception) - _ASM_EXTABLE(10b, __get_user_8_handle_exception) + _ASM_EXTABLE_UA(9b, __get_user_8_handle_exception) + _ASM_EXTABLE_UA(10b, __get_user_8_handle_exception) #endif diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index 2877f5934177..975c9c18263d 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -133,15 +133,15 @@ SYM_CODE_START_LOCAL(__put_user_handle_exception) RET SYM_CODE_END(__put_user_handle_exception) - _ASM_EXTABLE(1b, __put_user_handle_exception) - _ASM_EXTABLE(2b, __put_user_handle_exception) - _ASM_EXTABLE(3b, __put_user_handle_exception) - _ASM_EXTABLE(4b, __put_user_handle_exception) - _ASM_EXTABLE(5b, __put_user_handle_exception) - _ASM_EXTABLE(6b, __put_user_handle_exception) - _ASM_EXTABLE(7b, __put_user_handle_exception) - _ASM_EXTABLE(9b, __put_user_handle_exception) + _ASM_EXTABLE_UA(1b, __put_user_handle_exception) + _ASM_EXTABLE_UA(2b, __put_user_handle_exception) + _ASM_EXTABLE_UA(3b, __put_user_handle_exception) + _ASM_EXTABLE_UA(4b, __put_user_handle_exception) + _ASM_EXTABLE_UA(5b, __put_user_handle_exception) + _ASM_EXTABLE_UA(6b, __put_user_handle_exception) + _ASM_EXTABLE_UA(7b, __put_user_handle_exception) + _ASM_EXTABLE_UA(9b, __put_user_handle_exception) #ifdef CONFIG_X86_32 - _ASM_EXTABLE(8b, __put_user_handle_exception) - _ASM_EXTABLE(10b, __put_user_handle_exception) + _ASM_EXTABLE_UA(8b, __put_user_handle_exception) + _ASM_EXTABLE_UA(10b, __put_user_handle_exception) #endif diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 5168ee0360b2..12af572201a2 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -1051,8 +1051,8 @@ GrpTable: Grp6 EndTable GrpTable: Grp7 -0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) -1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B) +0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) | WRMSRNS (110),(11B) +1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B) | ERETU (F3),(010),(11B) | ERETS (F2),(010),(11B) 2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) | ENCLU (111),(11B) 3: LIDT Ms 4: SMSW Mw/Rv diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index c80febc44cd2..6ec103bedcf1 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -16,6 +16,7 @@ KASAN_SANITIZE_pgprot.o := n KCSAN_SANITIZE := n # Avoid recursion by not calling KMSAN hooks for CEA code. KMSAN_SANITIZE_cpu_entry_area.o := n +KMSAN_SANITIZE_mem_encrypt_identity.o := n ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_mem_encrypt.o = -pg diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c index b3ca7d23e4b0..9332b36a1091 100644 --- a/arch/x86/mm/amdtopology.c +++ b/arch/x86/mm/amdtopology.c @@ -54,13 +54,11 @@ static __init int find_northbridge(void) int __init amd_numa_init(void) { - u64 start = PFN_PHYS(0); + unsigned int numnodes, cores, apicid; + u64 prevbase, start = PFN_PHYS(0); u64 end = PFN_PHYS(max_pfn); - unsigned numnodes; - u64 prevbase; - int i, j, nb; u32 nodeid, reg; - unsigned int bits, cores, apicid_base; + int i, j, nb; if (!early_pci_allowed()) return -EINVAL; @@ -158,26 +156,18 @@ int __init amd_numa_init(void) return -ENOENT; /* - * We seem to have valid NUMA configuration. Map apicids to nodes - * using the coreid bits from early_identify_cpu. + * We seem to have valid NUMA configuration. Map apicids to nodes + * using the size of the core domain in the APIC space. */ - bits = boot_cpu_data.x86_coreid_bits; - cores = 1 << bits; - apicid_base = 0; + cores = topology_get_domain_size(TOPO_CORE_DOMAIN); - /* - * get boot-time SMP configuration: - */ - early_get_smp_config(); + apicid = boot_cpu_physical_apicid; + if (apicid > 0) + pr_info("BSP APIC ID: %02x\n", apicid); - if (boot_cpu_physical_apicid > 0) { - pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); - apicid_base = boot_cpu_physical_apicid; + for_each_node_mask(i, numa_nodes_parsed) { + for (j = 0; j < cores; j++, apicid++) + set_apicid_to_node(apicid, i); } - - for_each_node_mask(i, numa_nodes_parsed) - for (j = apicid_base; j < cores + apicid_base; j++) - set_apicid_to_node((i << bits) + j, i); - return 0; } diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 271dcb2deabc..b522933bfa56 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -6,6 +6,7 @@ #include <xen/xen.h> #include <asm/fpu/api.h> +#include <asm/fred.h> #include <asm/sev.h> #include <asm/traps.h> #include <asm/kdebug.h> @@ -223,6 +224,79 @@ static bool ex_handler_ucopy_len(const struct exception_table_entry *fixup, return ex_handler_uaccess(fixup, regs, trapnr, fault_address); } +#ifdef CONFIG_X86_FRED +static bool ex_handler_eretu(const struct exception_table_entry *fixup, + struct pt_regs *regs, unsigned long error_code) +{ + struct pt_regs *uregs = (struct pt_regs *)(regs->sp - offsetof(struct pt_regs, orig_ax)); + unsigned short ss = uregs->ss; + unsigned short cs = uregs->cs; + + /* + * Move the NMI bit from the invalid stack frame, which caused ERETU + * to fault, to the fault handler's stack frame, thus to unblock NMI + * with the fault handler's ERETS instruction ASAP if NMI is blocked. + */ + regs->fred_ss.nmi = uregs->fred_ss.nmi; + + /* + * Sync event information to uregs, i.e., the ERETU return frame, but + * is it safe to write to the ERETU return frame which is just above + * current event stack frame? + * + * The RSP used by FRED to push a stack frame is not the value in %rsp, + * it is calculated from %rsp with the following 2 steps: + * 1) RSP = %rsp - (IA32_FRED_CONFIG & 0x1c0) // Reserve N*64 bytes + * 2) RSP = RSP & ~0x3f // Align to a 64-byte cache line + * when an event delivery doesn't trigger a stack level change. + * + * Here is an example with N*64 (N=1) bytes reserved: + * + * 64-byte cache line ==> ______________ + * |___Reserved___| + * |__Event_data__| + * |_____SS_______| + * |_____RSP______| + * |_____FLAGS____| + * |_____CS_______| + * |_____IP_______| + * 64-byte cache line ==> |__Error_code__| <== ERETU return frame + * |______________| + * |______________| + * |______________| + * |______________| + * |______________| + * |______________| + * |______________| + * 64-byte cache line ==> |______________| <== RSP after step 1) and 2) + * |___Reserved___| + * |__Event_data__| + * |_____SS_______| + * |_____RSP______| + * |_____FLAGS____| + * |_____CS_______| + * |_____IP_______| + * 64-byte cache line ==> |__Error_code__| <== ERETS return frame + * + * Thus a new FRED stack frame will always be pushed below a previous + * FRED stack frame ((N*64) bytes may be reserved between), and it is + * safe to write to a previous FRED stack frame as they never overlap. + */ + fred_info(uregs)->edata = fred_event_data(regs); + uregs->ssx = regs->ssx; + uregs->fred_ss.ss = ss; + /* The NMI bit was moved away above */ + uregs->fred_ss.nmi = 0; + uregs->csx = regs->csx; + uregs->fred_cs.sl = 0; + uregs->fred_cs.wfe = 0; + uregs->cs = cs; + uregs->orig_ax = error_code; + + return ex_handler_default(fixup, regs); +} +#endif + int ex_get_fixup_type(unsigned long ip) { const struct exception_table_entry *e = search_exception_tables(ip); @@ -300,6 +374,10 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, return ex_handler_ucopy_len(e, regs, trapnr, fault_addr, reg, imm); case EX_TYPE_ZEROPAD: return ex_handler_zeropad(e, regs, fault_addr); +#ifdef CONFIG_X86_FRED + case EX_TYPE_ERETU: + return ex_handler_eretu(e, regs, error_code); +#endif } BUG(); } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 679b09cfe241..cdb5045a0428 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -34,6 +34,8 @@ #include <asm/kvm_para.h> /* kvm_handle_async_pf */ #include <asm/vdso.h> /* fixup_vdso_exception() */ #include <asm/irq_stack.h> +#include <asm/fred.h> +#include <asm/sev.h> /* snp_dump_hva_rmpentry() */ #define CREATE_TRACE_POINTS #include <asm/trace/exceptions.h> @@ -547,6 +549,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad !(error_code & X86_PF_PROT) ? "not-present page" : (error_code & X86_PF_RSVD) ? "reserved bit violation" : (error_code & X86_PF_PK) ? "protection keys violation" : + (error_code & X86_PF_RMP) ? "RMP violation" : "permissions violation"); if (!(error_code & X86_PF_USER) && user_mode(regs)) { @@ -579,6 +582,9 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad } dump_pagetable(address); + + if (error_code & X86_PF_RMP) + snp_dump_hva_rmpentry(address); } static noinline void @@ -798,15 +804,6 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, show_opcodes(regs, loglvl); } -/* - * The (legacy) vsyscall page is the long page in the kernel portion - * of the address space that has user-accessible permissions. - */ -static bool is_vsyscall_vaddr(unsigned long vaddr) -{ - return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR); -} - static void __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, unsigned long address, u32 pkey, int si_code) @@ -1518,8 +1515,10 @@ handle_page_fault(struct pt_regs *regs, unsigned long error_code, DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) { - unsigned long address = read_cr2(); irqentry_state_t state; + unsigned long address; + + address = cpu_feature_enabled(X86_FEATURE_FRED) ? fred_event_data(regs) : read_cr2(); prefetchw(¤t->mm->mmap_lock); diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index 968d7005f4a7..f50cc210a981 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -26,18 +26,31 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, for (; addr < end; addr = next) { pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; + bool use_gbpage; next = (addr & PUD_MASK) + PUD_SIZE; if (next > end) next = end; - if (info->direct_gbpages) { - pud_t pudval; + /* if this is already a gbpage, this portion is already mapped */ + if (pud_large(*pud)) + continue; + + /* Is using a gbpage allowed? */ + use_gbpage = info->direct_gbpages; - if (pud_present(*pud)) - continue; + /* Don't use gbpage if it maps more than the requested region. */ + /* at the begining: */ + use_gbpage &= ((addr & ~PUD_MASK) == 0); + /* ... or at the end: */ + use_gbpage &= ((next & ~PUD_MASK) == 0); + + /* Never overwrite existing mappings */ + use_gbpage &= !pud_present(*pud); + + if (use_gbpage) { + pud_t pudval; - addr &= PUD_MASK; pudval = __pud((addr - info->offset) | info->page_flag); set_pud(pud, pudval); continue; diff --git a/arch/x86/mm/maccess.c b/arch/x86/mm/maccess.c index 6993f026adec..42115ac079cf 100644 --- a/arch/x86/mm/maccess.c +++ b/arch/x86/mm/maccess.c @@ -3,6 +3,8 @@ #include <linux/uaccess.h> #include <linux/kernel.h> +#include <asm/vsyscall.h> + #ifdef CONFIG_X86_64 bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) { @@ -16,6 +18,14 @@ bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) return false; /* + * Reading from the vsyscall page may cause an unhandled fault in + * certain cases. Though it is at an address above TASK_SIZE_MAX, it is + * usually considered as a user space address. + */ + if (is_vsyscall_vaddr(vaddr)) + return false; + + /* * Allow everything during early boot before 'x86_virt_bits' * is initialized. Needed for instruction decoding in early * exception handlers. diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index c290c55b632b..6f3b3e028718 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -14,6 +14,8 @@ #include <linux/mem_encrypt.h> #include <linux/virtio_anchor.h> +#include <asm/sev.h> + /* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */ bool force_dma_unencrypted(struct device *dev) { @@ -42,38 +44,45 @@ bool force_dma_unencrypted(struct device *dev) static void print_mem_encrypt_feature_info(void) { - pr_info("Memory Encryption Features active:"); - - if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { - pr_cont(" Intel TDX\n"); - return; - } + pr_info("Memory Encryption Features active: "); - pr_cont(" AMD"); + switch (cc_vendor) { + case CC_VENDOR_INTEL: + pr_cont("Intel TDX\n"); + break; + case CC_VENDOR_AMD: + pr_cont("AMD"); - /* Secure Memory Encryption */ - if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) { + /* Secure Memory Encryption */ + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) { /* * SME is mutually exclusive with any of the SEV * features below. - */ - pr_cont(" SME\n"); - return; - } + */ + pr_cont(" SME\n"); + return; + } - /* Secure Encrypted Virtualization */ - if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) - pr_cont(" SEV"); + /* Secure Encrypted Virtualization */ + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) + pr_cont(" SEV"); + + /* Encrypted Register State */ + if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) + pr_cont(" SEV-ES"); - /* Encrypted Register State */ - if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) - pr_cont(" SEV-ES"); + /* Secure Nested Paging */ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + pr_cont(" SEV-SNP"); - /* Secure Nested Paging */ - if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) - pr_cont(" SEV-SNP"); + pr_cont("\n"); - pr_cont("\n"); + sev_show_status(); + + break; + default: + pr_cont("Unknown\n"); + } } /* Architecture __weak replacement functions */ diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index d73aeb16417f..0166ab1780cc 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -97,7 +97,6 @@ static char sme_workarea[2 * PMD_SIZE] __section(".init.scratch"); static char sme_cmdline_arg[] __initdata = "mem_encrypt"; static char sme_cmdline_on[] __initdata = "on"; -static char sme_cmdline_off[] __initdata = "off"; static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd) { @@ -305,7 +304,8 @@ void __init sme_encrypt_kernel(struct boot_params *bp) * instrumentation or checking boot_cpu_data in the cc_platform_has() * function. */ - if (!sme_get_me_mask() || sev_status & MSR_AMD64_SEV_ENABLED) + if (!sme_get_me_mask() || + RIP_REL_REF(sev_status) & MSR_AMD64_SEV_ENABLED) return; /* @@ -504,10 +504,9 @@ void __init sme_encrypt_kernel(struct boot_params *bp) void __init sme_enable(struct boot_params *bp) { - const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off; + const char *cmdline_ptr, *cmdline_arg, *cmdline_on; unsigned int eax, ebx, ecx, edx; unsigned long feature_mask; - bool active_by_default; unsigned long me_mask; char buffer[16]; bool snp; @@ -543,11 +542,11 @@ void __init sme_enable(struct boot_params *bp) me_mask = 1UL << (ebx & 0x3f); /* Check the SEV MSR whether SEV or SME is enabled */ - sev_status = __rdmsr(MSR_AMD64_SEV); - feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT; + RIP_REL_REF(sev_status) = msr = __rdmsr(MSR_AMD64_SEV); + feature_mask = (msr & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT; /* The SEV-SNP CC blob should never be present unless SEV-SNP is enabled. */ - if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) + if (snp && !(msr & MSR_AMD64_SEV_SNP_ENABLED)) snp_abort(); /* Check if memory encryption is enabled */ @@ -573,7 +572,6 @@ void __init sme_enable(struct boot_params *bp) return; } else { /* SEV state cannot be controlled by a command line option */ - sme_me_mask = me_mask; goto out; } @@ -588,31 +586,17 @@ void __init sme_enable(struct boot_params *bp) asm ("lea sme_cmdline_on(%%rip), %0" : "=r" (cmdline_on) : "p" (sme_cmdline_on)); - asm ("lea sme_cmdline_off(%%rip), %0" - : "=r" (cmdline_off) - : "p" (sme_cmdline_off)); - - if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT)) - active_by_default = true; - else - active_by_default = false; cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr | ((u64)bp->ext_cmd_line_ptr << 32)); - if (cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)) < 0) + if (cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)) < 0 || + strncmp(buffer, cmdline_on, sizeof(buffer))) return; - if (!strncmp(buffer, cmdline_on, sizeof(buffer))) - sme_me_mask = me_mask; - else if (!strncmp(buffer, cmdline_off, sizeof(buffer))) - sme_me_mask = 0; - else - sme_me_mask = active_by_default ? me_mask : 0; out: - if (sme_me_mask) { - physical_mask &= ~sme_me_mask; - cc_vendor = CC_VENDOR_AMD; - cc_set_mask(sme_me_mask); - } + RIP_REL_REF(sme_me_mask) = me_mask; + physical_mask &= ~me_mask; + cc_vendor = CC_VENDOR_AMD; + cc_set_mask(me_mask); } diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index adc497b93f03..65e9a6e391c0 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -934,7 +934,7 @@ static int __init cmp_memblk(const void *a, const void *b) const struct numa_memblk *ma = *(const struct numa_memblk **)a; const struct numa_memblk *mb = *(const struct numa_memblk **)b; - return ma->start - mb->start; + return (ma->start > mb->start) - (ma->start < mb->start); } static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata; @@ -944,14 +944,12 @@ static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata; * @start: address to begin fill * @end: address to end fill * - * Find and extend numa_meminfo memblks to cover the @start-@end - * physical address range, such that the first memblk includes - * @start, the last memblk includes @end, and any gaps in between - * are filled. + * Find and extend numa_meminfo memblks to cover the physical + * address range @start-@end * * RETURNS: * 0 : Success - * NUMA_NO_MEMBLK : No memblk exists in @start-@end range + * NUMA_NO_MEMBLK : No memblks exist in address range @start-@end */ int __init numa_fill_memblks(u64 start, u64 end) @@ -963,17 +961,14 @@ int __init numa_fill_memblks(u64 start, u64 end) /* * Create a list of pointers to numa_meminfo memblks that - * overlap start, end. Exclude (start == bi->end) since - * end addresses in both a CFMWS range and a memblk range - * are exclusive. - * - * This list of pointers is used to make in-place changes - * that fill out the numa_meminfo memblks. + * overlap start, end. The list is used to make in-place + * changes that fill out the numa_meminfo memblks. */ for (int i = 0; i < mi->nr_blks; i++) { struct numa_memblk *bi = &mi->blk[i]; - if (start < bi->end && end >= bi->start) { + if (memblock_addrs_overlap(start, end - start, bi->start, + bi->end - bi->start)) { blk[count] = &mi->blk[i]; count++; } diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 0904d7e8e126..0d72183b5dd0 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -240,6 +240,8 @@ void pat_cpu_init(void) } wrmsrl(MSR_IA32_CR_PAT, pat_msr_val); + + __flush_tlb_all(); } /** @@ -296,13 +298,8 @@ void __init pat_bp_init(void) /* * Xen PV doesn't allow to set PAT MSR, but all cache modes are * supported. - * When running as TDX guest setting the PAT MSR won't work either - * due to the requirement to set CR0.CD when doing so. Rely on - * firmware to have set the PAT MSR correctly. */ - if (pat_disabled || - cpu_feature_enabled(X86_FEATURE_XENPV) || - cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { + if (pat_disabled || cpu_feature_enabled(X86_FEATURE_XENPV)) { init_cache_modes(pat_msr_val); return; } diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index e9b448d1b1b7..102880404046 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -755,10 +755,14 @@ pmd_t *lookup_pmd_address(unsigned long address) * areas on 32-bit NUMA systems. The percpu areas can * end up in this kind of memory, for instance. * - * This could be optimized, but it is only intended to be - * used at initialization time, and keeping it - * unoptimized should increase the testing coverage for - * the more obscure platforms. + * Note that as long as the PTEs are well-formed with correct PFNs, this + * works without checking the PRESENT bit in the leaf PTE. This is unlike + * the similar vmalloc_to_page() and derivatives. Callers may depend on + * this behavior. + * + * This could be optimized, but it is only used in paths that are not perf + * sensitive, and keeping it unoptimized should increase the testing coverage + * for the more obscure platforms. */ phys_addr_t slow_virt_to_phys(void *__virt_addr) { @@ -2041,17 +2045,12 @@ int set_mce_nospec(unsigned long pfn) return rc; } -static int set_memory_p(unsigned long *addr, int numpages) -{ - return change_page_attr_set(addr, numpages, __pgprot(_PAGE_PRESENT), 0); -} - /* Restore full speculative operation to the pfn. */ int clear_mce_nospec(unsigned long pfn) { unsigned long addr = (unsigned long) pfn_to_kaddr(pfn); - return set_memory_p(&addr, 1); + return set_memory_p(addr, 1); } EXPORT_SYMBOL_GPL(clear_mce_nospec); #endif /* CONFIG_X86_64 */ @@ -2104,6 +2103,11 @@ int set_memory_np_noalias(unsigned long addr, int numpages) CPA_NO_CHECK_ALIAS, NULL); } +int set_memory_p(unsigned long addr, int numpages) +{ + return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); +} + int set_memory_4k(unsigned long addr, int numpages) { return change_page_attr_set_clr(&addr, numpages, __pgprot(0), diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c index 40745664d92f..f32451bdcfdd 100644 --- a/arch/x86/platform/ce4100/ce4100.c +++ b/arch/x86/platform/ce4100/ce4100.c @@ -135,13 +135,13 @@ static void sdv_pci_init(void) */ void __init x86_ce4100_early_setup(void) { - x86_init.oem.arch_setup = sdv_arch_setup; - x86_init.resources.probe_roms = x86_init_noop; - x86_init.mpparse.get_smp_config = x86_init_uint_noop; - x86_init.mpparse.find_smp_config = x86_init_noop; - x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc_nocheck; - x86_init.pci.init = ce4100_pci_init; - x86_init.pci.init_irq = sdv_pci_init; + x86_init.oem.arch_setup = sdv_arch_setup; + x86_init.resources.probe_roms = x86_init_noop; + x86_init.mpparse.find_mptable = x86_init_noop; + x86_init.mpparse.early_parse_smp_cfg = x86_init_noop; + x86_init.mpparse.parse_smp_cfg = x86_dtb_parse_smp_config; + x86_init.pci.init = ce4100_pci_init; + x86_init.pci.init_irq = sdv_pci_init; /* * By default, the reboot method is ACPI which is supported by the diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index f4592dc7a1c1..7be71c2cdc83 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -118,7 +118,8 @@ void __init x86_intel_mid_early_setup(void) machine_ops.emergency_restart = intel_mid_reboot; /* Avoid searching for BIOS MP tables */ - x86_init.mpparse.find_smp_config = x86_init_noop; - x86_init.mpparse.get_smp_config = x86_init_uint_noop; + x86_init.mpparse.find_mptable = x86_init_noop; + x86_init.mpparse.early_parse_smp_cfg = x86_init_noop; + x86_init.mpparse.parse_smp_cfg = x86_init_noop; set_bit(MP_BUS_ISA, mp_bus_not_pci); } diff --git a/arch/x86/virt/svm/Makefile b/arch/x86/virt/svm/Makefile new file mode 100644 index 000000000000..ef2a31bdcc70 --- /dev/null +++ b/arch/x86/virt/svm/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_KVM_AMD_SEV) += sev.o diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c new file mode 100644 index 000000000000..cffe1157a90a --- /dev/null +++ b/arch/x86/virt/svm/sev.c @@ -0,0 +1,560 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD SVM-SEV Host Support. + * + * Copyright (C) 2023 Advanced Micro Devices, Inc. + * + * Author: Ashish Kalra <ashish.kalra@amd.com> + * + */ + +#include <linux/cc_platform.h> +#include <linux/printk.h> +#include <linux/mm_types.h> +#include <linux/set_memory.h> +#include <linux/memblock.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/cpumask.h> +#include <linux/iommu.h> +#include <linux/amd-iommu.h> + +#include <asm/sev.h> +#include <asm/processor.h> +#include <asm/setup.h> +#include <asm/svm.h> +#include <asm/smp.h> +#include <asm/cpu.h> +#include <asm/apic.h> +#include <asm/cpuid.h> +#include <asm/cmdline.h> +#include <asm/iommu.h> + +/* + * The RMP entry format is not architectural. The format is defined in PPR + * Family 19h Model 01h, Rev B1 processor. + */ +struct rmpentry { + union { + struct { + u64 assigned : 1, + pagesize : 1, + immutable : 1, + rsvd1 : 9, + gpa : 39, + asid : 10, + vmsa : 1, + validated : 1, + rsvd2 : 1; + }; + u64 lo; + }; + u64 hi; +} __packed; + +/* + * The first 16KB from the RMP_BASE is used by the processor for the + * bookkeeping, the range needs to be added during the RMP entry lookup. + */ +#define RMPTABLE_CPU_BOOKKEEPING_SZ 0x4000 + +/* Mask to apply to a PFN to get the first PFN of a 2MB page */ +#define PFN_PMD_MASK GENMASK_ULL(63, PMD_SHIFT - PAGE_SHIFT) + +static u64 probed_rmp_base, probed_rmp_size; +static struct rmpentry *rmptable __ro_after_init; +static u64 rmptable_max_pfn __ro_after_init; + +static LIST_HEAD(snp_leaked_pages_list); +static DEFINE_SPINLOCK(snp_leaked_pages_list_lock); + +static unsigned long snp_nr_leaked_pages; + +#undef pr_fmt +#define pr_fmt(fmt) "SEV-SNP: " fmt + +static int __mfd_enable(unsigned int cpu) +{ + u64 val; + + if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + return 0; + + rdmsrl(MSR_AMD64_SYSCFG, val); + + val |= MSR_AMD64_SYSCFG_MFDM; + + wrmsrl(MSR_AMD64_SYSCFG, val); + + return 0; +} + +static __init void mfd_enable(void *arg) +{ + __mfd_enable(smp_processor_id()); +} + +static int __snp_enable(unsigned int cpu) +{ + u64 val; + + if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + return 0; + + rdmsrl(MSR_AMD64_SYSCFG, val); + + val |= MSR_AMD64_SYSCFG_SNP_EN; + val |= MSR_AMD64_SYSCFG_SNP_VMPL_EN; + + wrmsrl(MSR_AMD64_SYSCFG, val); + + return 0; +} + +static __init void snp_enable(void *arg) +{ + __snp_enable(smp_processor_id()); +} + +#define RMP_ADDR_MASK GENMASK_ULL(51, 13) + +bool snp_probe_rmptable_info(void) +{ + u64 max_rmp_pfn, calc_rmp_sz, rmp_sz, rmp_base, rmp_end; + + rdmsrl(MSR_AMD64_RMP_BASE, rmp_base); + rdmsrl(MSR_AMD64_RMP_END, rmp_end); + + if (!(rmp_base & RMP_ADDR_MASK) || !(rmp_end & RMP_ADDR_MASK)) { + pr_err("Memory for the RMP table has not been reserved by BIOS\n"); + return false; + } + + if (rmp_base > rmp_end) { + pr_err("RMP configuration not valid: base=%#llx, end=%#llx\n", rmp_base, rmp_end); + return false; + } + + rmp_sz = rmp_end - rmp_base + 1; + + /* + * Calculate the amount the memory that must be reserved by the BIOS to + * address the whole RAM, including the bookkeeping area. The RMP itself + * must also be covered. + */ + max_rmp_pfn = max_pfn; + if (PHYS_PFN(rmp_end) > max_pfn) + max_rmp_pfn = PHYS_PFN(rmp_end); + + calc_rmp_sz = (max_rmp_pfn << 4) + RMPTABLE_CPU_BOOKKEEPING_SZ; + + if (calc_rmp_sz > rmp_sz) { + pr_err("Memory reserved for the RMP table does not cover full system RAM (expected 0x%llx got 0x%llx)\n", + calc_rmp_sz, rmp_sz); + return false; + } + + probed_rmp_base = rmp_base; + probed_rmp_size = rmp_sz; + + pr_info("RMP table physical range [0x%016llx - 0x%016llx]\n", + probed_rmp_base, probed_rmp_base + probed_rmp_size - 1); + + return true; +} + +/* + * Do the necessary preparations which are verified by the firmware as + * described in the SNP_INIT_EX firmware command description in the SNP + * firmware ABI spec. + */ +static int __init snp_rmptable_init(void) +{ + void *rmptable_start; + u64 rmptable_size; + u64 val; + + if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + return 0; + + if (!amd_iommu_snp_en) + return 0; + + if (!probed_rmp_size) + goto nosnp; + + rmptable_start = memremap(probed_rmp_base, probed_rmp_size, MEMREMAP_WB); + if (!rmptable_start) { + pr_err("Failed to map RMP table\n"); + return 1; + } + + /* + * Check if SEV-SNP is already enabled, this can happen in case of + * kexec boot. + */ + rdmsrl(MSR_AMD64_SYSCFG, val); + if (val & MSR_AMD64_SYSCFG_SNP_EN) + goto skip_enable; + + memset(rmptable_start, 0, probed_rmp_size); + + /* Flush the caches to ensure that data is written before SNP is enabled. */ + wbinvd_on_all_cpus(); + + /* MtrrFixDramModEn must be enabled on all the CPUs prior to enabling SNP. */ + on_each_cpu(mfd_enable, NULL, 1); + + on_each_cpu(snp_enable, NULL, 1); + +skip_enable: + rmptable_start += RMPTABLE_CPU_BOOKKEEPING_SZ; + rmptable_size = probed_rmp_size - RMPTABLE_CPU_BOOKKEEPING_SZ; + + rmptable = (struct rmpentry *)rmptable_start; + rmptable_max_pfn = rmptable_size / sizeof(struct rmpentry) - 1; + + cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/rmptable_init:online", __snp_enable, NULL); + + /* + * Setting crash_kexec_post_notifiers to 'true' to ensure that SNP panic + * notifier is invoked to do SNP IOMMU shutdown before kdump. + */ + crash_kexec_post_notifiers = true; + + return 0; + +nosnp: + setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); + return -ENOSYS; +} + +/* + * This must be called after the IOMMU has been initialized. + */ +device_initcall(snp_rmptable_init); + +static struct rmpentry *get_rmpentry(u64 pfn) +{ + if (WARN_ON_ONCE(pfn > rmptable_max_pfn)) + return ERR_PTR(-EFAULT); + + return &rmptable[pfn]; +} + +static struct rmpentry *__snp_lookup_rmpentry(u64 pfn, int *level) +{ + struct rmpentry *large_entry, *entry; + + if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + return ERR_PTR(-ENODEV); + + entry = get_rmpentry(pfn); + if (IS_ERR(entry)) + return entry; + + /* + * Find the authoritative RMP entry for a PFN. This can be either a 4K + * RMP entry or a special large RMP entry that is authoritative for a + * whole 2M area. + */ + large_entry = get_rmpentry(pfn & PFN_PMD_MASK); + if (IS_ERR(large_entry)) + return large_entry; + + *level = RMP_TO_PG_LEVEL(large_entry->pagesize); + + return entry; +} + +int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level) +{ + struct rmpentry *e; + + e = __snp_lookup_rmpentry(pfn, level); + if (IS_ERR(e)) + return PTR_ERR(e); + + *assigned = !!e->assigned; + return 0; +} +EXPORT_SYMBOL_GPL(snp_lookup_rmpentry); + +/* + * Dump the raw RMP entry for a particular PFN. These bits are documented in the + * PPR for a particular CPU model and provide useful information about how a + * particular PFN is being utilized by the kernel/firmware at the time certain + * unexpected events occur, such as RMP faults. + */ +static void dump_rmpentry(u64 pfn) +{ + u64 pfn_i, pfn_end; + struct rmpentry *e; + int level; + + e = __snp_lookup_rmpentry(pfn, &level); + if (IS_ERR(e)) { + pr_err("Failed to read RMP entry for PFN 0x%llx, error %ld\n", + pfn, PTR_ERR(e)); + return; + } + + if (e->assigned) { + pr_info("PFN 0x%llx, RMP entry: [0x%016llx - 0x%016llx]\n", + pfn, e->lo, e->hi); + return; + } + + /* + * If the RMP entry for a particular PFN is not in an assigned state, + * then it is sometimes useful to get an idea of whether or not any RMP + * entries for other PFNs within the same 2MB region are assigned, since + * those too can affect the ability to access a particular PFN in + * certain situations, such as when the PFN is being accessed via a 2MB + * mapping in the host page table. + */ + pfn_i = ALIGN_DOWN(pfn, PTRS_PER_PMD); + pfn_end = pfn_i + PTRS_PER_PMD; + + pr_info("PFN 0x%llx unassigned, dumping non-zero entries in 2M PFN region: [0x%llx - 0x%llx]\n", + pfn, pfn_i, pfn_end); + + while (pfn_i < pfn_end) { + e = __snp_lookup_rmpentry(pfn_i, &level); + if (IS_ERR(e)) { + pr_err("Error %ld reading RMP entry for PFN 0x%llx\n", + PTR_ERR(e), pfn_i); + pfn_i++; + continue; + } + + if (e->lo || e->hi) + pr_info("PFN: 0x%llx, [0x%016llx - 0x%016llx]\n", pfn_i, e->lo, e->hi); + pfn_i++; + } +} + +void snp_dump_hva_rmpentry(unsigned long hva) +{ + unsigned long paddr; + unsigned int level; + pgd_t *pgd; + pte_t *pte; + + pgd = __va(read_cr3_pa()); + pgd += pgd_index(hva); + pte = lookup_address_in_pgd(pgd, hva, &level); + + if (!pte) { + pr_err("Can't dump RMP entry for HVA %lx: no PTE/PFN found\n", hva); + return; + } + + paddr = PFN_PHYS(pte_pfn(*pte)) | (hva & ~page_level_mask(level)); + dump_rmpentry(PHYS_PFN(paddr)); +} + +/* + * PSMASH a 2MB aligned page into 4K pages in the RMP table while preserving the + * Validated bit. + */ +int psmash(u64 pfn) +{ + unsigned long paddr = pfn << PAGE_SHIFT; + int ret; + + if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + return -ENODEV; + + if (!pfn_valid(pfn)) + return -EINVAL; + + /* Binutils version 2.36 supports the PSMASH mnemonic. */ + asm volatile(".byte 0xF3, 0x0F, 0x01, 0xFF" + : "=a" (ret) + : "a" (paddr) + : "memory", "cc"); + + return ret; +} +EXPORT_SYMBOL_GPL(psmash); + +/* + * If the kernel uses a 2MB or larger directmap mapping to write to an address, + * and that mapping contains any 4KB pages that are set to private in the RMP + * table, an RMP #PF will trigger and cause a host crash. Hypervisor code that + * owns the PFNs being transitioned will never attempt such a write, but other + * kernel tasks writing to other PFNs in the range may trigger these checks + * inadvertently due a large directmap mapping that happens to overlap such a + * PFN. + * + * Prevent this by splitting any 2MB+ mappings that might end up containing a + * mix of private/shared PFNs as a result of a subsequent RMPUPDATE for the + * PFN/rmp_level passed in. + * + * Note that there is no attempt here to scan all the RMP entries for the 2MB + * physical range, since it would only be worthwhile in determining if a + * subsequent RMPUPDATE for a 4KB PFN would result in all the entries being of + * the same shared/private state, thus avoiding the need to split the mapping. + * But that would mean the entries are currently in a mixed state, and so the + * mapping would have already been split as a result of prior transitions. + * And since the 4K split is only done if the mapping is 2MB+, and there isn't + * currently a mechanism in place to restore 2MB+ mappings, such a check would + * not provide any usable benefit. + * + * More specifics on how these checks are carried out can be found in APM + * Volume 2, "RMP and VMPL Access Checks". + */ +static int adjust_direct_map(u64 pfn, int rmp_level) +{ + unsigned long vaddr; + unsigned int level; + int npages, ret; + pte_t *pte; + + /* + * pfn_to_kaddr() will return a vaddr only within the direct + * map range. + */ + vaddr = (unsigned long)pfn_to_kaddr(pfn); + + /* Only 4KB/2MB RMP entries are supported by current hardware. */ + if (WARN_ON_ONCE(rmp_level > PG_LEVEL_2M)) + return -EINVAL; + + if (!pfn_valid(pfn)) + return -EINVAL; + + if (rmp_level == PG_LEVEL_2M && + (!IS_ALIGNED(pfn, PTRS_PER_PMD) || !pfn_valid(pfn + PTRS_PER_PMD - 1))) + return -EINVAL; + + /* + * If an entire 2MB physical range is being transitioned, then there is + * no risk of RMP #PFs due to write accesses from overlapping mappings, + * since even accesses from 1GB mappings will be treated as 2MB accesses + * as far as RMP table checks are concerned. + */ + if (rmp_level == PG_LEVEL_2M) + return 0; + + pte = lookup_address(vaddr, &level); + if (!pte || pte_none(*pte)) + return 0; + + if (level == PG_LEVEL_4K) + return 0; + + npages = page_level_size(rmp_level) / PAGE_SIZE; + ret = set_memory_4k(vaddr, npages); + if (ret) + pr_warn("Failed to split direct map for PFN 0x%llx, ret: %d\n", + pfn, ret); + + return ret; +} + +/* + * It is expected that those operations are seldom enough so that no mutual + * exclusion of updaters is needed and thus the overlap error condition below + * should happen very rarely and would get resolved relatively quickly by + * the firmware. + * + * If not, one could consider introducing a mutex or so here to sync concurrent + * RMP updates and thus diminish the amount of cases where firmware needs to + * lock 2M ranges to protect against concurrent updates. + * + * The optimal solution would be range locking to avoid locking disjoint + * regions unnecessarily but there's no support for that yet. + */ +static int rmpupdate(u64 pfn, struct rmp_state *state) +{ + unsigned long paddr = pfn << PAGE_SHIFT; + int ret, level; + + if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + return -ENODEV; + + level = RMP_TO_PG_LEVEL(state->pagesize); + + if (adjust_direct_map(pfn, level)) + return -EFAULT; + + do { + /* Binutils version 2.36 supports the RMPUPDATE mnemonic. */ + asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE" + : "=a" (ret) + : "a" (paddr), "c" ((unsigned long)state) + : "memory", "cc"); + } while (ret == RMPUPDATE_FAIL_OVERLAP); + + if (ret) { + pr_err("RMPUPDATE failed for PFN %llx, pg_level: %d, ret: %d\n", + pfn, level, ret); + dump_rmpentry(pfn); + dump_stack(); + return -EFAULT; + } + + return 0; +} + +/* Transition a page to guest-owned/private state in the RMP table. */ +int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable) +{ + struct rmp_state state; + + memset(&state, 0, sizeof(state)); + state.assigned = 1; + state.asid = asid; + state.immutable = immutable; + state.gpa = gpa; + state.pagesize = PG_LEVEL_TO_RMP(level); + + return rmpupdate(pfn, &state); +} +EXPORT_SYMBOL_GPL(rmp_make_private); + +/* Transition a page to hypervisor-owned/shared state in the RMP table. */ +int rmp_make_shared(u64 pfn, enum pg_level level) +{ + struct rmp_state state; + + memset(&state, 0, sizeof(state)); + state.pagesize = PG_LEVEL_TO_RMP(level); + + return rmpupdate(pfn, &state); +} +EXPORT_SYMBOL_GPL(rmp_make_shared); + +void snp_leak_pages(u64 pfn, unsigned int npages) +{ + struct page *page = pfn_to_page(pfn); + + pr_warn("Leaking PFN range 0x%llx-0x%llx\n", pfn, pfn + npages); + + spin_lock(&snp_leaked_pages_list_lock); + while (npages--) { + + /* + * Reuse the page's buddy list for chaining into the leaked + * pages list. This page should not be on a free list currently + * and is also unsafe to be added to a free list. + */ + if (likely(!PageCompound(page)) || + + /* + * Skip inserting tail pages of compound page as + * page->buddy_list of tail pages is not usable. + */ + (PageHead(page) && compound_nr(page) <= npages)) + list_add_tail(&page->buddy_list, &snp_leaked_pages_list); + + dump_rmpentry(pfn); + snp_nr_leaked_pages++; + pfn++; + page++; + } + spin_unlock(&snp_leaked_pages_list_lock); +} +EXPORT_SYMBOL_GPL(snp_leak_pages); diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 9dd5490b3318..8b045dd25196 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -33,12 +33,6 @@ static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) return 0xfd; } -static u32 xen_set_apic_id(u32 x) -{ - WARN_ON(1); - return x; -} - static u32 xen_get_apic_id(u32 x) { return ((x)>>24) & 0xFFu; @@ -49,20 +43,20 @@ static u32 xen_apic_read(u32 reg) struct xen_platform_op op = { .cmd = XENPF_get_cpuinfo, .interface_version = XENPF_INTERFACE_VERSION, - .u.pcpu_info.xen_cpuid = 0, }; - int ret; - - /* Shouldn't need this as APIC is turned off for PV, and we only - * get called on the bootup processor. But just in case. */ - if (!xen_initial_domain() || smp_processor_id()) - return 0; + int ret, cpu; if (reg == APIC_LVR) return 0x14; if (reg != APIC_ID) return 0; + cpu = smp_processor_id(); + if (!xen_initial_domain()) + return cpu ? cpuid_to_apicid[cpu] << 24 : 0; + + op.u.pcpu_info.xen_cpuid = cpu; + ret = HYPERVISOR_platform_op(&op); if (ret) op.u.pcpu_info.apic_id = BAD_APICID; @@ -110,11 +104,6 @@ static int xen_madt_oem_check(char *oem_id, char *oem_table_id) return xen_pv_domain(); } -static u32 xen_phys_pkg_id(u32 initial_apic_id, int index_msb) -{ - return initial_apic_id >> index_msb; -} - static u32 xen_cpu_present_to_apicid(int cpu) { if (cpu_present(cpu)) @@ -133,11 +122,9 @@ static struct apic xen_pv_apic __ro_after_init = { .disable_esr = 0, .cpu_present_to_apicid = xen_cpu_present_to_apicid, - .phys_pkg_id = xen_phys_pkg_id, /* detect_ht */ .max_apic_id = UINT_MAX, .get_apic_id = xen_get_apic_id, - .set_apic_id = xen_set_apic_id, .calc_dest_apicid = apic_flat_calc_apicid, diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 3f8c34707c50..99a68fa71dbe 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -168,7 +168,7 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu) */ xen_uninit_lock_cpu(cpu); - if (cpu_acpi_id(cpu) != U32_MAX) + if (cpu_acpi_id(cpu) != CPU_ACPIID_INVALID) per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu); else per_cpu(xen_vcpu_id, cpu) = cpu; diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index aeb33e0a3f76..ace2eb054053 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -200,6 +200,9 @@ static void __init xen_pv_init_platform(void) xen_set_mtrr_data(); else mtrr_overwrite_state(NULL, 0, MTRR_TYPE_WRBACK); + + /* Adjust nr_cpu_ids before "enumeration" happens */ + xen_smp_count_cpus(); } static void __init xen_pv_guest_late_init(void) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 4b0d6fff88de..935771726f9c 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -65,6 +65,8 @@ int xen_smp_intr_init(unsigned int cpu) char *resched_name, *callfunc_name, *debug_name; resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); + if (!resched_name) + goto fail_mem; per_cpu(xen_resched_irq, cpu).name = resched_name; rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, cpu, @@ -77,6 +79,8 @@ int xen_smp_intr_init(unsigned int cpu) per_cpu(xen_resched_irq, cpu).irq = rc; callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu); + if (!callfunc_name) + goto fail_mem; per_cpu(xen_callfunc_irq, cpu).name = callfunc_name; rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR, cpu, @@ -90,6 +94,9 @@ int xen_smp_intr_init(unsigned int cpu) if (!xen_fifo_events) { debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu); + if (!debug_name) + goto fail_mem; + per_cpu(xen_debug_irq, cpu).name = debug_name; rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt, @@ -101,6 +108,9 @@ int xen_smp_intr_init(unsigned int cpu) } callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu); + if (!callfunc_name) + goto fail_mem; + per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name; rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR, cpu, @@ -114,6 +124,8 @@ int xen_smp_intr_init(unsigned int cpu) return 0; + fail_mem: + rc = -ENOMEM; fail: xen_smp_intr_free(cpu); return rc; @@ -123,8 +135,6 @@ void __init xen_smp_cpus_done(unsigned int max_cpus) { if (xen_hvm_domain()) native_smp_cpus_done(max_cpus); - else - calculate_max_logical_packages(); } void xen_smp_send_reschedule(int cpu) diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h index c20cbb14c82b..b8efdbc693f7 100644 --- a/arch/x86/xen/smp.h +++ b/arch/x86/xen/smp.h @@ -19,6 +19,7 @@ extern void xen_smp_intr_free(unsigned int cpu); int xen_smp_intr_init_pv(unsigned int cpu); void xen_smp_intr_free_pv(unsigned int cpu); +void xen_smp_count_cpus(void); void xen_smp_cpus_done(unsigned int max_cpus); void xen_smp_send_reschedule(int cpu); @@ -44,6 +45,7 @@ static inline int xen_smp_intr_init_pv(unsigned int cpu) return 0; } static inline void xen_smp_intr_free_pv(unsigned int cpu) {} +static inline void xen_smp_count_cpus(void) { } #endif /* CONFIG_SMP */ #endif diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index a0f07bbfcd6e..27d1a5b7f571 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -29,6 +29,7 @@ #include <asm/idtentry.h> #include <asm/desc.h> #include <asm/cpu.h> +#include <asm/apic.h> #include <asm/io_apic.h> #include <xen/interface/xen.h> @@ -73,7 +74,6 @@ static void cpu_bringup(void) } cpu = smp_processor_id(); smp_store_cpu_info(cpu); - cpu_data(cpu).x86_max_cores = 1; set_cpu_sibling_map(cpu); speculative_store_bypass_ht_init(); @@ -149,39 +149,16 @@ int xen_smp_intr_init_pv(unsigned int cpu) return rc; } -static void __init _get_smp_config(unsigned int early) +static void __init xen_pv_smp_config(void) { - int i, rc; - unsigned int subtract = 0; - - if (early) - return; - - num_processors = 0; - disabled_cpus = 0; - for (i = 0; i < nr_cpu_ids; i++) { - rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); - if (rc >= 0) { - num_processors++; - set_cpu_possible(i, true); - } else { - set_cpu_possible(i, false); - set_cpu_present(i, false); - subtract++; - } - } -#ifdef CONFIG_HOTPLUG_CPU - /* This is akin to using 'nr_cpus' on the Linux command line. - * Which is OK as when we use 'dom0_max_vcpus=X' we can only - * have up to X, while nr_cpu_ids is greater than X. This - * normally is not a problem, except when CPU hotplugging - * is involved and then there might be more than X CPUs - * in the guest - which will not work as there is no - * hypercall to expand the max number of VCPUs an already - * running guest has. So cap it up to X. */ - if (subtract) - set_nr_cpu_ids(nr_cpu_ids - subtract); -#endif + u32 apicid = 0; + int i; + + topology_register_boot_apic(apicid++); + + for (i = 1; i < nr_cpu_ids; i++) + topology_register_apic(apicid++, CPU_ACPIID_INVALID, true); + /* Pretend to be a proper enumerated system */ smp_found_config = 1; } @@ -224,8 +201,6 @@ static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus) smp_prepare_cpus_common(); - cpu_data(0).x86_max_cores = 1; - speculative_store_bypass_ht_init(); xen_pmu_init(0); @@ -434,6 +409,20 @@ static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } +void __init xen_smp_count_cpus(void) +{ + unsigned int cpus; + + for (cpus = 0; cpus < nr_cpu_ids; cpus++) { + if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpus, NULL) < 0) + break; + } + + pr_info("Xen PV: Detected %u vCPUS\n", cpus); + if (cpus < nr_cpu_ids) + set_nr_cpu_ids(cpus); +} + static const struct smp_ops xen_smp_ops __initconst = { .smp_prepare_boot_cpu = xen_pv_smp_prepare_boot_cpu, .smp_prepare_cpus = xen_pv_smp_prepare_cpus, @@ -458,6 +447,12 @@ void __init xen_smp_init(void) smp_ops = xen_smp_ops; /* Avoid searching for BIOS MP tables */ - x86_init.mpparse.find_smp_config = x86_init_noop; - x86_init.mpparse.get_smp_config = _get_smp_config; + x86_init.mpparse.find_mptable = x86_init_noop; + x86_init.mpparse.early_parse_smp_cfg = x86_init_noop; + + /* XEN/PV Dom0 has halfways sane topology information via CPUID/MADT */ + if (xen_initial_domain()) + x86_init.mpparse.parse_smp_cfg = x86_init_noop; + else + x86_init.mpparse.parse_smp_cfg = xen_pv_smp_config; } diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index a0ea285878db..04101b984f24 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -49,7 +49,7 @@ SYM_CODE_START(startup_xen) ANNOTATE_NOENDBR cld - leaq (__end_init_task - PTREGS_SIZE)(%rip), %rsp + leaq (__end_init_task - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE)(%rip), %rsp /* Set up %gs. * |