diff options
Diffstat (limited to 'arch/x86')
219 files changed, 4422 insertions, 3014 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 66bfabae8814..6a917f62eff2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -28,7 +28,6 @@ config X86_64 select ARCH_HAS_GIGANTIC_PAGE select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_SUPPORTS_PER_VMA_LOCK - select ARCH_USE_CMPXCHG_LOCKREF select HAVE_ARCH_SOFT_DIRTY select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE @@ -118,6 +117,7 @@ config X86 select ARCH_SUPPORTS_LTO_CLANG select ARCH_SUPPORTS_LTO_CLANG_THIN select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_CMPXCHG_LOCKREF if X86_CMPXCHG64 select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS @@ -1534,6 +1534,7 @@ config NUMA depends on X86_64 || (X86_32 && HIGHMEM64G && X86_BIGSMP) default y if X86_BIGSMP select USE_PERCPU_NUMA_NODE_ID + select OF_NUMA if OF help Enable NUMA (Non-Uniform Memory Access) support. @@ -1939,6 +1940,18 @@ config X86_USER_SHADOW_STACK If unsure, say N. +config INTEL_TDX_HOST + bool "Intel Trust Domain Extensions (TDX) host support" + depends on CPU_SUP_INTEL + depends on X86_64 + depends on KVM_INTEL + help + Intel Trust Domain Extensions (TDX) protects guest VMs from malicious + host and certain physical attacks. This option enables necessary TDX + support in the host kernel to run confidential VMs. + + If unsure, say N. + config EFI bool "EFI runtime service support" depends on ACPI @@ -2062,6 +2075,9 @@ config ARCH_SUPPORTS_CRASH_DUMP config ARCH_SUPPORTS_CRASH_HOTPLUG def_bool y +config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION + def_bool CRASH_CORE + config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP) default "0x1000000" @@ -2954,6 +2970,15 @@ config IA32_EMULATION 64-bit kernel. You should likely turn this on, unless you're 100% sure that you don't have any 32-bit programs left. +config IA32_EMULATION_DEFAULT_DISABLED + bool "IA32 emulation disabled by default" + default n + depends on IA32_EMULATION + help + Make IA32 emulation disabled by default. This prevents loading 32-bit + processes and access to 32-bit syscalls. If unsure, leave it to its + default value. + config X86_X32_ABI bool "x32 ABI for 64-bit mode" depends on X86_64 diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 3ff53a2d4ff0..1a068de12a56 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -43,7 +43,7 @@ endif # How to compile the 16-bit code. Note we always compile for -march=i386; # that way we can complain to the user if the CPU is insufficient. -REALMODE_CFLAGS := -m16 -g -Os -DDISABLE_BRANCH_PROFILING -D__DISABLE_EXPORTS \ +REALMODE_CFLAGS := -std=gnu11 -m16 -g -Os -DDISABLE_BRANCH_PROFILING -D__DISABLE_EXPORTS \ -Wall -Wstrict-prototypes -march=i386 -mregparm=3 \ -fno-strict-aliasing -fomit-frame-pointer -fno-pic \ -mno-mmx -mno-sse $(call cc-option,-fcf-protection=none) @@ -81,6 +81,7 @@ ifeq ($(CONFIG_X86_KERNEL_IBT),y) # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104816 # KBUILD_CFLAGS += $(call cc-option,-fcf-protection=branch -fno-jump-tables) +KBUILD_RUSTFLAGS += -Zcf-protection=branch -Zno-jump-tables else KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none) endif @@ -252,6 +253,8 @@ archheaders: libs-y += arch/x86/lib/ +core-y += arch/x86/virt/ + # drivers-y are linked after core-y drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/ drivers-$(CONFIG_PCI) += arch/x86/pci/ diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index f33e45ed1437..3cece19b7473 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -89,7 +89,7 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE SETUP_OBJS = $(addprefix $(obj)/,$(setup-y)) -sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [a-zA-Z] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|efi32_pe_entry\|input_data\|kernel_info\|_end\|_ehead\|_text\|z_.*\)$$/\#define ZO_\2 0x\1/p' +sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [a-zA-Z] \(startup_32\|efi.._stub_entry\|efi\(32\)\?_pe_entry\|input_data\|kernel_info\|_end\|_ehead\|_text\|_e\?data\|z_.*\)$$/\#define ZO_\2 0x\1/p' quiet_cmd_zoffset = ZOFFSET $@ cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@ diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c index 9caf89063e77..55c98fdd67d2 100644 --- a/arch/x86/boot/compressed/acpi.c +++ b/arch/x86/boot/compressed/acpi.c @@ -30,13 +30,13 @@ __efi_get_rsdp_addr(unsigned long cfg_tbl_pa, unsigned int cfg_tbl_len) * Search EFI system tables for RSDP. Preferred is ACPI_20_TABLE_GUID to * ACPI_TABLE_GUID because it has more features. */ - rsdp_addr = efi_find_vendor_table(boot_params, cfg_tbl_pa, cfg_tbl_len, + rsdp_addr = efi_find_vendor_table(boot_params_ptr, cfg_tbl_pa, cfg_tbl_len, ACPI_20_TABLE_GUID); if (rsdp_addr) return (acpi_physical_address)rsdp_addr; /* No ACPI_20_TABLE_GUID found, fallback to ACPI_TABLE_GUID. */ - rsdp_addr = efi_find_vendor_table(boot_params, cfg_tbl_pa, cfg_tbl_len, + rsdp_addr = efi_find_vendor_table(boot_params_ptr, cfg_tbl_pa, cfg_tbl_len, ACPI_TABLE_GUID); if (rsdp_addr) return (acpi_physical_address)rsdp_addr; @@ -56,15 +56,15 @@ static acpi_physical_address efi_get_rsdp_addr(void) enum efi_type et; int ret; - et = efi_get_type(boot_params); + et = efi_get_type(boot_params_ptr); if (et == EFI_TYPE_NONE) return 0; - systab_pa = efi_get_system_table(boot_params); + systab_pa = efi_get_system_table(boot_params_ptr); if (!systab_pa) error("EFI support advertised, but unable to locate system table."); - ret = efi_get_conf_table(boot_params, &cfg_tbl_pa, &cfg_tbl_len); + ret = efi_get_conf_table(boot_params_ptr, &cfg_tbl_pa, &cfg_tbl_len); if (ret || !cfg_tbl_pa) error("EFI config table not found."); @@ -156,7 +156,7 @@ acpi_physical_address get_rsdp_addr(void) { acpi_physical_address pa; - pa = boot_params->acpi_rsdp_addr; + pa = boot_params_ptr->acpi_rsdp_addr; if (!pa) pa = efi_get_rsdp_addr(); @@ -210,7 +210,7 @@ static unsigned long get_acpi_srat_table(void) rsdp = (struct acpi_table_rsdp *)get_cmdline_acpi_rsdp(); if (!rsdp) rsdp = (struct acpi_table_rsdp *)(long) - boot_params->acpi_rsdp_addr; + boot_params_ptr->acpi_rsdp_addr; if (!rsdp) return 0; diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c index f1add5d85da9..c1bb180973ea 100644 --- a/arch/x86/boot/compressed/cmdline.c +++ b/arch/x86/boot/compressed/cmdline.c @@ -14,9 +14,9 @@ static inline char rdfs8(addr_t addr) #include "../cmdline.c" unsigned long get_cmd_line_ptr(void) { - unsigned long cmd_line_ptr = boot_params->hdr.cmd_line_ptr; + unsigned long cmd_line_ptr = boot_params_ptr->hdr.cmd_line_ptr; - cmd_line_ptr |= (u64)boot_params->ext_cmd_line_ptr << 32; + cmd_line_ptr |= (u64)boot_params_ptr->ext_cmd_line_ptr << 32; return cmd_line_ptr; } diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c index 08f93b0401bb..473ba59b82a8 100644 --- a/arch/x86/boot/compressed/ident_map_64.c +++ b/arch/x86/boot/compressed/ident_map_64.c @@ -159,8 +159,9 @@ void initialize_identity_maps(void *rmode) * or does not touch all the pages covering them. */ kernel_add_identity_map((unsigned long)_head, (unsigned long)_end); - boot_params = rmode; - kernel_add_identity_map((unsigned long)boot_params, (unsigned long)(boot_params + 1)); + boot_params_ptr = rmode; + kernel_add_identity_map((unsigned long)boot_params_ptr, + (unsigned long)(boot_params_ptr + 1)); cmdline = get_cmd_line_ptr(); kernel_add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE); @@ -168,7 +169,7 @@ void initialize_identity_maps(void *rmode) * Also map the setup_data entries passed via boot_params in case they * need to be accessed by uncompressed kernel via the identity mapping. */ - sd = (struct setup_data *)boot_params->hdr.setup_data; + sd = (struct setup_data *)boot_params_ptr->hdr.setup_data; while (sd) { unsigned long sd_addr = (unsigned long)sd; diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 9193acf0e9cd..dec961c6d16a 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -63,7 +63,7 @@ static unsigned long get_boot_seed(void) unsigned long hash = 0; hash = rotate_xor(hash, build_str, sizeof(build_str)); - hash = rotate_xor(hash, boot_params, sizeof(*boot_params)); + hash = rotate_xor(hash, boot_params_ptr, sizeof(*boot_params_ptr)); return hash; } @@ -383,7 +383,7 @@ static void handle_mem_options(void) static void mem_avoid_init(unsigned long input, unsigned long input_size, unsigned long output) { - unsigned long init_size = boot_params->hdr.init_size; + unsigned long init_size = boot_params_ptr->hdr.init_size; u64 initrd_start, initrd_size; unsigned long cmd_line, cmd_line_size; @@ -395,10 +395,10 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input; /* Avoid initrd. */ - initrd_start = (u64)boot_params->ext_ramdisk_image << 32; - initrd_start |= boot_params->hdr.ramdisk_image; - initrd_size = (u64)boot_params->ext_ramdisk_size << 32; - initrd_size |= boot_params->hdr.ramdisk_size; + initrd_start = (u64)boot_params_ptr->ext_ramdisk_image << 32; + initrd_start |= boot_params_ptr->hdr.ramdisk_image; + initrd_size = (u64)boot_params_ptr->ext_ramdisk_size << 32; + initrd_size |= boot_params_ptr->hdr.ramdisk_size; mem_avoid[MEM_AVOID_INITRD].start = initrd_start; mem_avoid[MEM_AVOID_INITRD].size = initrd_size; /* No need to set mapping for initrd, it will be handled in VO. */ @@ -413,8 +413,8 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, } /* Avoid boot parameters. */ - mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params; - mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params); + mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params_ptr; + mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params_ptr); /* We don't need to set a mapping for setup_data. */ @@ -447,7 +447,7 @@ static bool mem_avoid_overlap(struct mem_vector *img, } /* Avoid all entries in the setup_data linked list. */ - ptr = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data; + ptr = (struct setup_data *)(unsigned long)boot_params_ptr->hdr.setup_data; while (ptr) { struct mem_vector avoid; @@ -706,7 +706,7 @@ static inline bool memory_type_is_free(efi_memory_desc_t *md) static bool process_efi_entries(unsigned long minimum, unsigned long image_size) { - struct efi_info *e = &boot_params->efi_info; + struct efi_info *e = &boot_params_ptr->efi_info; bool efi_mirror_found = false; struct mem_vector region; efi_memory_desc_t *md; @@ -777,8 +777,8 @@ static void process_e820_entries(unsigned long minimum, struct boot_e820_entry *entry; /* Verify potential e820 positions, appending to slots list. */ - for (i = 0; i < boot_params->e820_entries; i++) { - entry = &boot_params->e820_table[i]; + for (i = 0; i < boot_params_ptr->e820_entries; i++) { + entry = &boot_params_ptr->e820_table[i]; /* Skip non-RAM entries. */ if (entry->type != E820_TYPE_RAM) continue; @@ -852,7 +852,7 @@ void choose_random_location(unsigned long input, return; } - boot_params->hdr.loadflags |= KASLR_FLAG; + boot_params_ptr->hdr.loadflags |= KASLR_FLAG; if (IS_ENABLED(CONFIG_X86_32)) mem_limit = KERNEL_IMAGE_SIZE; diff --git a/arch/x86/boot/compressed/mem.c b/arch/x86/boot/compressed/mem.c index 3c1609245f2a..b3c3a4be7471 100644 --- a/arch/x86/boot/compressed/mem.c +++ b/arch/x86/boot/compressed/mem.c @@ -54,17 +54,17 @@ bool init_unaccepted_memory(void) enum efi_type et; int ret; - et = efi_get_type(boot_params); + et = efi_get_type(boot_params_ptr); if (et == EFI_TYPE_NONE) return false; - ret = efi_get_conf_table(boot_params, &cfg_table_pa, &cfg_table_len); + ret = efi_get_conf_table(boot_params_ptr, &cfg_table_pa, &cfg_table_len); if (ret) { warn("EFI config table not found."); return false; } - table = (void *)efi_find_vendor_table(boot_params, cfg_table_pa, + table = (void *)efi_find_vendor_table(boot_params_ptr, cfg_table_pa, cfg_table_len, guid); if (!table) return false; diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index f711f2a85862..b99e08e6815b 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -46,7 +46,7 @@ void *memmove(void *dest, const void *src, size_t n); /* * This is set up by the setup-routine at boot-time */ -struct boot_params *boot_params; +struct boot_params *boot_params_ptr; struct port_io_ops pio_ops; @@ -132,8 +132,8 @@ void __putstr(const char *s) if (lines == 0 || cols == 0) return; - x = boot_params->screen_info.orig_x; - y = boot_params->screen_info.orig_y; + x = boot_params_ptr->screen_info.orig_x; + y = boot_params_ptr->screen_info.orig_y; while ((c = *s++) != '\0') { if (c == '\n') { @@ -154,8 +154,8 @@ void __putstr(const char *s) } } - boot_params->screen_info.orig_x = x; - boot_params->screen_info.orig_y = y; + boot_params_ptr->screen_info.orig_x = x; + boot_params_ptr->screen_info.orig_y = y; pos = (x + cols * y) * 2; /* Update cursor position */ outb(14, vidport); @@ -382,14 +382,14 @@ asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output) size_t entry_offset; /* Retain x86 boot parameters pointer passed from startup_32/64. */ - boot_params = rmode; + boot_params_ptr = rmode; /* Clear flags intended for solely in-kernel use. */ - boot_params->hdr.loadflags &= ~KASLR_FLAG; + boot_params_ptr->hdr.loadflags &= ~KASLR_FLAG; - sanitize_boot_params(boot_params); + sanitize_boot_params(boot_params_ptr); - if (boot_params->screen_info.orig_video_mode == 7) { + if (boot_params_ptr->screen_info.orig_video_mode == 7) { vidmem = (char *) 0xb0000; vidport = 0x3b4; } else { @@ -397,8 +397,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output) vidport = 0x3d4; } - lines = boot_params->screen_info.orig_video_lines; - cols = boot_params->screen_info.orig_video_cols; + lines = boot_params_ptr->screen_info.orig_video_lines; + cols = boot_params_ptr->screen_info.orig_video_cols; init_default_io_ops(); @@ -417,7 +417,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output) * so that early debugging output from the RSDP parsing code can be * collected. */ - boot_params->acpi_rsdp_addr = get_rsdp_addr(); + boot_params_ptr->acpi_rsdp_addr = get_rsdp_addr(); debug_putstr("early console in extract_kernel\n"); diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index cc70d3fb9049..c0d502bd8716 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -61,7 +61,6 @@ extern memptr free_mem_ptr; extern memptr free_mem_end_ptr; void *malloc(int size); void free(void *where); -extern struct boot_params *boot_params; void __putstr(const char *s); void __puthex(unsigned long value); #define error_putstr(__x) __putstr(__x) diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index 7939eb6e6ce9..51f957b24ba7 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -28,7 +28,6 @@ static char trampoline_save[TRAMPOLINE_32BIT_SIZE]; */ unsigned long *trampoline_32bit __section(".data"); -extern struct boot_params *boot_params; int cmdline_find_option_bool(const char *option); static unsigned long find_trampoline_placement(void) @@ -49,7 +48,7 @@ static unsigned long find_trampoline_placement(void) * * Only look for values in the legacy ROM for non-EFI system. */ - signature = (char *)&boot_params->efi_info.efi_loader_signature; + signature = (char *)&boot_params_ptr->efi_info.efi_loader_signature; if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) && strncmp(signature, EFI64_LOADER_SIGNATURE, 4)) { ebda_start = *(unsigned short *)0x40e << 4; @@ -65,10 +64,10 @@ static unsigned long find_trampoline_placement(void) bios_start = round_down(bios_start, PAGE_SIZE); /* Find the first usable memory region under bios_start. */ - for (i = boot_params->e820_entries - 1; i >= 0; i--) { + for (i = boot_params_ptr->e820_entries - 1; i >= 0; i--) { unsigned long new = bios_start; - entry = &boot_params->e820_table[i]; + entry = &boot_params_ptr->e820_table[i]; /* Skip all entries above bios_start. */ if (bios_start <= entry->addr) @@ -107,7 +106,7 @@ asmlinkage void configure_5level_paging(struct boot_params *bp, void *pgtable) bool l5_required = false; /* Initialize boot_params. Required for cmdline_find_option_bool(). */ - boot_params = bp; + boot_params_ptr = bp; /* * Check if LA57 is desired and supported. diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index dc8c876fbd8f..454acd7a2daf 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -25,7 +25,7 @@ #include "error.h" #include "../msr.h" -struct ghcb boot_ghcb_page __aligned(PAGE_SIZE); +static struct ghcb boot_ghcb_page __aligned(PAGE_SIZE); struct ghcb *boot_ghcb; /* @@ -103,6 +103,16 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, return ES_OK; } +static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size) +{ + return ES_OK; +} + +static bool fault_in_kernel_space(unsigned long address) +{ + return false; +} + #undef __init #define __init @@ -605,7 +615,7 @@ void sev_prep_identity_maps(unsigned long top_level_pgt) * accessed after switchover. */ if (sev_snp_enabled()) { - unsigned long cc_info_pa = boot_params->cc_blob_address; + unsigned long cc_info_pa = boot_params_ptr->cc_blob_address; struct cc_blob_sev_info *cc_info; kernel_add_identity_map(cc_info_pa, cc_info_pa + sizeof(*cc_info)); diff --git a/arch/x86/boot/compressed/tdx.c b/arch/x86/boot/compressed/tdx.c index 8841b945a1e2..8451d6a1030c 100644 --- a/arch/x86/boot/compressed/tdx.c +++ b/arch/x86/boot/compressed/tdx.c @@ -18,7 +18,7 @@ void __tdx_hypercall_failed(void) static inline unsigned int tdx_io_in(int size, u16 port) { - struct tdx_hypercall_args args = { + struct tdx_module_args args = { .r10 = TDX_HYPERCALL_STANDARD, .r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION), .r12 = size, @@ -26,7 +26,7 @@ static inline unsigned int tdx_io_in(int size, u16 port) .r14 = port, }; - if (__tdx_hypercall_ret(&args)) + if (__tdx_hypercall(&args)) return UINT_MAX; return args.r11; @@ -34,7 +34,7 @@ static inline unsigned int tdx_io_in(int size, u16 port) static inline void tdx_io_out(int size, u16 port, u32 value) { - struct tdx_hypercall_args args = { + struct tdx_module_args args = { .r10 = TDX_HYPERCALL_STANDARD, .r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION), .r12 = size, diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index b22f34b8684a..083ec6d7722a 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -43,11 +43,13 @@ SECTIONS *(.rodata.*) _erodata = . ; } - .data : { + .data : ALIGN(0x1000) { _data = . ; *(.data) *(.data.*) - *(.bss.efistub) + + /* Add 4 bytes of extra space for a CRC-32 checksum */ + . = ALIGN(. + 4, 0x200); _edata = . ; } . = ALIGN(L1_CACHE_BYTES); diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index b04ca8e2b213..b2771710ed98 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -36,66 +36,20 @@ SYSSEG = 0x1000 /* historical load address >> 4 */ #define ROOT_RDONLY 1 #endif + .set salign, 0x1000 + .set falign, 0x200 + .code16 .section ".bstext", "ax" - - .global bootsect_start -bootsect_start: #ifdef CONFIG_EFI_STUB # "MZ", MS-DOS header .word MZ_MAGIC -#endif - - # Normalize the start address - ljmp $BOOTSEG, $start2 - -start2: - movw %cs, %ax - movw %ax, %ds - movw %ax, %es - movw %ax, %ss - xorw %sp, %sp - sti - cld - - movw $bugger_off_msg, %si - -msg_loop: - lodsb - andb %al, %al - jz bs_die - movb $0xe, %ah - movw $7, %bx - int $0x10 - jmp msg_loop - -bs_die: - # Allow the user to press a key, then reboot - xorw %ax, %ax - int $0x16 - int $0x19 - - # int 0x19 should never return. In case it does anyway, - # invoke the BIOS reset code... - ljmp $0xf000,$0xfff0 - -#ifdef CONFIG_EFI_STUB .org 0x38 # # Offset to the PE header. # .long LINUX_PE_MAGIC .long pe_header -#endif /* CONFIG_EFI_STUB */ - - .section ".bsdata", "a" -bugger_off_msg: - .ascii "Use a boot loader.\r\n" - .ascii "\n" - .ascii "Remove disk and press any key to reboot...\r\n" - .byte 0 - -#ifdef CONFIG_EFI_STUB pe_header: .long PE_MAGIC @@ -124,30 +78,26 @@ optional_header: .byte 0x02 # MajorLinkerVersion .byte 0x14 # MinorLinkerVersion - # Filled in by build.c - .long 0 # SizeOfCode + .long ZO__data # SizeOfCode - .long 0 # SizeOfInitializedData + .long ZO__end - ZO__data # SizeOfInitializedData .long 0 # SizeOfUninitializedData - # Filled in by build.c - .long 0x0000 # AddressOfEntryPoint + .long setup_size + ZO_efi_pe_entry # AddressOfEntryPoint - .long 0x0200 # BaseOfCode + .long setup_size # BaseOfCode #ifdef CONFIG_X86_32 .long 0 # data #endif extra_header_fields: - # PE specification requires ImageBase to be 64k aligned - .set image_base, (LOAD_PHYSICAL_ADDR + 0xffff) & ~0xffff #ifdef CONFIG_X86_32 - .long image_base # ImageBase + .long 0 # ImageBase #else - .quad image_base # ImageBase + .quad 0 # ImageBase #endif - .long 0x20 # SectionAlignment - .long 0x20 # FileAlignment + .long salign # SectionAlignment + .long falign # FileAlignment .word 0 # MajorOperatingSystemVersion .word 0 # MinorOperatingSystemVersion .word LINUX_EFISTUB_MAJOR_VERSION # MajorImageVersion @@ -156,12 +106,10 @@ extra_header_fields: .word 0 # MinorSubsystemVersion .long 0 # Win32VersionValue - # - # The size of the bzImage is written in tools/build.c - # - .long 0 # SizeOfImage + .long setup_size + ZO__end + pecompat_vsize + # SizeOfImage - .long 0x200 # SizeOfHeaders + .long salign # SizeOfHeaders .long 0 # CheckSum .word IMAGE_SUBSYSTEM_EFI_APPLICATION # Subsystem (EFI application) #ifdef CONFIG_EFI_DXE_MEM_ATTRIBUTES @@ -192,87 +140,78 @@ extra_header_fields: # Section table section_table: - # - # The offset & size fields are filled in by build.c. - # .ascii ".setup" .byte 0 .byte 0 - .long 0 - .long 0x0 # startup_{32,64} - .long 0 # Size of initialized data - # on disk - .long 0x0 # startup_{32,64} - .long 0 # PointerToRelocations - .long 0 # PointerToLineNumbers - .word 0 # NumberOfRelocations - .word 0 # NumberOfLineNumbers - .long IMAGE_SCN_CNT_CODE | \ - IMAGE_SCN_MEM_READ | \ - IMAGE_SCN_MEM_EXECUTE | \ - IMAGE_SCN_ALIGN_16BYTES # Characteristics + .long setup_size - salign # VirtualSize + .long salign # VirtualAddress + .long pecompat_fstart - salign # SizeOfRawData + .long salign # PointerToRawData - # - # The EFI application loader requires a relocation section - # because EFI applications must be relocatable. The .reloc - # offset & size fields are filled in by build.c. - # - .ascii ".reloc" - .byte 0 - .byte 0 - .long 0 - .long 0 - .long 0 # SizeOfRawData - .long 0 # PointerToRawData - .long 0 # PointerToRelocations - .long 0 # PointerToLineNumbers - .word 0 # NumberOfRelocations - .word 0 # NumberOfLineNumbers + .long 0, 0, 0 .long IMAGE_SCN_CNT_INITIALIZED_DATA | \ IMAGE_SCN_MEM_READ | \ - IMAGE_SCN_MEM_DISCARDABLE | \ - IMAGE_SCN_ALIGN_1BYTES # Characteristics + IMAGE_SCN_MEM_DISCARDABLE # Characteristics #ifdef CONFIG_EFI_MIXED - # - # The offset & size fields are filled in by build.c. - # .asciz ".compat" - .long 0 - .long 0x0 - .long 0 # Size of initialized data - # on disk - .long 0x0 - .long 0 # PointerToRelocations - .long 0 # PointerToLineNumbers - .word 0 # NumberOfRelocations - .word 0 # NumberOfLineNumbers + + .long 8 # VirtualSize + .long setup_size + ZO__end # VirtualAddress + .long pecompat_fsize # SizeOfRawData + .long pecompat_fstart # PointerToRawData + + .long 0, 0, 0 .long IMAGE_SCN_CNT_INITIALIZED_DATA | \ IMAGE_SCN_MEM_READ | \ - IMAGE_SCN_MEM_DISCARDABLE | \ - IMAGE_SCN_ALIGN_1BYTES # Characteristics + IMAGE_SCN_MEM_DISCARDABLE # Characteristics + + /* + * Put the IA-32 machine type and the associated entry point address in + * the .compat section, so loaders can figure out which other execution + * modes this image supports. + */ + .pushsection ".pecompat", "a", @progbits + .balign falign + .set pecompat_vsize, salign + .globl pecompat_fstart +pecompat_fstart: + .byte 0x1 # Version + .byte 8 # Size + .word IMAGE_FILE_MACHINE_I386 # PE machine type + .long setup_size + ZO_efi32_pe_entry # Entrypoint + .popsection +#else + .set pecompat_vsize, 0 + .set pecompat_fstart, setup_size #endif - - # - # The offset & size fields are filled in by build.c. - # .ascii ".text" .byte 0 .byte 0 .byte 0 - .long 0 - .long 0x0 # startup_{32,64} - .long 0 # Size of initialized data + .long ZO__data + .long setup_size + .long ZO__data # Size of initialized data # on disk - .long 0x0 # startup_{32,64} + .long setup_size .long 0 # PointerToRelocations .long 0 # PointerToLineNumbers .word 0 # NumberOfRelocations .word 0 # NumberOfLineNumbers .long IMAGE_SCN_CNT_CODE | \ IMAGE_SCN_MEM_READ | \ - IMAGE_SCN_MEM_EXECUTE | \ - IMAGE_SCN_ALIGN_16BYTES # Characteristics + IMAGE_SCN_MEM_EXECUTE # Characteristics + + .ascii ".data\0\0\0" + .long ZO__end - ZO__data # VirtualSize + .long setup_size + ZO__data # VirtualAddress + .long ZO__edata - ZO__data # SizeOfRawData + .long setup_size + ZO__data # PointerToRawData + + .long 0, 0, 0 + .long IMAGE_SCN_CNT_INITIALIZED_DATA | \ + IMAGE_SCN_MEM_READ | \ + IMAGE_SCN_MEM_WRITE # Characteristics .set section_count, (. - section_table) / 40 #endif /* CONFIG_EFI_STUB */ @@ -286,12 +225,12 @@ sentinel: .byte 0xff, 0xff /* Used to detect broken loaders */ .globl hdr hdr: -setup_sects: .byte 0 /* Filled in by build.c */ + .byte setup_sects - 1 root_flags: .word ROOT_RDONLY -syssize: .long 0 /* Filled in by build.c */ +syssize: .long ZO__edata / 16 ram_size: .word 0 /* Obsolete */ vid_mode: .word SVGA_MODE -root_dev: .word 0 /* Filled in by build.c */ +root_dev: .word 0 /* Default to major/minor 0/0 */ boot_flag: .word 0xAA55 # offset 512, entry point @@ -579,9 +518,25 @@ pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr # define INIT_SIZE VO_INIT_SIZE #endif + .macro __handover_offset +#ifndef CONFIG_EFI_HANDOVER_PROTOCOL + .long 0 +#elif !defined(CONFIG_X86_64) + .long ZO_efi32_stub_entry +#else + /* Yes, this is really how we defined it :( */ + .long ZO_efi64_stub_entry - 0x200 +#ifdef CONFIG_EFI_MIXED + .if ZO_efi32_stub_entry != ZO_efi64_stub_entry - 0x200 + .error "32-bit and 64-bit EFI entry points do not match" + .endif +#endif +#endif + .endm + init_size: .long INIT_SIZE # kernel initialization size -handover_offset: .long 0 # Filled in by build.c -kernel_info_offset: .long 0 # Filled in by build.c +handover_offset: __handover_offset +kernel_info_offset: .long ZO_kernel_info # End of setup header ##################################################### diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld index 49546c247ae2..83bb7efad8ae 100644 --- a/arch/x86/boot/setup.ld +++ b/arch/x86/boot/setup.ld @@ -10,10 +10,11 @@ ENTRY(_start) SECTIONS { . = 0; - .bstext : { *(.bstext) } - .bsdata : { *(.bsdata) } + .bstext : { + *(.bstext) + . = 495; + } =0xffffffff - . = 495; .header : { *(.header) } .entrytext : { *(.entrytext) } .inittext : { *(.inittext) } @@ -35,11 +36,16 @@ SECTIONS . = ALIGN(16); .data : { *(.data*) } + .pecompat : { *(.pecompat) } + PROVIDE(pecompat_fsize = setup_size - pecompat_fstart); + .signature : { setup_sig = .; LONG(0x5a5aaa55) - } + setup_size = ALIGN(ABSOLUTE(.), 4096); + setup_sects = ABSOLUTE(setup_size / 512); + } . = ALIGN(16); .bss : diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index bd247692b701..10311d77c67f 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -40,10 +40,6 @@ typedef unsigned char u8; typedef unsigned short u16; typedef unsigned int u32; -#define DEFAULT_MAJOR_ROOT 0 -#define DEFAULT_MINOR_ROOT 0 -#define DEFAULT_ROOT_DEV (DEFAULT_MAJOR_ROOT << 8 | DEFAULT_MINOR_ROOT) - /* Minimal number of setup sectors */ #define SETUP_SECT_MIN 5 #define SETUP_SECT_MAX 64 @@ -51,22 +47,7 @@ typedef unsigned int u32; /* This must be large enough to hold the entire setup */ u8 buf[SETUP_SECT_MAX*512]; -#define PECOFF_RELOC_RESERVE 0x20 - -#ifdef CONFIG_EFI_MIXED -#define PECOFF_COMPAT_RESERVE 0x20 -#else -#define PECOFF_COMPAT_RESERVE 0x0 -#endif - -static unsigned long efi32_stub_entry; -static unsigned long efi64_stub_entry; -static unsigned long efi_pe_entry; -static unsigned long efi32_pe_entry; -static unsigned long kernel_info; -static unsigned long startup_64; -static unsigned long _ehead; -static unsigned long _end; +static unsigned long _edata; /*----------------------------------------------------------------------*/ @@ -152,180 +133,6 @@ static void usage(void) die("Usage: build setup system zoffset.h image"); } -#ifdef CONFIG_EFI_STUB - -static void update_pecoff_section_header_fields(char *section_name, u32 vma, u32 size, u32 datasz, u32 offset) -{ - unsigned int pe_header; - unsigned short num_sections; - u8 *section; - - pe_header = get_unaligned_le32(&buf[0x3c]); - num_sections = get_unaligned_le16(&buf[pe_header + 6]); - -#ifdef CONFIG_X86_32 - section = &buf[pe_header + 0xa8]; -#else - section = &buf[pe_header + 0xb8]; -#endif - - while (num_sections > 0) { - if (strncmp((char*)section, section_name, 8) == 0) { - /* section header size field */ - put_unaligned_le32(size, section + 0x8); - - /* section header vma field */ - put_unaligned_le32(vma, section + 0xc); - - /* section header 'size of initialised data' field */ - put_unaligned_le32(datasz, section + 0x10); - - /* section header 'file offset' field */ - put_unaligned_le32(offset, section + 0x14); - - break; - } - section += 0x28; - num_sections--; - } -} - -static void update_pecoff_section_header(char *section_name, u32 offset, u32 size) -{ - update_pecoff_section_header_fields(section_name, offset, size, size, offset); -} - -static void update_pecoff_setup_and_reloc(unsigned int size) -{ - u32 setup_offset = 0x200; - u32 reloc_offset = size - PECOFF_RELOC_RESERVE - PECOFF_COMPAT_RESERVE; -#ifdef CONFIG_EFI_MIXED - u32 compat_offset = reloc_offset + PECOFF_RELOC_RESERVE; -#endif - u32 setup_size = reloc_offset - setup_offset; - - update_pecoff_section_header(".setup", setup_offset, setup_size); - update_pecoff_section_header(".reloc", reloc_offset, PECOFF_RELOC_RESERVE); - - /* - * Modify .reloc section contents with a single entry. The - * relocation is applied to offset 10 of the relocation section. - */ - put_unaligned_le32(reloc_offset + 10, &buf[reloc_offset]); - put_unaligned_le32(10, &buf[reloc_offset + 4]); - -#ifdef CONFIG_EFI_MIXED - update_pecoff_section_header(".compat", compat_offset, PECOFF_COMPAT_RESERVE); - - /* - * Put the IA-32 machine type (0x14c) and the associated entry point - * address in the .compat section, so loaders can figure out which other - * execution modes this image supports. - */ - buf[compat_offset] = 0x1; - buf[compat_offset + 1] = 0x8; - put_unaligned_le16(0x14c, &buf[compat_offset + 2]); - put_unaligned_le32(efi32_pe_entry + size, &buf[compat_offset + 4]); -#endif -} - -static void update_pecoff_text(unsigned int text_start, unsigned int file_sz, - unsigned int init_sz) -{ - unsigned int pe_header; - unsigned int text_sz = file_sz - text_start; - unsigned int bss_sz = init_sz - file_sz; - - pe_header = get_unaligned_le32(&buf[0x3c]); - - /* - * The PE/COFF loader may load the image at an address which is - * misaligned with respect to the kernel_alignment field in the setup - * header. - * - * In order to avoid relocating the kernel to correct the misalignment, - * add slack to allow the buffer to be aligned within the declared size - * of the image. - */ - bss_sz += CONFIG_PHYSICAL_ALIGN; - init_sz += CONFIG_PHYSICAL_ALIGN; - - /* - * Size of code: Subtract the size of the first sector (512 bytes) - * which includes the header. - */ - put_unaligned_le32(file_sz - 512 + bss_sz, &buf[pe_header + 0x1c]); - - /* Size of image */ - put_unaligned_le32(init_sz, &buf[pe_header + 0x50]); - - /* - * Address of entry point for PE/COFF executable - */ - put_unaligned_le32(text_start + efi_pe_entry, &buf[pe_header + 0x28]); - - update_pecoff_section_header_fields(".text", text_start, text_sz + bss_sz, - text_sz, text_start); -} - -static int reserve_pecoff_reloc_section(int c) -{ - /* Reserve 0x20 bytes for .reloc section */ - memset(buf+c, 0, PECOFF_RELOC_RESERVE); - return PECOFF_RELOC_RESERVE; -} - -static void efi_stub_defaults(void) -{ - /* Defaults for old kernel */ -#ifdef CONFIG_X86_32 - efi_pe_entry = 0x10; -#else - efi_pe_entry = 0x210; - startup_64 = 0x200; -#endif -} - -static void efi_stub_entry_update(void) -{ - unsigned long addr = efi32_stub_entry; - -#ifdef CONFIG_EFI_HANDOVER_PROTOCOL -#ifdef CONFIG_X86_64 - /* Yes, this is really how we defined it :( */ - addr = efi64_stub_entry - 0x200; -#endif - -#ifdef CONFIG_EFI_MIXED - if (efi32_stub_entry != addr) - die("32-bit and 64-bit EFI entry points do not match\n"); -#endif -#endif - put_unaligned_le32(addr, &buf[0x264]); -} - -#else - -static inline void update_pecoff_setup_and_reloc(unsigned int size) {} -static inline void update_pecoff_text(unsigned int text_start, - unsigned int file_sz, - unsigned int init_sz) {} -static inline void efi_stub_defaults(void) {} -static inline void efi_stub_entry_update(void) {} - -static inline int reserve_pecoff_reloc_section(int c) -{ - return 0; -} -#endif /* CONFIG_EFI_STUB */ - -static int reserve_pecoff_compat_section(int c) -{ - /* Reserve 0x20 bytes for .compat section */ - memset(buf+c, 0, PECOFF_COMPAT_RESERVE); - return PECOFF_COMPAT_RESERVE; -} - /* * Parse zoffset.h and find the entry points. We could just #include zoffset.h * but that would mean tools/build would have to be rebuilt every time. It's @@ -354,14 +161,7 @@ static void parse_zoffset(char *fname) p = (char *)buf; while (p && *p) { - PARSE_ZOFS(p, efi32_stub_entry); - PARSE_ZOFS(p, efi64_stub_entry); - PARSE_ZOFS(p, efi_pe_entry); - PARSE_ZOFS(p, efi32_pe_entry); - PARSE_ZOFS(p, kernel_info); - PARSE_ZOFS(p, startup_64); - PARSE_ZOFS(p, _ehead); - PARSE_ZOFS(p, _end); + PARSE_ZOFS(p, _edata); p = strchr(p, '\n'); while (p && (*p == '\r' || *p == '\n')) @@ -371,17 +171,14 @@ static void parse_zoffset(char *fname) int main(int argc, char ** argv) { - unsigned int i, sz, setup_sectors, init_sz; + unsigned int i, sz, setup_sectors; int c; - u32 sys_size; struct stat sb; FILE *file, *dest; int fd; void *kernel; u32 crc = 0xffffffffUL; - efi_stub_defaults(); - if (argc != 5) usage(); parse_zoffset(argv[3]); @@ -403,72 +200,27 @@ int main(int argc, char ** argv) die("Boot block hasn't got boot flag (0xAA55)"); fclose(file); - c += reserve_pecoff_compat_section(c); - c += reserve_pecoff_reloc_section(c); - /* Pad unused space with zeros */ - setup_sectors = (c + 511) / 512; + setup_sectors = (c + 4095) / 4096; + setup_sectors *= 8; if (setup_sectors < SETUP_SECT_MIN) setup_sectors = SETUP_SECT_MIN; i = setup_sectors*512; memset(buf+c, 0, i-c); - update_pecoff_setup_and_reloc(i); - - /* Set the default root device */ - put_unaligned_le16(DEFAULT_ROOT_DEV, &buf[508]); - /* Open and stat the kernel file */ fd = open(argv[2], O_RDONLY); if (fd < 0) die("Unable to open `%s': %m", argv[2]); if (fstat(fd, &sb)) die("Unable to stat `%s': %m", argv[2]); - sz = sb.st_size; + if (_edata != sb.st_size) + die("Unexpected file size `%s': %u != %u", argv[2], _edata, + sb.st_size); + sz = _edata - 4; kernel = mmap(NULL, sz, PROT_READ, MAP_SHARED, fd, 0); if (kernel == MAP_FAILED) die("Unable to mmap '%s': %m", argv[2]); - /* Number of 16-byte paragraphs, including space for a 4-byte CRC */ - sys_size = (sz + 15 + 4) / 16; -#ifdef CONFIG_EFI_STUB - /* - * COFF requires minimum 32-byte alignment of sections, and - * adding a signature is problematic without that alignment. - */ - sys_size = (sys_size + 1) & ~1; -#endif - - /* Patch the setup code with the appropriate size parameters */ - buf[0x1f1] = setup_sectors-1; - put_unaligned_le32(sys_size, &buf[0x1f4]); - - init_sz = get_unaligned_le32(&buf[0x260]); -#ifdef CONFIG_EFI_STUB - /* - * The decompression buffer will start at ImageBase. When relocating - * the compressed kernel to its end, we must ensure that the head - * section does not get overwritten. The head section occupies - * [i, i + _ehead), and the destination is [init_sz - _end, init_sz). - * - * At present these should never overlap, because 'i' is at most 32k - * because of SETUP_SECT_MAX, '_ehead' is less than 1k, and the - * calculation of INIT_SIZE in boot/header.S ensures that - * 'init_sz - _end' is at least 64k. - * - * For future-proofing, increase init_sz if necessary. - */ - - if (init_sz - _end < i + _ehead) { - init_sz = (i + _ehead + _end + 4095) & ~4095; - put_unaligned_le32(init_sz, &buf[0x260]); - } -#endif - update_pecoff_text(setup_sectors * 512, i + (sys_size * 16), init_sz); - - efi_stub_entry_update(); - - /* Update kernel_info offset. */ - put_unaligned_le32(kernel_info, &buf[0x268]); crc = partial_crc32(buf, i, crc); if (fwrite(buf, 1, i, dest) != i) @@ -479,13 +231,6 @@ int main(int argc, char ** argv) if (fwrite(kernel, 1, sz, dest) != sz) die("Writing kernel failed"); - /* Add padding leaving 4 bytes for the checksum */ - while (sz++ < (sys_size*16) - 4) { - crc = partial_crc32_one('\0', crc); - if (fwrite("\0", 1, 1, dest) != 1) - die("Writing padding failed"); - } - /* Write the CRC */ put_unaligned_le32(crc, buf); if (fwrite(buf, 1, 4, dest) != 4) diff --git a/arch/x86/coco/tdx/tdcall.S b/arch/x86/coco/tdx/tdcall.S index b193c0a1d8db..52d9786da308 100644 --- a/arch/x86/coco/tdx/tdcall.S +++ b/arch/x86/coco/tdx/tdcall.S @@ -1,239 +1,63 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <asm/asm-offsets.h> #include <asm/asm.h> -#include <asm/frame.h> -#include <asm/unwind_hints.h> #include <linux/linkage.h> -#include <linux/bits.h> #include <linux/errno.h> #include "../../virt/vmx/tdx/tdxcall.S" -/* - * Bitmasks of exposed registers (with VMM). - */ -#define TDX_RDX BIT(2) -#define TDX_RBX BIT(3) -#define TDX_RSI BIT(6) -#define TDX_RDI BIT(7) -#define TDX_R8 BIT(8) -#define TDX_R9 BIT(9) -#define TDX_R10 BIT(10) -#define TDX_R11 BIT(11) -#define TDX_R12 BIT(12) -#define TDX_R13 BIT(13) -#define TDX_R14 BIT(14) -#define TDX_R15 BIT(15) - -/* - * These registers are clobbered to hold arguments for each - * TDVMCALL. They are safe to expose to the VMM. - * Each bit in this mask represents a register ID. Bit field - * details can be found in TDX GHCI specification, section - * titled "TDCALL [TDG.VP.VMCALL] leaf". - */ -#define TDVMCALL_EXPOSE_REGS_MASK \ - ( TDX_RDX | TDX_RBX | TDX_RSI | TDX_RDI | TDX_R8 | TDX_R9 | \ - TDX_R10 | TDX_R11 | TDX_R12 | TDX_R13 | TDX_R14 | TDX_R15 ) - .section .noinstr.text, "ax" /* - * __tdx_module_call() - Used by TDX guests to request services from - * the TDX module (does not include VMM services) using TDCALL instruction. - * - * Transforms function call register arguments into the TDCALL register ABI. - * After TDCALL operation, TDX module output is saved in @out (if it is - * provided by the user). - * - *------------------------------------------------------------------------- - * TDCALL ABI: - *------------------------------------------------------------------------- - * Input Registers: - * - * RAX - TDCALL Leaf number. - * RCX,RDX,R8-R9 - TDCALL Leaf specific input registers. - * - * Output Registers: + * __tdcall() - Used by TDX guests to request services from the TDX + * module (does not include VMM services) using TDCALL instruction. * - * RAX - TDCALL instruction error code. - * RCX,RDX,R8-R11 - TDCALL Leaf specific output registers. + * __tdcall() function ABI: * - *------------------------------------------------------------------------- + * @fn (RDI) - TDCALL Leaf ID, moved to RAX + * @args (RSI) - struct tdx_module_args for input * - * __tdx_module_call() function ABI: - * - * @fn (RDI) - TDCALL Leaf ID, moved to RAX - * @rcx (RSI) - Input parameter 1, moved to RCX - * @rdx (RDX) - Input parameter 2, moved to RDX - * @r8 (RCX) - Input parameter 3, moved to R8 - * @r9 (R8) - Input parameter 4, moved to R9 - * - * @out (R9) - struct tdx_module_output pointer - * stored temporarily in R12 (not - * shared with the TDX module). It - * can be NULL. + * Only RCX/RDX/R8-R11 are used as input registers. * * Return status of TDCALL via RAX. */ -SYM_FUNC_START(__tdx_module_call) - FRAME_BEGIN +SYM_FUNC_START(__tdcall) TDX_MODULE_CALL host=0 - FRAME_END - RET -SYM_FUNC_END(__tdx_module_call) +SYM_FUNC_END(__tdcall) /* - * TDX_HYPERCALL - Make hypercalls to a TDX VMM using TDVMCALL leaf of TDCALL - * instruction - * - * Transforms values in function call argument struct tdx_hypercall_args @args - * into the TDCALL register ABI. After TDCALL operation, VMM output is saved - * back in @args, if \ret is 1. - * - *------------------------------------------------------------------------- - * TD VMCALL ABI: - *------------------------------------------------------------------------- - * - * Input Registers: + * __tdcall_ret() - Used by TDX guests to request services from the TDX + * module (does not include VMM services) using TDCALL instruction, with + * saving output registers to the 'struct tdx_module_args' used as input. * - * RAX - TDCALL instruction leaf number (0 - TDG.VP.VMCALL) - * RCX - BITMAP which controls which part of TD Guest GPR - * is passed as-is to the VMM and back. - * R10 - Set 0 to indicate TDCALL follows standard TDX ABI - * specification. Non zero value indicates vendor - * specific ABI. - * R11 - VMCALL sub function number - * RBX, RDX, RDI, RSI - Used to pass VMCALL sub function specific arguments. - * R8-R9, R12-R15 - Same as above. + * __tdcall_ret() function ABI: * - * Output Registers: + * @fn (RDI) - TDCALL Leaf ID, moved to RAX + * @args (RSI) - struct tdx_module_args for input and output * - * RAX - TDCALL instruction status (Not related to hypercall - * output). - * RBX, RDX, RDI, RSI - Hypercall sub function specific output values. - * R8-R15 - Same as above. + * Only RCX/RDX/R8-R11 are used as input/output registers. * + * Return status of TDCALL via RAX. */ -.macro TDX_HYPERCALL ret:req - FRAME_BEGIN - - /* Save callee-saved GPRs as mandated by the x86_64 ABI */ - push %r15 - push %r14 - push %r13 - push %r12 - push %rbx - - /* Free RDI to be used as TDVMCALL arguments */ - movq %rdi, %rax - - /* Copy hypercall registers from arg struct: */ - movq TDX_HYPERCALL_r8(%rax), %r8 - movq TDX_HYPERCALL_r9(%rax), %r9 - movq TDX_HYPERCALL_r10(%rax), %r10 - movq TDX_HYPERCALL_r11(%rax), %r11 - movq TDX_HYPERCALL_r12(%rax), %r12 - movq TDX_HYPERCALL_r13(%rax), %r13 - movq TDX_HYPERCALL_r14(%rax), %r14 - movq TDX_HYPERCALL_r15(%rax), %r15 - movq TDX_HYPERCALL_rdi(%rax), %rdi - movq TDX_HYPERCALL_rsi(%rax), %rsi - movq TDX_HYPERCALL_rbx(%rax), %rbx - movq TDX_HYPERCALL_rdx(%rax), %rdx - - push %rax - - /* Mangle function call ABI into TDCALL ABI: */ - /* Set TDCALL leaf ID (TDVMCALL (0)) in RAX */ - xor %eax, %eax - - movl $TDVMCALL_EXPOSE_REGS_MASK, %ecx - - tdcall - - /* - * RAX!=0 indicates a failure of the TDVMCALL mechanism itself and that - * something has gone horribly wrong with the TDX module. - * - * The return status of the hypercall operation is in a separate - * register (in R10). Hypercall errors are a part of normal operation - * and are handled by callers. - */ - testq %rax, %rax - jne .Lpanic\@ - - pop %rax - - .if \ret - movq %r8, TDX_HYPERCALL_r8(%rax) - movq %r9, TDX_HYPERCALL_r9(%rax) - movq %r10, TDX_HYPERCALL_r10(%rax) - movq %r11, TDX_HYPERCALL_r11(%rax) - movq %r12, TDX_HYPERCALL_r12(%rax) - movq %r13, TDX_HYPERCALL_r13(%rax) - movq %r14, TDX_HYPERCALL_r14(%rax) - movq %r15, TDX_HYPERCALL_r15(%rax) - movq %rdi, TDX_HYPERCALL_rdi(%rax) - movq %rsi, TDX_HYPERCALL_rsi(%rax) - movq %rbx, TDX_HYPERCALL_rbx(%rax) - movq %rdx, TDX_HYPERCALL_rdx(%rax) - .endif - - /* TDVMCALL leaf return code is in R10 */ - movq %r10, %rax - - /* - * Zero out registers exposed to the VMM to avoid speculative execution - * with VMM-controlled values. This needs to include all registers - * present in TDVMCALL_EXPOSE_REGS_MASK, except RBX, and R12-R15 which - * will be restored. - */ - xor %r8d, %r8d - xor %r9d, %r9d - xor %r10d, %r10d - xor %r11d, %r11d - xor %rdi, %rdi - xor %rdx, %rdx - - /* Restore callee-saved GPRs as mandated by the x86_64 ABI */ - pop %rbx - pop %r12 - pop %r13 - pop %r14 - pop %r15 - - FRAME_END - - RET -.Lpanic\@: - call __tdx_hypercall_failed - /* __tdx_hypercall_failed never returns */ - REACHABLE - jmp .Lpanic\@ -.endm +SYM_FUNC_START(__tdcall_ret) + TDX_MODULE_CALL host=0 ret=1 +SYM_FUNC_END(__tdcall_ret) /* + * __tdcall_saved_ret() - Used by TDX guests to request services from the + * TDX module (including VMM services) using TDCALL instruction, with + * saving output registers to the 'struct tdx_module_args' used as input. * - * __tdx_hypercall() function ABI: - * - * @args (RDI) - struct tdx_hypercall_args for input - * - * On successful completion, return the hypercall error code. - */ -SYM_FUNC_START(__tdx_hypercall) - TDX_HYPERCALL ret=0 -SYM_FUNC_END(__tdx_hypercall) - -/* + * __tdcall_saved_ret() function ABI: * - * __tdx_hypercall_ret() function ABI: + * @fn (RDI) - TDCALL leaf ID, moved to RAX + * @args (RSI) - struct tdx_module_args for input/output * - * @args (RDI) - struct tdx_hypercall_args for input and output + * All registers in @args are used as input/output registers. * * On successful completion, return the hypercall error code. */ -SYM_FUNC_START(__tdx_hypercall_ret) - TDX_HYPERCALL ret=1 -SYM_FUNC_END(__tdx_hypercall_ret) +SYM_FUNC_START(__tdcall_saved_ret) + TDX_MODULE_CALL host=0 ret=1 saved=1 +SYM_FUNC_END(__tdcall_saved_ret) diff --git a/arch/x86/coco/tdx/tdx-shared.c b/arch/x86/coco/tdx/tdx-shared.c index ef20ddc37b58..78e413269791 100644 --- a/arch/x86/coco/tdx/tdx-shared.c +++ b/arch/x86/coco/tdx/tdx-shared.c @@ -5,7 +5,7 @@ static unsigned long try_accept_one(phys_addr_t start, unsigned long len, enum pg_level pg_level) { unsigned long accept_size = page_level_size(pg_level); - u64 tdcall_rcx; + struct tdx_module_args args = {}; u8 page_size; if (!IS_ALIGNED(start, accept_size)) @@ -34,8 +34,8 @@ static unsigned long try_accept_one(phys_addr_t start, unsigned long len, return 0; } - tdcall_rcx = start | page_size; - if (__tdx_module_call(TDX_ACCEPT_PAGE, tdcall_rcx, 0, 0, 0, NULL)) + args.rcx = start | page_size; + if (__tdcall(TDG_MEM_PAGE_ACCEPT, &args)) return 0; return accept_size; @@ -45,7 +45,7 @@ bool tdx_accept_memory(phys_addr_t start, phys_addr_t end) { /* * For shared->private conversion, accept the page using - * TDX_ACCEPT_PAGE TDX module call. + * TDG_MEM_PAGE_ACCEPT TDX module call. */ while (start < end) { unsigned long len = end - start; @@ -69,3 +69,23 @@ bool tdx_accept_memory(phys_addr_t start, phys_addr_t end) return true; } + +noinstr u64 __tdx_hypercall(struct tdx_module_args *args) +{ + /* + * For TDVMCALL explicitly set RCX to the bitmap of shared registers. + * The caller isn't expected to set @args->rcx anyway. + */ + args->rcx = TDVMCALL_EXPOSE_REGS_MASK; + + /* + * Failure of __tdcall_saved_ret() indicates a failure of the TDVMCALL + * mechanism itself and that something has gone horribly wrong with + * the TDX module. __tdx_hypercall_failed() never returns. + */ + if (__tdcall_saved_ret(TDG_VP_VMCALL, args)) + __tdx_hypercall_failed(); + + /* TDVMCALL leaf return code is in R10 */ + return args->r10; +} diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 1d6b863c42b0..d11206ceff3b 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -38,7 +38,7 @@ #define TDREPORT_SUBTYPE_0 0 /* Called from __tdx_hypercall() for unrecoverable failure */ -noinstr void __tdx_hypercall_failed(void) +noinstr void __noreturn __tdx_hypercall_failed(void) { instrumentation_begin(); panic("TDVMCALL failed. TDX module bug?"); @@ -48,7 +48,7 @@ noinstr void __tdx_hypercall_failed(void) long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2, unsigned long p3, unsigned long p4) { - struct tdx_hypercall_args args = { + struct tdx_module_args args = { .r10 = nr, .r11 = p1, .r12 = p2, @@ -66,10 +66,9 @@ EXPORT_SYMBOL_GPL(tdx_kvm_hypercall); * should only be used for calls that have no legitimate reason to fail * or where the kernel can not survive the call failing. */ -static inline void tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, - struct tdx_module_output *out) +static inline void tdcall(u64 fn, struct tdx_module_args *args) { - if (__tdx_module_call(fn, rcx, rdx, r8, r9, out)) + if (__tdcall_ret(fn, args)) panic("TDCALL %lld failed (Buggy TDX module!)\n", fn); } @@ -89,11 +88,14 @@ static inline void tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, */ int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport) { + struct tdx_module_args args = { + .rcx = virt_to_phys(tdreport), + .rdx = virt_to_phys(reportdata), + .r8 = TDREPORT_SUBTYPE_0, + }; u64 ret; - ret = __tdx_module_call(TDX_GET_REPORT, virt_to_phys(tdreport), - virt_to_phys(reportdata), TDREPORT_SUBTYPE_0, - 0, NULL); + ret = __tdcall(TDG_MR_REPORT, &args); if (ret) { if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND) return -EINVAL; @@ -106,7 +108,7 @@ EXPORT_SYMBOL_GPL(tdx_mcall_get_report0); static void __noreturn tdx_panic(const char *msg) { - struct tdx_hypercall_args args = { + struct tdx_module_args args = { .r10 = TDX_HYPERCALL_STANDARD, .r11 = TDVMCALL_REPORT_FATAL_ERROR, .r12 = 0, /* Error code: 0 is Panic */ @@ -119,7 +121,7 @@ static void __noreturn tdx_panic(const char *msg) } message; /* VMM assumes '\0' in byte 65, if the message took all 64 bytes */ - strncpy(message.str, msg, 64); + strtomem_pad(message.str, msg, '\0'); args.r8 = message.r8; args.r9 = message.r9; @@ -141,7 +143,7 @@ static void __noreturn tdx_panic(const char *msg) static void tdx_parse_tdinfo(u64 *cc_mask) { - struct tdx_module_output out; + struct tdx_module_args args = {}; unsigned int gpa_width; u64 td_attr; @@ -152,7 +154,7 @@ static void tdx_parse_tdinfo(u64 *cc_mask) * Guest-Host-Communication Interface (GHCI), section 2.4.2 TDCALL * [TDG.VP.INFO]. */ - tdx_module_call(TDX_GET_INFO, 0, 0, 0, 0, &out); + tdcall(TDG_VP_INFO, &args); /* * The highest bit of a guest physical address is the "sharing" bit. @@ -161,7 +163,7 @@ static void tdx_parse_tdinfo(u64 *cc_mask) * The GPA width that comes out of this call is critical. TDX guests * can not meaningfully run without it. */ - gpa_width = out.rcx & GENMASK(5, 0); + gpa_width = args.rcx & GENMASK(5, 0); *cc_mask = BIT_ULL(gpa_width - 1); /* @@ -169,7 +171,7 @@ static void tdx_parse_tdinfo(u64 *cc_mask) * memory. Ensure that no #VE will be delivered for accesses to * TD-private memory. Only VMM-shared memory (MMIO) will #VE. */ - td_attr = out.rdx; + td_attr = args.rdx; if (!(td_attr & ATTR_SEPT_VE_DISABLE)) { const char *msg = "TD misconfiguration: SEPT_VE_DISABLE attribute must be set."; @@ -228,7 +230,7 @@ static int ve_instr_len(struct ve_info *ve) static u64 __cpuidle __halt(const bool irq_disabled) { - struct tdx_hypercall_args args = { + struct tdx_module_args args = { .r10 = TDX_HYPERCALL_STANDARD, .r11 = hcall_func(EXIT_REASON_HLT), .r12 = irq_disabled, @@ -272,7 +274,7 @@ void __cpuidle tdx_safe_halt(void) static int read_msr(struct pt_regs *regs, struct ve_info *ve) { - struct tdx_hypercall_args args = { + struct tdx_module_args args = { .r10 = TDX_HYPERCALL_STANDARD, .r11 = hcall_func(EXIT_REASON_MSR_READ), .r12 = regs->cx, @@ -283,7 +285,7 @@ static int read_msr(struct pt_regs *regs, struct ve_info *ve) * can be found in TDX Guest-Host-Communication Interface * (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>". */ - if (__tdx_hypercall_ret(&args)) + if (__tdx_hypercall(&args)) return -EIO; regs->ax = lower_32_bits(args.r11); @@ -293,7 +295,7 @@ static int read_msr(struct pt_regs *regs, struct ve_info *ve) static int write_msr(struct pt_regs *regs, struct ve_info *ve) { - struct tdx_hypercall_args args = { + struct tdx_module_args args = { .r10 = TDX_HYPERCALL_STANDARD, .r11 = hcall_func(EXIT_REASON_MSR_WRITE), .r12 = regs->cx, @@ -313,7 +315,7 @@ static int write_msr(struct pt_regs *regs, struct ve_info *ve) static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve) { - struct tdx_hypercall_args args = { + struct tdx_module_args args = { .r10 = TDX_HYPERCALL_STANDARD, .r11 = hcall_func(EXIT_REASON_CPUID), .r12 = regs->ax, @@ -337,7 +339,7 @@ static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve) * ABI can be found in TDX Guest-Host-Communication Interface * (GHCI), section titled "VP.VMCALL<Instruction.CPUID>". */ - if (__tdx_hypercall_ret(&args)) + if (__tdx_hypercall(&args)) return -EIO; /* @@ -355,7 +357,7 @@ static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve) static bool mmio_read(int size, unsigned long addr, unsigned long *val) { - struct tdx_hypercall_args args = { + struct tdx_module_args args = { .r10 = TDX_HYPERCALL_STANDARD, .r11 = hcall_func(EXIT_REASON_EPT_VIOLATION), .r12 = size, @@ -364,8 +366,9 @@ static bool mmio_read(int size, unsigned long addr, unsigned long *val) .r15 = *val, }; - if (__tdx_hypercall_ret(&args)) + if (__tdx_hypercall(&args)) return false; + *val = args.r11; return true; } @@ -483,7 +486,7 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve) static bool handle_in(struct pt_regs *regs, int size, int port) { - struct tdx_hypercall_args args = { + struct tdx_module_args args = { .r10 = TDX_HYPERCALL_STANDARD, .r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION), .r12 = size, @@ -498,7 +501,7 @@ static bool handle_in(struct pt_regs *regs, int size, int port) * in TDX Guest-Host-Communication Interface (GHCI) section titled * "TDG.VP.VMCALL<Instruction.IO>". */ - success = !__tdx_hypercall_ret(&args); + success = !__tdx_hypercall(&args); /* Update part of the register affected by the emulated instruction */ regs->ax &= ~mask; @@ -577,7 +580,7 @@ __init bool tdx_early_handle_ve(struct pt_regs *regs) void tdx_get_ve_info(struct ve_info *ve) { - struct tdx_module_output out; + struct tdx_module_args args = {}; /* * Called during #VE handling to retrieve the #VE info from the @@ -594,15 +597,15 @@ void tdx_get_ve_info(struct ve_info *ve) * Note, the TDX module treats virtual NMIs as inhibited if the #VE * valid flag is set. It means that NMI=>#VE will not result in a #DF. */ - tdx_module_call(TDX_GET_VEINFO, 0, 0, 0, 0, &out); + tdcall(TDG_VP_VEINFO_GET, &args); /* Transfer the output parameters */ - ve->exit_reason = out.rcx; - ve->exit_qual = out.rdx; - ve->gla = out.r8; - ve->gpa = out.r9; - ve->instr_len = lower_32_bits(out.r10); - ve->instr_info = upper_32_bits(out.r10); + ve->exit_reason = args.rcx; + ve->exit_qual = args.rdx; + ve->gla = args.r8; + ve->gpa = args.r9; + ve->instr_len = lower_32_bits(args.r10); + ve->instr_info = upper_32_bits(args.r10); } /* @@ -703,14 +706,15 @@ static bool tdx_cache_flush_required(void) } /* - * Inform the VMM of the guest's intent for this physical page: shared with - * the VMM or private to the guest. The VMM is expected to change its mapping - * of the page in response. + * Notify the VMM about page mapping conversion. More info about ABI + * can be found in TDX Guest-Host-Communication Interface (GHCI), + * section "TDG.VP.VMCALL<MapGPA>". */ -static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc) +static bool tdx_map_gpa(phys_addr_t start, phys_addr_t end, bool enc) { - phys_addr_t start = __pa(vaddr); - phys_addr_t end = __pa(vaddr + numpages * PAGE_SIZE); + /* Retrying the hypercall a second time should succeed; use 3 just in case */ + const int max_retries_per_page = 3; + int retry_count = 0; if (!enc) { /* Set the shared (decrypted) bits: */ @@ -718,12 +722,51 @@ static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc) end |= cc_mkdec(0); } - /* - * Notify the VMM about page mapping conversion. More info about ABI - * can be found in TDX Guest-Host-Communication Interface (GHCI), - * section "TDG.VP.VMCALL<MapGPA>" - */ - if (_tdx_hypercall(TDVMCALL_MAP_GPA, start, end - start, 0, 0)) + while (retry_count < max_retries_per_page) { + struct tdx_module_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = TDVMCALL_MAP_GPA, + .r12 = start, + .r13 = end - start }; + + u64 map_fail_paddr; + u64 ret = __tdx_hypercall(&args); + + if (ret != TDVMCALL_STATUS_RETRY) + return !ret; + /* + * The guest must retry the operation for the pages in the + * region starting at the GPA specified in R11. R11 comes + * from the untrusted VMM. Sanity check it. + */ + map_fail_paddr = args.r11; + if (map_fail_paddr < start || map_fail_paddr >= end) + return false; + + /* "Consume" a retry without forward progress */ + if (map_fail_paddr == start) { + retry_count++; + continue; + } + + start = map_fail_paddr; + retry_count = 0; + } + + return false; +} + +/* + * Inform the VMM of the guest's intent for this physical page: shared with + * the VMM or private to the guest. The VMM is expected to change its mapping + * of the page in response. + */ +static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc) +{ + phys_addr_t start = __pa(vaddr); + phys_addr_t end = __pa(vaddr + numpages * PAGE_SIZE); + + if (!tdx_map_gpa(start, end, enc)) return false; /* shared->private conversion requires memory to be accepted before use */ @@ -759,6 +802,10 @@ static bool tdx_enc_status_change_finish(unsigned long vaddr, int numpages, void __init tdx_early_init(void) { + struct tdx_module_args args = { + .rdx = TDCS_NOTIFY_ENABLES, + .r9 = -1ULL, + }; u64 cc_mask; u32 eax, sig[3]; @@ -769,12 +816,15 @@ void __init tdx_early_init(void) setup_force_cpu_cap(X86_FEATURE_TDX_GUEST); + /* TSC is the only reliable clock in TDX guest */ + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); + cc_vendor = CC_VENDOR_INTEL; tdx_parse_tdinfo(&cc_mask); cc_set_mask(cc_mask); /* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */ - tdx_module_call(TDX_WR, 0, TDCS_NOTIFY_ENABLES, 0, -1ULL, NULL); + tdcall(TDG_VM_WR, &args); /* * All bits above GPA width are reserved and kernel treats shared bit diff --git a/arch/x86/configs/hardening.config b/arch/x86/configs/hardening.config new file mode 100644 index 000000000000..7b497f3b7bc3 --- /dev/null +++ b/arch/x86/configs/hardening.config @@ -0,0 +1,14 @@ +# Basic kernel hardening options (specific to x86) + +# Modern libc no longer needs a fixed-position mapping in userspace, remove +# it as a possible target. +CONFIG_LEGACY_VSYSCALL_NONE=y + +# Enable chip-specific IOMMU support. +CONFIG_INTEL_IOMMU=y +CONFIG_INTEL_IOMMU_DEFAULT_ON=y +CONFIG_INTEL_IOMMU_SVM=y +CONFIG_AMD_IOMMU=y + +# Enable CET Shadow Stack for userspace. +CONFIG_X86_USER_SHADOW_STACK=y diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 1b411bbf3cb0..73abbbdd26f8 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -281,4 +281,5 @@ CONFIG_PROVIDE_OHCI1394_DMA_INIT=y CONFIG_EARLY_PRINTK_DBGP=y CONFIG_DEBUG_BOOT_PARAMS=y CONFIG_UNWINDER_FRAME_POINTER=y +CONFIG_DEBUG_ENTRY=y # CONFIG_64BIT is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 409e9182bd29..61e25f6209ed 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -276,3 +276,4 @@ CONFIG_BLK_DEV_IO_TRACE=y CONFIG_PROVIDE_OHCI1394_DMA_INIT=y CONFIG_EARLY_PRINTK_DBGP=y CONFIG_DEBUG_BOOT_PARAMS=y +CONFIG_DEBUG_ENTRY=y diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 3ac7487ecad2..187f913cc239 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -672,7 +672,7 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff add %r13, %r10 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling sub $16, %r10 - # Determine if if partial block is not being filled and + # Determine if partial block is not being filled and # shift mask accordingly jge .L_no_extra_mask_1_\@ sub %r10, %r12 @@ -708,7 +708,7 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff add %r13, %r10 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling sub $16, %r10 - # Determine if if partial block is not being filled and + # Determine if partial block is not being filled and # shift mask accordingly jge .L_no_extra_mask_2_\@ sub %r10, %r12 diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 46cddd78857b..74dd230973cf 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -753,7 +753,7 @@ VARIABLE_OFFSET = 16*8 add %r13, %r10 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling sub $16, %r10 - # Determine if if partial block is not being filled and + # Determine if partial block is not being filled and # shift mask accordingly jge .L_no_extra_mask_1_\@ sub %r10, %r12 @@ -789,7 +789,7 @@ VARIABLE_OFFSET = 16*8 add %r13, %r10 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling sub $16, %r10 - # Determine if if partial block is not being filled and + # Determine if partial block is not being filled and # shift mask accordingly jge .L_no_extra_mask_2_\@ sub %r10, %r12 diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 39d6a62ac627..b1d90c25975a 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -61,8 +61,8 @@ struct generic_gcmaes_ctx { }; struct aesni_xts_ctx { - u8 raw_tweak_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR; - u8 raw_crypt_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR; + struct crypto_aes_ctx tweak_ctx AESNI_ALIGN_ATTR; + struct crypto_aes_ctx crypt_ctx AESNI_ALIGN_ATTR; }; #define GCM_BLOCK_LEN 16 @@ -80,6 +80,13 @@ struct gcm_context_data { u8 hash_keys[GCM_BLOCK_LEN * 16]; }; +static inline void *aes_align_addr(void *addr) +{ + if (crypto_tfm_ctx_alignment() >= AESNI_ALIGN) + return addr; + return PTR_ALIGN(addr, AESNI_ALIGN); +} + asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, unsigned int key_len); asmlinkage void aesni_enc(const void *ctx, u8 *out, const u8 *in); @@ -201,32 +208,24 @@ static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx2); static inline struct aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) { - unsigned long align = AESNI_ALIGN; - - if (align <= crypto_tfm_ctx_alignment()) - align = 1; - return PTR_ALIGN(crypto_aead_ctx(tfm), align); + return aes_align_addr(crypto_aead_ctx(tfm)); } static inline struct generic_gcmaes_ctx *generic_gcmaes_ctx_get(struct crypto_aead *tfm) { - unsigned long align = AESNI_ALIGN; - - if (align <= crypto_tfm_ctx_alignment()) - align = 1; - return PTR_ALIGN(crypto_aead_ctx(tfm), align); + return aes_align_addr(crypto_aead_ctx(tfm)); } #endif static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) { - unsigned long addr = (unsigned long)raw_ctx; - unsigned long align = AESNI_ALIGN; + return aes_align_addr(raw_ctx); +} - if (align <= crypto_tfm_ctx_alignment()) - align = 1; - return (struct crypto_aes_ctx *)ALIGN(addr, align); +static inline struct aesni_xts_ctx *aes_xts_ctx(struct crypto_skcipher *tfm) +{ + return aes_align_addr(crypto_skcipher_ctx(tfm)); } static int aes_set_key_common(struct crypto_aes_ctx *ctx, @@ -881,7 +880,7 @@ static int helper_rfc4106_decrypt(struct aead_request *req) static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { - struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); + struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm); int err; err = xts_verify_key(tfm, key, keylen); @@ -891,19 +890,18 @@ static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key, keylen /= 2; /* first half of xts-key is for crypt */ - err = aes_set_key_common(aes_ctx(ctx->raw_crypt_ctx), key, keylen); + err = aes_set_key_common(&ctx->crypt_ctx, key, keylen); if (err) return err; /* second half of xts-key is for tweak */ - return aes_set_key_common(aes_ctx(ctx->raw_tweak_ctx), key + keylen, - keylen); + return aes_set_key_common(&ctx->tweak_ctx, key + keylen, keylen); } static int xts_crypt(struct skcipher_request *req, bool encrypt) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); + struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm); int tail = req->cryptlen % AES_BLOCK_SIZE; struct skcipher_request subreq; struct skcipher_walk walk; @@ -939,7 +937,7 @@ static int xts_crypt(struct skcipher_request *req, bool encrypt) kernel_fpu_begin(); /* calculate first value of T */ - aesni_enc(aes_ctx(ctx->raw_tweak_ctx), walk.iv, walk.iv); + aesni_enc(&ctx->tweak_ctx, walk.iv, walk.iv); while (walk.nbytes > 0) { int nbytes = walk.nbytes; @@ -948,11 +946,11 @@ static int xts_crypt(struct skcipher_request *req, bool encrypt) nbytes &= ~(AES_BLOCK_SIZE - 1); if (encrypt) - aesni_xts_encrypt(aes_ctx(ctx->raw_crypt_ctx), + aesni_xts_encrypt(&ctx->crypt_ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes, walk.iv); else - aesni_xts_decrypt(aes_ctx(ctx->raw_crypt_ctx), + aesni_xts_decrypt(&ctx->crypt_ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes, walk.iv); kernel_fpu_end(); @@ -980,11 +978,11 @@ static int xts_crypt(struct skcipher_request *req, bool encrypt) kernel_fpu_begin(); if (encrypt) - aesni_xts_encrypt(aes_ctx(ctx->raw_crypt_ctx), + aesni_xts_encrypt(&ctx->crypt_ctx, walk.dst.virt.addr, walk.src.virt.addr, walk.nbytes, walk.iv); else - aesni_xts_decrypt(aes_ctx(ctx->raw_crypt_ctx), + aesni_xts_decrypt(&ctx->crypt_ctx, walk.dst.virt.addr, walk.src.virt.addr, walk.nbytes, walk.iv); kernel_fpu_end(); diff --git a/arch/x86/crypto/nhpoly1305-avx2-glue.c b/arch/x86/crypto/nhpoly1305-avx2-glue.c index 46b036204ed9..c3a872f4d6a7 100644 --- a/arch/x86/crypto/nhpoly1305-avx2-glue.c +++ b/arch/x86/crypto/nhpoly1305-avx2-glue.c @@ -34,6 +34,14 @@ static int nhpoly1305_avx2_update(struct shash_desc *desc, return 0; } +static int nhpoly1305_avx2_digest(struct shash_desc *desc, + const u8 *src, unsigned int srclen, u8 *out) +{ + return crypto_nhpoly1305_init(desc) ?: + nhpoly1305_avx2_update(desc, src, srclen) ?: + crypto_nhpoly1305_final(desc, out); +} + static struct shash_alg nhpoly1305_alg = { .base.cra_name = "nhpoly1305", .base.cra_driver_name = "nhpoly1305-avx2", @@ -44,6 +52,7 @@ static struct shash_alg nhpoly1305_alg = { .init = crypto_nhpoly1305_init, .update = nhpoly1305_avx2_update, .final = crypto_nhpoly1305_final, + .digest = nhpoly1305_avx2_digest, .setkey = crypto_nhpoly1305_setkey, .descsize = sizeof(struct nhpoly1305_state), }; diff --git a/arch/x86/crypto/nhpoly1305-sse2-glue.c b/arch/x86/crypto/nhpoly1305-sse2-glue.c index 4a4970d75107..a268a8439a5c 100644 --- a/arch/x86/crypto/nhpoly1305-sse2-glue.c +++ b/arch/x86/crypto/nhpoly1305-sse2-glue.c @@ -34,6 +34,14 @@ static int nhpoly1305_sse2_update(struct shash_desc *desc, return 0; } +static int nhpoly1305_sse2_digest(struct shash_desc *desc, + const u8 *src, unsigned int srclen, u8 *out) +{ + return crypto_nhpoly1305_init(desc) ?: + nhpoly1305_sse2_update(desc, src, srclen) ?: + crypto_nhpoly1305_final(desc, out); +} + static struct shash_alg nhpoly1305_alg = { .base.cra_name = "nhpoly1305", .base.cra_driver_name = "nhpoly1305-sse2", @@ -44,6 +52,7 @@ static struct shash_alg nhpoly1305_alg = { .init = crypto_nhpoly1305_init, .update = nhpoly1305_sse2_update, .final = crypto_nhpoly1305_final, + .digest = nhpoly1305_sse2_digest, .setkey = crypto_nhpoly1305_setkey, .descsize = sizeof(struct nhpoly1305_state), }; diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 44340a1139e0..959afa705e95 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -24,8 +24,17 @@ #include <linux/types.h> #include <crypto/sha1.h> #include <crypto/sha1_base.h> +#include <asm/cpu_device_id.h> #include <asm/simd.h> +static const struct x86_cpu_id module_cpu_ids[] = { + X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL), + X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL), + X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids); + static int sha1_update(struct shash_desc *desc, const u8 *data, unsigned int len, sha1_block_fn *sha1_xform) { @@ -301,6 +310,9 @@ static inline void unregister_sha1_ni(void) { } static int __init sha1_ssse3_mod_init(void) { + if (!x86_match_cpu(module_cpu_ids)) + return -ENODEV; + if (register_sha1_ssse3()) goto fail; diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index 3a5f6be7dbba..4c0383a90e11 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -38,11 +38,20 @@ #include <crypto/sha2.h> #include <crypto/sha256_base.h> #include <linux/string.h> +#include <asm/cpu_device_id.h> #include <asm/simd.h> asmlinkage void sha256_transform_ssse3(struct sha256_state *state, const u8 *data, int blocks); +static const struct x86_cpu_id module_cpu_ids[] = { + X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL), + X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL), + X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids); + static int _sha256_update(struct shash_desc *desc, const u8 *data, unsigned int len, sha256_block_fn *sha256_xform) { @@ -98,12 +107,20 @@ static int sha256_ssse3_final(struct shash_desc *desc, u8 *out) return sha256_ssse3_finup(desc, NULL, 0, out); } +static int sha256_ssse3_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha256_base_init(desc) ?: + sha256_ssse3_finup(desc, data, len, out); +} + static struct shash_alg sha256_ssse3_algs[] = { { .digestsize = SHA256_DIGEST_SIZE, .init = sha256_base_init, .update = sha256_ssse3_update, .final = sha256_ssse3_final, .finup = sha256_ssse3_finup, + .digest = sha256_ssse3_digest, .descsize = sizeof(struct sha256_state), .base = { .cra_name = "sha256", @@ -163,12 +180,20 @@ static int sha256_avx_final(struct shash_desc *desc, u8 *out) return sha256_avx_finup(desc, NULL, 0, out); } +static int sha256_avx_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha256_base_init(desc) ?: + sha256_avx_finup(desc, data, len, out); +} + static struct shash_alg sha256_avx_algs[] = { { .digestsize = SHA256_DIGEST_SIZE, .init = sha256_base_init, .update = sha256_avx_update, .final = sha256_avx_final, .finup = sha256_avx_finup, + .digest = sha256_avx_digest, .descsize = sizeof(struct sha256_state), .base = { .cra_name = "sha256", @@ -239,12 +264,20 @@ static int sha256_avx2_final(struct shash_desc *desc, u8 *out) return sha256_avx2_finup(desc, NULL, 0, out); } +static int sha256_avx2_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha256_base_init(desc) ?: + sha256_avx2_finup(desc, data, len, out); +} + static struct shash_alg sha256_avx2_algs[] = { { .digestsize = SHA256_DIGEST_SIZE, .init = sha256_base_init, .update = sha256_avx2_update, .final = sha256_avx2_final, .finup = sha256_avx2_finup, + .digest = sha256_avx2_digest, .descsize = sizeof(struct sha256_state), .base = { .cra_name = "sha256", @@ -314,12 +347,20 @@ static int sha256_ni_final(struct shash_desc *desc, u8 *out) return sha256_ni_finup(desc, NULL, 0, out); } +static int sha256_ni_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha256_base_init(desc) ?: + sha256_ni_finup(desc, data, len, out); +} + static struct shash_alg sha256_ni_algs[] = { { .digestsize = SHA256_DIGEST_SIZE, .init = sha256_base_init, .update = sha256_ni_update, .final = sha256_ni_final, .finup = sha256_ni_finup, + .digest = sha256_ni_digest, .descsize = sizeof(struct sha256_state), .base = { .cra_name = "sha256", @@ -366,6 +407,9 @@ static inline void unregister_sha256_ni(void) { } static int __init sha256_ssse3_mod_init(void) { + if (!x86_match_cpu(module_cpu_ids)) + return -ENODEV; + if (register_sha256_ssse3()) goto fail; diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 93c60c0c9d4a..d813160b14d8 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -19,6 +19,7 @@ #include <linux/nospec.h> #include <linux/syscalls.h> #include <linux/uaccess.h> +#include <linux/init.h> #ifdef CONFIG_XEN_PV #include <xen/xen-ops.h> @@ -70,7 +71,8 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr) return false; } -__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr) +/* Returns true to return using SYSRET, or false to use IRET */ +__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr) { add_random_kstack_offset(); nr = syscall_enter_from_user_mode(regs, nr); @@ -84,6 +86,46 @@ __visible noinstr void do_syscall_64(struct pt_regs *regs, int nr) instrumentation_end(); syscall_exit_to_user_mode(regs); + + /* + * Check that the register state is valid for using SYSRET to exit + * to userspace. Otherwise use the slower but fully capable IRET + * exit path. + */ + + /* XEN PV guests always use the IRET path */ + if (cpu_feature_enabled(X86_FEATURE_XENPV)) + return false; + + /* SYSRET requires RCX == RIP and R11 == EFLAGS */ + if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags)) + return false; + + /* CS and SS must match the values set in MSR_STAR */ + if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS)) + return false; + + /* + * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP + * in kernel space. This essentially lets the user take over + * the kernel, since userspace controls RSP. + * + * TASK_SIZE_MAX covers all user-accessible addresses other than + * the deprecated vsyscall page. + */ + if (unlikely(regs->ip >= TASK_SIZE_MAX)) + return false; + + /* + * SYSRET cannot restore RF. It can restore TF, but unlike IRET, + * restoring TF results in a trap from userspace immediately after + * SYSRET. + */ + if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF))) + return false; + + /* Use SYSRET to exit to userspace */ + return true; } #endif @@ -96,6 +138,16 @@ static __always_inline int syscall_32_enter(struct pt_regs *regs) return (int)regs->orig_ax; } +#ifdef CONFIG_IA32_EMULATION +bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED); + +static int ia32_emulation_override_cmdline(char *arg) +{ + return kstrtobool(arg, &__ia32_enabled); +} +early_param("ia32_emulation", ia32_emulation_override_cmdline); +#endif + /* * Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. */ @@ -182,8 +234,8 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) return true; } -/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ -__visible noinstr long do_fast_syscall_32(struct pt_regs *regs) +/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */ +__visible noinstr bool do_fast_syscall_32(struct pt_regs *regs) { /* * Called using the internal vDSO SYSENTER/SYSCALL32 calling @@ -201,41 +253,36 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) /* Invoke the syscall. If it failed, keep it simple: use IRET. */ if (!__do_fast_syscall_32(regs)) - return 0; + return false; -#ifdef CONFIG_X86_64 /* - * Opportunistic SYSRETL: if possible, try to return using SYSRETL. - * SYSRETL is available on all 64-bit CPUs, so we don't need to - * bother with SYSEXIT. - * - * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP, - * because the ECX fixup above will ensure that this is essentially - * never the case. - */ - return regs->cs == __USER32_CS && regs->ss == __USER_DS && - regs->ip == landing_pad && - (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0; -#else - /* - * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT. - * - * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP, - * because the ECX fixup above will ensure that this is essentially - * never the case. - * - * We don't allow syscalls at all from VM86 mode, but we still - * need to check VM, because we might be returning from sys_vm86. + * Check that the register state is valid for using SYSRETL/SYSEXIT + * to exit to userspace. Otherwise use the slower but fully capable + * IRET exit path. */ - return static_cpu_has(X86_FEATURE_SEP) && - regs->cs == __USER_CS && regs->ss == __USER_DS && - regs->ip == landing_pad && - (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0; -#endif + + /* XEN PV guests always use the IRET path */ + if (cpu_feature_enabled(X86_FEATURE_XENPV)) + return false; + + /* EIP must point to the VDSO landing pad */ + if (unlikely(regs->ip != landing_pad)) + return false; + + /* CS and SS must match the values set in MSR_STAR */ + if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS)) + return false; + + /* If the TF, RF, or VM flags are set, use IRET */ + if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM))) + return false; + + /* Use SYSRETL/SYSEXIT to exit to userspace */ + return true; } -/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ -__visible noinstr long do_SYSENTER_32(struct pt_regs *regs) +/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */ +__visible noinstr bool do_SYSENTER_32(struct pt_regs *regs) { /* SYSENTER loses RSP, but the vDSO saved it in RBP. */ regs->sp = regs->bp; diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S index bfb7bcb362bc..8c8d38f0cb1d 100644 --- a/arch/x86/entry/entry.S +++ b/arch/x86/entry/entry.S @@ -3,8 +3,8 @@ * Common place for both 32- and 64-bit entry routines. */ +#include <linux/export.h> #include <linux/linkage.h> -#include <asm/export.h> #include <asm/msr-index.h> .pushsection .noinstr.text, "ax" diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 6e6af42e044a..c73047bf9f4b 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -837,7 +837,7 @@ SYM_FUNC_START(entry_SYSENTER_32) movl %esp, %eax call do_SYSENTER_32 - testl %eax, %eax + testb %al, %al jz .Lsyscall_32_done STACKLEAK_ERASE diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 43606de22511..de6469dffe3a 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -18,6 +18,7 @@ * - SYM_FUNC_START/END:Define functions in the symbol table. * - idtentry: Define exception entry points. */ +#include <linux/export.h> #include <linux/linkage.h> #include <asm/segment.h> #include <asm/cache.h> @@ -34,7 +35,6 @@ #include <asm/asm.h> #include <asm/smap.h> #include <asm/pgtable_types.h> -#include <asm/export.h> #include <asm/frame.h> #include <asm/trapnr.h> #include <asm/nospec-branch.h> @@ -126,70 +126,8 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) * In the Xen PV case we must use iret anyway. */ - ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \ - X86_FEATURE_XENPV - - movq RCX(%rsp), %rcx - movq RIP(%rsp), %r11 - - cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */ - jne swapgs_restore_regs_and_return_to_usermode - - /* - * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP - * in kernel space. This essentially lets the user take over - * the kernel, since userspace controls RSP. - * - * If width of "canonical tail" ever becomes variable, this will need - * to be updated to remain correct on both old and new CPUs. - * - * Change top bits to match most significant bit (47th or 56th bit - * depending on paging mode) in the address. - */ -#ifdef CONFIG_X86_5LEVEL - ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \ - "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57 -#else - shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx - sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx -#endif - - /* If this changed %rcx, it was not canonical */ - cmpq %rcx, %r11 - jne swapgs_restore_regs_and_return_to_usermode - - cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */ - jne swapgs_restore_regs_and_return_to_usermode - - movq R11(%rsp), %r11 - cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */ - jne swapgs_restore_regs_and_return_to_usermode - - /* - * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot - * restore RF properly. If the slowpath sets it for whatever reason, we - * need to restore it correctly. - * - * SYSRET can restore TF, but unlike IRET, restoring TF results in a - * trap from userspace immediately after SYSRET. This would cause an - * infinite loop whenever #DB happens with register state that satisfies - * the opportunistic SYSRET conditions. For example, single-stepping - * this user code: - * - * movq $stuck_here, %rcx - * pushfq - * popq %r11 - * stuck_here: - * - * would never get past 'stuck_here'. - */ - testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 - jnz swapgs_restore_regs_and_return_to_usermode - - /* nothing to check for RSP */ - - cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */ - jne swapgs_restore_regs_and_return_to_usermode + ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \ + "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV /* * We win! This label is here just for ease of understanding @@ -1163,8 +1101,8 @@ SYM_CODE_START(asm_exc_nmi) * anyway. * * To handle this case we do the following: - * Check the a special location on the stack that contains - * a variable that is set when NMIs are executing. + * Check a special location on the stack that contains a + * variable that is set when NMIs are executing. * The interrupted task's stack is also checked to see if it * is an NMI stack. * If the variable is not set and the stack is not the NMI @@ -1237,7 +1175,6 @@ SYM_CODE_START(asm_exc_nmi) */ movq %rsp, %rdi - movq $-1, %rsi call exc_nmi /* @@ -1295,8 +1232,8 @@ SYM_CODE_START(asm_exc_nmi) * end_repeat_nmi, then we are a nested NMI. We must not * modify the "iret" frame because it's being written by * the outer NMI. That's okay; the outer NMI handler is - * about to about to call exc_nmi() anyway, so we can just - * resume the outer NMI. + * about to call exc_nmi() anyway, so we can just resume + * the outer NMI. */ movq $repeat_nmi, %rdx @@ -1451,7 +1388,6 @@ end_repeat_nmi: UNWIND_HINT_REGS movq %rsp, %rdi - movq $-1, %rsi call exc_nmi /* Always restore stashed SPEC_CTRL value (see paranoid_entry) */ @@ -1511,18 +1447,16 @@ nmi_restore: iretq SYM_CODE_END(asm_exc_nmi) -#ifndef CONFIG_IA32_EMULATION /* * This handles SYSCALL from 32-bit code. There is no way to program * MSRs to fully disable 32-bit SYSCALL. */ -SYM_CODE_START(ignore_sysret) +SYM_CODE_START(entry_SYSCALL32_ignore) UNWIND_HINT_END_OF_STACK ENDBR mov $-ENOSYS, %eax sysretl -SYM_CODE_END(ignore_sysret) -#endif +SYM_CODE_END(entry_SYSCALL32_ignore) .pushsection .text, "ax" __FUNC_ALIGN diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 70150298f8bd..27c05d08558a 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -118,9 +118,6 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL) movq %rsp, %rdi call do_SYSENTER_32 - /* XEN PV guests always use IRET path */ - ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \ - "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV jmp sysret32_from_system_call .Lsysenter_fix_flags: @@ -212,13 +209,15 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL) movq %rsp, %rdi call do_fast_syscall_32 + +sysret32_from_system_call: /* XEN PV guests always use IRET path */ - ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \ + ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \ "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV - /* Opportunistic SYSRET */ -sysret32_from_system_call: /* + * Opportunistic SYSRET + * * We are not going to return to userspace from the trampoline * stack. So let's erase the thread stack right now. */ diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 2d0b1bd866ea..c8fac5205803 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -264,7 +264,7 @@ 250 i386 fadvise64 sys_ia32_fadvise64 # 251 is available for reuse (was briefly sys_set_zone_reclaim) 252 i386 exit_group sys_exit_group -253 i386 lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie +253 i386 lookup_dcookie 254 i386 epoll_create sys_epoll_create 255 i386 epoll_ctl sys_epoll_ctl 256 i386 epoll_wait sys_epoll_wait @@ -457,3 +457,7 @@ 450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node 451 i386 cachestat sys_cachestat 452 i386 fchmodat2 sys_fchmodat2 +453 i386 map_shadow_stack sys_map_shadow_stack +454 i386 futex_wake sys_futex_wake +455 i386 futex_wait sys_futex_wait +456 i386 futex_requeue sys_futex_requeue diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 1d6eee30eceb..8cb8bf68721c 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -220,7 +220,7 @@ 209 64 io_submit sys_io_submit 210 common io_cancel sys_io_cancel 211 64 get_thread_area -212 common lookup_dcookie sys_lookup_dcookie +212 common lookup_dcookie 213 common epoll_create sys_epoll_create 214 64 epoll_ctl_old 215 64 epoll_wait_old @@ -375,6 +375,9 @@ 451 common cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 453 64 map_shadow_stack sys_map_shadow_stack +454 common futex_wake sys_futex_wake +455 common futex_wait sys_futex_wait +456 common futex_requeue sys_futex_requeue # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S index ff6e7003da97..0103e103a657 100644 --- a/arch/x86/entry/thunk_32.S +++ b/arch/x86/entry/thunk_32.S @@ -4,9 +4,9 @@ * Copyright 2008 by Steven Rostedt, Red Hat, Inc * (inspired by Andi Kleen's thunk_64.S) */ + #include <linux/export.h> #include <linux/linkage.h> #include <asm/asm.h> - #include <asm/export.h> /* put return address in eax (arg1) */ .macro THUNK name, func, put_ret_addr_in_eax=0 diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S index 27b5da2111ac..416b400f39db 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -4,10 +4,10 @@ * disturbance of register allocation in some inline assembly constructs. * Copyright 2001,2002 by Andi Kleen, SuSE Labs. */ +#include <linux/export.h> #include <linux/linkage.h> #include "calling.h" #include <asm/asm.h> -#include <asm/export.h> /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ .macro THUNK name, func diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index c197efd82922..b1b8dd1608f7 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -42,7 +42,8 @@ vdso_img-$(VDSO64-y) += 64 vdso_img-$(VDSOX32-y) += x32 vdso_img-$(VDSO32-y) += 32 -obj-$(VDSO32-y) += vdso32-setup.o +obj-$(VDSO32-y) += vdso32-setup.o +OBJECT_FILES_NON_STANDARD_vdso32-setup.o := n vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) vobjs32 := $(foreach F,$(vobjs32-y),$(obj)/$F) diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c index f3b3cacbcbb0..76e4e74f35b5 100644 --- a/arch/x86/entry/vdso/vdso32-setup.c +++ b/arch/x86/entry/vdso/vdso32-setup.c @@ -67,7 +67,6 @@ static struct ctl_table abi_table2[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static __init int ia32_binfmt_init(void) diff --git a/arch/x86/entry/vdso/vsgx.S b/arch/x86/entry/vdso/vsgx.S index d77d278ee9dd..37a3d4c02366 100644 --- a/arch/x86/entry/vdso/vsgx.S +++ b/arch/x86/entry/vdso/vsgx.S @@ -1,7 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/linkage.h> -#include <asm/export.h> #include <asm/errno.h> #include <asm/enclu.h> diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 83f15fe411b3..5bf03c575812 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -26,57 +26,66 @@ #define RDPMC_BASE_LLC 10 #define COUNTER_SHIFT 16 +#define UNCORE_NAME_LEN 16 +#define UNCORE_GROUP_MAX 256 #undef pr_fmt #define pr_fmt(fmt) "amd_uncore: " fmt static int pmu_version; -static int num_counters_llc; -static int num_counters_nb; -static bool l3_mask; -static HLIST_HEAD(uncore_unused_list); - -struct amd_uncore { - int id; +struct amd_uncore_ctx { int refcnt; int cpu; + struct perf_event **events; + struct hlist_node node; +}; + +struct amd_uncore_pmu { + char name[UNCORE_NAME_LEN]; int num_counters; int rdpmc_base; u32 msr_base; - cpumask_t *active_mask; - struct pmu *pmu; - struct perf_event **events; - struct hlist_node node; + int group; + cpumask_t active_mask; + struct pmu pmu; + struct amd_uncore_ctx * __percpu *ctx; }; -static struct amd_uncore * __percpu *amd_uncore_nb; -static struct amd_uncore * __percpu *amd_uncore_llc; +enum { + UNCORE_TYPE_DF, + UNCORE_TYPE_L3, + UNCORE_TYPE_UMC, -static struct pmu amd_nb_pmu; -static struct pmu amd_llc_pmu; + UNCORE_TYPE_MAX +}; -static cpumask_t amd_nb_active_mask; -static cpumask_t amd_llc_active_mask; +union amd_uncore_info { + struct { + u64 aux_data:32; /* auxiliary data */ + u64 num_pmcs:8; /* number of counters */ + u64 gid:8; /* group id */ + u64 cid:8; /* context id */ + } split; + u64 full; +}; -static bool is_nb_event(struct perf_event *event) -{ - return event->pmu->type == amd_nb_pmu.type; -} +struct amd_uncore { + union amd_uncore_info * __percpu info; + struct amd_uncore_pmu *pmus; + unsigned int num_pmus; + bool init_done; + void (*scan)(struct amd_uncore *uncore, unsigned int cpu); + int (*init)(struct amd_uncore *uncore, unsigned int cpu); + void (*move)(struct amd_uncore *uncore, unsigned int cpu); + void (*free)(struct amd_uncore *uncore, unsigned int cpu); +}; -static bool is_llc_event(struct perf_event *event) -{ - return event->pmu->type == amd_llc_pmu.type; -} +static struct amd_uncore uncores[UNCORE_TYPE_MAX]; -static struct amd_uncore *event_to_amd_uncore(struct perf_event *event) +static struct amd_uncore_pmu *event_to_amd_uncore_pmu(struct perf_event *event) { - if (is_nb_event(event) && amd_uncore_nb) - return *per_cpu_ptr(amd_uncore_nb, event->cpu); - else if (is_llc_event(event) && amd_uncore_llc) - return *per_cpu_ptr(amd_uncore_llc, event->cpu); - - return NULL; + return container_of(event->pmu, struct amd_uncore_pmu, pmu); } static void amd_uncore_read(struct perf_event *event) @@ -91,7 +100,16 @@ static void amd_uncore_read(struct perf_event *event) */ prev = local64_read(&hwc->prev_count); - rdpmcl(hwc->event_base_rdpmc, new); + + /* + * Some uncore PMUs do not have RDPMC assignments. In such cases, + * read counts directly from the corresponding PERF_CTR. + */ + if (hwc->event_base_rdpmc < 0) + rdmsrl(hwc->event_base, new); + else + rdpmcl(hwc->event_base_rdpmc, new); + local64_set(&hwc->prev_count, new); delta = (new << COUNTER_SHIFT) - (prev << COUNTER_SHIFT); delta >>= COUNTER_SHIFT; @@ -118,7 +136,7 @@ static void amd_uncore_stop(struct perf_event *event, int flags) hwc->state |= PERF_HES_STOPPED; if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { - amd_uncore_read(event); + event->pmu->read(event); hwc->state |= PERF_HES_UPTODATE; } } @@ -126,15 +144,16 @@ static void amd_uncore_stop(struct perf_event *event, int flags) static int amd_uncore_add(struct perf_event *event, int flags) { int i; - struct amd_uncore *uncore = event_to_amd_uncore(event); + struct amd_uncore_pmu *pmu = event_to_amd_uncore_pmu(event); + struct amd_uncore_ctx *ctx = *per_cpu_ptr(pmu->ctx, event->cpu); struct hw_perf_event *hwc = &event->hw; /* are we already assigned? */ - if (hwc->idx != -1 && uncore->events[hwc->idx] == event) + if (hwc->idx != -1 && ctx->events[hwc->idx] == event) goto out; - for (i = 0; i < uncore->num_counters; i++) { - if (uncore->events[i] == event) { + for (i = 0; i < pmu->num_counters; i++) { + if (ctx->events[i] == event) { hwc->idx = i; goto out; } @@ -142,8 +161,8 @@ static int amd_uncore_add(struct perf_event *event, int flags) /* if not, take the first available counter */ hwc->idx = -1; - for (i = 0; i < uncore->num_counters; i++) { - if (cmpxchg(&uncore->events[i], NULL, event) == NULL) { + for (i = 0; i < pmu->num_counters; i++) { + if (cmpxchg(&ctx->events[i], NULL, event) == NULL) { hwc->idx = i; break; } @@ -153,23 +172,16 @@ out: if (hwc->idx == -1) return -EBUSY; - hwc->config_base = uncore->msr_base + (2 * hwc->idx); - hwc->event_base = uncore->msr_base + 1 + (2 * hwc->idx); - hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx; + hwc->config_base = pmu->msr_base + (2 * hwc->idx); + hwc->event_base = pmu->msr_base + 1 + (2 * hwc->idx); + hwc->event_base_rdpmc = pmu->rdpmc_base + hwc->idx; hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - /* - * The first four DF counters are accessible via RDPMC index 6 to 9 - * followed by the L3 counters from index 10 to 15. For processors - * with more than four DF counters, the DF RDPMC assignments become - * discontiguous as the additional counters are accessible starting - * from index 16. - */ - if (is_nb_event(event) && hwc->idx >= NUM_COUNTERS_NB) - hwc->event_base_rdpmc += NUM_COUNTERS_L3; + if (pmu->rdpmc_base < 0) + hwc->event_base_rdpmc = -1; if (flags & PERF_EF_START) - amd_uncore_start(event, PERF_EF_RELOAD); + event->pmu->start(event, PERF_EF_RELOAD); return 0; } @@ -177,55 +189,36 @@ out: static void amd_uncore_del(struct perf_event *event, int flags) { int i; - struct amd_uncore *uncore = event_to_amd_uncore(event); + struct amd_uncore_pmu *pmu = event_to_amd_uncore_pmu(event); + struct amd_uncore_ctx *ctx = *per_cpu_ptr(pmu->ctx, event->cpu); struct hw_perf_event *hwc = &event->hw; - amd_uncore_stop(event, PERF_EF_UPDATE); + event->pmu->stop(event, PERF_EF_UPDATE); - for (i = 0; i < uncore->num_counters; i++) { - if (cmpxchg(&uncore->events[i], event, NULL) == event) + for (i = 0; i < pmu->num_counters; i++) { + if (cmpxchg(&ctx->events[i], event, NULL) == event) break; } hwc->idx = -1; } -/* - * Return a full thread and slice mask unless user - * has provided them - */ -static u64 l3_thread_slice_mask(u64 config) -{ - if (boot_cpu_data.x86 <= 0x18) - return ((config & AMD64_L3_SLICE_MASK) ? : AMD64_L3_SLICE_MASK) | - ((config & AMD64_L3_THREAD_MASK) ? : AMD64_L3_THREAD_MASK); - - /* - * If the user doesn't specify a threadmask, they're not trying to - * count core 0, so we enable all cores & threads. - * We'll also assume that they want to count slice 0 if they specify - * a threadmask and leave sliceid and enallslices unpopulated. - */ - if (!(config & AMD64_L3_F19H_THREAD_MASK)) - return AMD64_L3_F19H_THREAD_MASK | AMD64_L3_EN_ALL_SLICES | - AMD64_L3_EN_ALL_CORES; - - return config & (AMD64_L3_F19H_THREAD_MASK | AMD64_L3_SLICEID_MASK | - AMD64_L3_EN_ALL_CORES | AMD64_L3_EN_ALL_SLICES | - AMD64_L3_COREID_MASK); -} - static int amd_uncore_event_init(struct perf_event *event) { - struct amd_uncore *uncore; + struct amd_uncore_pmu *pmu; + struct amd_uncore_ctx *ctx; struct hw_perf_event *hwc = &event->hw; - u64 event_mask = AMD64_RAW_EVENT_MASK_NB; if (event->attr.type != event->pmu->type) return -ENOENT; - if (pmu_version >= 2 && is_nb_event(event)) - event_mask = AMD64_PERFMON_V2_RAW_EVENT_MASK_NB; + if (event->cpu < 0) + return -EINVAL; + + pmu = event_to_amd_uncore_pmu(event); + ctx = *per_cpu_ptr(pmu->ctx, event->cpu); + if (!ctx) + return -ENODEV; /* * NB and Last level cache counters (MSRs) are shared across all cores @@ -235,28 +228,14 @@ static int amd_uncore_event_init(struct perf_event *event) * out. So we do not support sampling and per-thread events via * CAP_NO_INTERRUPT, and we do not enable counter overflow interrupts: */ - hwc->config = event->attr.config & event_mask; + hwc->config = event->attr.config; hwc->idx = -1; - if (event->cpu < 0) - return -EINVAL; - - /* - * SliceMask and ThreadMask need to be set for certain L3 events. - * For other events, the two fields do not affect the count. - */ - if (l3_mask && is_llc_event(event)) - hwc->config |= l3_thread_slice_mask(event->attr.config); - - uncore = event_to_amd_uncore(event); - if (!uncore) - return -ENODEV; - /* * since request can come in to any of the shared cores, we will remap * to a single common cpu. */ - event->cpu = uncore->cpu; + event->cpu = ctx->cpu; return 0; } @@ -278,17 +257,10 @@ static ssize_t amd_uncore_attr_show_cpumask(struct device *dev, struct device_attribute *attr, char *buf) { - cpumask_t *active_mask; - struct pmu *pmu = dev_get_drvdata(dev); + struct pmu *ptr = dev_get_drvdata(dev); + struct amd_uncore_pmu *pmu = container_of(ptr, struct amd_uncore_pmu, pmu); - if (pmu->type == amd_nb_pmu.type) - active_mask = &amd_nb_active_mask; - else if (pmu->type == amd_llc_pmu.type) - active_mask = &amd_llc_active_mask; - else - return 0; - - return cpumap_print_to_pagebuf(true, buf, active_mask); + return cpumap_print_to_pagebuf(true, buf, &pmu->active_mask); } static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL); @@ -315,7 +287,7 @@ static struct device_attribute format_attr_##_var = \ DEFINE_UNCORE_FORMAT_ATTR(event12, event, "config:0-7,32-35"); DEFINE_UNCORE_FORMAT_ATTR(event14, event, "config:0-7,32-35,59-60"); /* F17h+ DF */ DEFINE_UNCORE_FORMAT_ATTR(event14v2, event, "config:0-7,32-37"); /* PerfMonV2 DF */ -DEFINE_UNCORE_FORMAT_ATTR(event8, event, "config:0-7"); /* F17h+ L3 */ +DEFINE_UNCORE_FORMAT_ATTR(event8, event, "config:0-7"); /* F17h+ L3, PerfMonV2 UMC */ DEFINE_UNCORE_FORMAT_ATTR(umask8, umask, "config:8-15"); DEFINE_UNCORE_FORMAT_ATTR(umask12, umask, "config:8-15,24-27"); /* PerfMonV2 DF */ DEFINE_UNCORE_FORMAT_ATTR(coreid, coreid, "config:42-44"); /* F19h L3 */ @@ -325,6 +297,7 @@ DEFINE_UNCORE_FORMAT_ATTR(threadmask2, threadmask, "config:56-57"); /* F19h L DEFINE_UNCORE_FORMAT_ATTR(enallslices, enallslices, "config:46"); /* F19h L3 */ DEFINE_UNCORE_FORMAT_ATTR(enallcores, enallcores, "config:47"); /* F19h L3 */ DEFINE_UNCORE_FORMAT_ATTR(sliceid, sliceid, "config:48-50"); /* F19h L3 */ +DEFINE_UNCORE_FORMAT_ATTR(rdwrmask, rdwrmask, "config:8-9"); /* PerfMonV2 UMC */ /* Common DF and NB attributes */ static struct attribute *amd_uncore_df_format_attr[] = { @@ -341,6 +314,13 @@ static struct attribute *amd_uncore_l3_format_attr[] = { NULL, }; +/* Common UMC attributes */ +static struct attribute *amd_uncore_umc_format_attr[] = { + &format_attr_event8.attr, /* event */ + &format_attr_rdwrmask.attr, /* rdwrmask */ + NULL, +}; + /* F17h unique L3 attributes */ static struct attribute *amd_f17h_uncore_l3_format_attr[] = { &format_attr_slicemask.attr, /* slicemask */ @@ -378,6 +358,11 @@ static struct attribute_group amd_f19h_uncore_l3_format_group = { .is_visible = amd_f19h_uncore_is_visible, }; +static struct attribute_group amd_uncore_umc_format_group = { + .name = "format", + .attrs = amd_uncore_umc_format_attr, +}; + static const struct attribute_group *amd_uncore_df_attr_groups[] = { &amd_uncore_attr_group, &amd_uncore_df_format_group, @@ -396,259 +381,636 @@ static const struct attribute_group *amd_uncore_l3_attr_update[] = { NULL, }; -static struct pmu amd_nb_pmu = { - .task_ctx_nr = perf_invalid_context, - .attr_groups = amd_uncore_df_attr_groups, - .name = "amd_nb", - .event_init = amd_uncore_event_init, - .add = amd_uncore_add, - .del = amd_uncore_del, - .start = amd_uncore_start, - .stop = amd_uncore_stop, - .read = amd_uncore_read, - .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT, - .module = THIS_MODULE, +static const struct attribute_group *amd_uncore_umc_attr_groups[] = { + &amd_uncore_attr_group, + &amd_uncore_umc_format_group, + NULL, }; -static struct pmu amd_llc_pmu = { - .task_ctx_nr = perf_invalid_context, - .attr_groups = amd_uncore_l3_attr_groups, - .attr_update = amd_uncore_l3_attr_update, - .name = "amd_l2", - .event_init = amd_uncore_event_init, - .add = amd_uncore_add, - .del = amd_uncore_del, - .start = amd_uncore_start, - .stop = amd_uncore_stop, - .read = amd_uncore_read, - .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT, - .module = THIS_MODULE, -}; +static __always_inline +int amd_uncore_ctx_cid(struct amd_uncore *uncore, unsigned int cpu) +{ + union amd_uncore_info *info = per_cpu_ptr(uncore->info, cpu); + return info->split.cid; +} -static struct amd_uncore *amd_uncore_alloc(unsigned int cpu) +static __always_inline +int amd_uncore_ctx_gid(struct amd_uncore *uncore, unsigned int cpu) { - return kzalloc_node(sizeof(struct amd_uncore), GFP_KERNEL, - cpu_to_node(cpu)); + union amd_uncore_info *info = per_cpu_ptr(uncore->info, cpu); + return info->split.gid; } -static inline struct perf_event ** -amd_uncore_events_alloc(unsigned int num, unsigned int cpu) +static __always_inline +int amd_uncore_ctx_num_pmcs(struct amd_uncore *uncore, unsigned int cpu) { - return kzalloc_node(sizeof(struct perf_event *) * num, GFP_KERNEL, - cpu_to_node(cpu)); + union amd_uncore_info *info = per_cpu_ptr(uncore->info, cpu); + return info->split.num_pmcs; } -static int amd_uncore_cpu_up_prepare(unsigned int cpu) +static void amd_uncore_ctx_free(struct amd_uncore *uncore, unsigned int cpu) { - struct amd_uncore *uncore_nb = NULL, *uncore_llc = NULL; + struct amd_uncore_pmu *pmu; + struct amd_uncore_ctx *ctx; + int i; - if (amd_uncore_nb) { - *per_cpu_ptr(amd_uncore_nb, cpu) = NULL; - uncore_nb = amd_uncore_alloc(cpu); - if (!uncore_nb) - goto fail; - uncore_nb->cpu = cpu; - uncore_nb->num_counters = num_counters_nb; - uncore_nb->rdpmc_base = RDPMC_BASE_NB; - uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL; - uncore_nb->active_mask = &amd_nb_active_mask; - uncore_nb->pmu = &amd_nb_pmu; - uncore_nb->events = amd_uncore_events_alloc(num_counters_nb, cpu); - if (!uncore_nb->events) - goto fail; - uncore_nb->id = -1; - *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb; + if (!uncore->init_done) + return; + + for (i = 0; i < uncore->num_pmus; i++) { + pmu = &uncore->pmus[i]; + ctx = *per_cpu_ptr(pmu->ctx, cpu); + if (!ctx) + continue; + + if (cpu == ctx->cpu) + cpumask_clear_cpu(cpu, &pmu->active_mask); + + if (!--ctx->refcnt) { + kfree(ctx->events); + kfree(ctx); + } + + *per_cpu_ptr(pmu->ctx, cpu) = NULL; } +} - if (amd_uncore_llc) { - *per_cpu_ptr(amd_uncore_llc, cpu) = NULL; - uncore_llc = amd_uncore_alloc(cpu); - if (!uncore_llc) - goto fail; - uncore_llc->cpu = cpu; - uncore_llc->num_counters = num_counters_llc; - uncore_llc->rdpmc_base = RDPMC_BASE_LLC; - uncore_llc->msr_base = MSR_F16H_L2I_PERF_CTL; - uncore_llc->active_mask = &amd_llc_active_mask; - uncore_llc->pmu = &amd_llc_pmu; - uncore_llc->events = amd_uncore_events_alloc(num_counters_llc, cpu); - if (!uncore_llc->events) - goto fail; - uncore_llc->id = -1; - *per_cpu_ptr(amd_uncore_llc, cpu) = uncore_llc; +static int amd_uncore_ctx_init(struct amd_uncore *uncore, unsigned int cpu) +{ + struct amd_uncore_ctx *curr, *prev; + struct amd_uncore_pmu *pmu; + int node, cid, gid, i, j; + + if (!uncore->init_done || !uncore->num_pmus) + return 0; + + cid = amd_uncore_ctx_cid(uncore, cpu); + gid = amd_uncore_ctx_gid(uncore, cpu); + + for (i = 0; i < uncore->num_pmus; i++) { + pmu = &uncore->pmus[i]; + *per_cpu_ptr(pmu->ctx, cpu) = NULL; + curr = NULL; + + /* Check for group exclusivity */ + if (gid != pmu->group) + continue; + + /* Find a sibling context */ + for_each_online_cpu(j) { + if (cpu == j) + continue; + + prev = *per_cpu_ptr(pmu->ctx, j); + if (!prev) + continue; + + if (cid == amd_uncore_ctx_cid(uncore, j)) { + curr = prev; + break; + } + } + + /* Allocate context if sibling does not exist */ + if (!curr) { + node = cpu_to_node(cpu); + curr = kzalloc_node(sizeof(*curr), GFP_KERNEL, node); + if (!curr) + goto fail; + + curr->cpu = cpu; + curr->events = kzalloc_node(sizeof(*curr->events) * + pmu->num_counters, + GFP_KERNEL, node); + if (!curr->events) { + kfree(curr); + goto fail; + } + + cpumask_set_cpu(cpu, &pmu->active_mask); + } + + curr->refcnt++; + *per_cpu_ptr(pmu->ctx, cpu) = curr; } return 0; fail: - if (uncore_nb) { - kfree(uncore_nb->events); - kfree(uncore_nb); - } - - if (uncore_llc) { - kfree(uncore_llc->events); - kfree(uncore_llc); - } + amd_uncore_ctx_free(uncore, cpu); return -ENOMEM; } -static struct amd_uncore * -amd_uncore_find_online_sibling(struct amd_uncore *this, - struct amd_uncore * __percpu *uncores) +static void amd_uncore_ctx_move(struct amd_uncore *uncore, unsigned int cpu) { - unsigned int cpu; - struct amd_uncore *that; - - for_each_online_cpu(cpu) { - that = *per_cpu_ptr(uncores, cpu); + struct amd_uncore_ctx *curr, *next; + struct amd_uncore_pmu *pmu; + int i, j; - if (!that) - continue; + if (!uncore->init_done) + return; - if (this == that) + for (i = 0; i < uncore->num_pmus; i++) { + pmu = &uncore->pmus[i]; + curr = *per_cpu_ptr(pmu->ctx, cpu); + if (!curr) continue; - if (this->id == that->id) { - hlist_add_head(&this->node, &uncore_unused_list); - this = that; - break; + /* Migrate to a shared sibling if possible */ + for_each_online_cpu(j) { + next = *per_cpu_ptr(pmu->ctx, j); + if (!next || cpu == j) + continue; + + if (curr == next) { + perf_pmu_migrate_context(&pmu->pmu, cpu, j); + cpumask_clear_cpu(cpu, &pmu->active_mask); + cpumask_set_cpu(j, &pmu->active_mask); + next->cpu = j; + break; + } } } - - this->refcnt++; - return this; } static int amd_uncore_cpu_starting(unsigned int cpu) { - unsigned int eax, ebx, ecx, edx; struct amd_uncore *uncore; + int i; + + for (i = 0; i < UNCORE_TYPE_MAX; i++) { + uncore = &uncores[i]; + uncore->scan(uncore, cpu); + } + + return 0; +} - if (amd_uncore_nb) { - uncore = *per_cpu_ptr(amd_uncore_nb, cpu); - cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - uncore->id = ecx & 0xff; +static int amd_uncore_cpu_online(unsigned int cpu) +{ + struct amd_uncore *uncore; + int i; - uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_nb); - *per_cpu_ptr(amd_uncore_nb, cpu) = uncore; + for (i = 0; i < UNCORE_TYPE_MAX; i++) { + uncore = &uncores[i]; + if (uncore->init(uncore, cpu)) + break; } - if (amd_uncore_llc) { - uncore = *per_cpu_ptr(amd_uncore_llc, cpu); - uncore->id = get_llc_id(cpu); + return 0; +} - uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_llc); - *per_cpu_ptr(amd_uncore_llc, cpu) = uncore; +static int amd_uncore_cpu_down_prepare(unsigned int cpu) +{ + struct amd_uncore *uncore; + int i; + + for (i = 0; i < UNCORE_TYPE_MAX; i++) { + uncore = &uncores[i]; + uncore->move(uncore, cpu); } return 0; } -static void uncore_clean_online(void) +static int amd_uncore_cpu_dead(unsigned int cpu) { struct amd_uncore *uncore; - struct hlist_node *n; + int i; - hlist_for_each_entry_safe(uncore, n, &uncore_unused_list, node) { - hlist_del(&uncore->node); - kfree(uncore->events); - kfree(uncore); + for (i = 0; i < UNCORE_TYPE_MAX; i++) { + uncore = &uncores[i]; + uncore->free(uncore, cpu); } + + return 0; } -static void uncore_online(unsigned int cpu, - struct amd_uncore * __percpu *uncores) +static int amd_uncore_df_event_init(struct perf_event *event) { - struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu); + struct hw_perf_event *hwc = &event->hw; + int ret = amd_uncore_event_init(event); - uncore_clean_online(); + if (ret || pmu_version < 2) + return ret; - if (cpu == uncore->cpu) - cpumask_set_cpu(cpu, uncore->active_mask); + hwc->config = event->attr.config & + (pmu_version >= 2 ? AMD64_PERFMON_V2_RAW_EVENT_MASK_NB : + AMD64_RAW_EVENT_MASK_NB); + + return 0; } -static int amd_uncore_cpu_online(unsigned int cpu) +static int amd_uncore_df_add(struct perf_event *event, int flags) { - if (amd_uncore_nb) - uncore_online(cpu, amd_uncore_nb); + int ret = amd_uncore_add(event, flags & ~PERF_EF_START); + struct hw_perf_event *hwc = &event->hw; + + if (ret) + return ret; + + /* + * The first four DF counters are accessible via RDPMC index 6 to 9 + * followed by the L3 counters from index 10 to 15. For processors + * with more than four DF counters, the DF RDPMC assignments become + * discontiguous as the additional counters are accessible starting + * from index 16. + */ + if (hwc->idx >= NUM_COUNTERS_NB) + hwc->event_base_rdpmc += NUM_COUNTERS_L3; - if (amd_uncore_llc) - uncore_online(cpu, amd_uncore_llc); + /* Delayed start after rdpmc base update */ + if (flags & PERF_EF_START) + amd_uncore_start(event, PERF_EF_RELOAD); return 0; } -static void uncore_down_prepare(unsigned int cpu, - struct amd_uncore * __percpu *uncores) +static +void amd_uncore_df_ctx_scan(struct amd_uncore *uncore, unsigned int cpu) { - unsigned int i; - struct amd_uncore *this = *per_cpu_ptr(uncores, cpu); + union cpuid_0x80000022_ebx ebx; + union amd_uncore_info info; - if (this->cpu != cpu) + if (!boot_cpu_has(X86_FEATURE_PERFCTR_NB)) return; - /* this cpu is going down, migrate to a shared sibling if possible */ - for_each_online_cpu(i) { - struct amd_uncore *that = *per_cpu_ptr(uncores, i); + info.split.aux_data = 0; + info.split.num_pmcs = NUM_COUNTERS_NB; + info.split.gid = 0; + info.split.cid = topology_die_id(cpu); - if (cpu == i) - continue; + if (pmu_version >= 2) { + ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES); + info.split.num_pmcs = ebx.split.num_df_pmc; + } - if (this == that) { - perf_pmu_migrate_context(this->pmu, cpu, i); - cpumask_clear_cpu(cpu, that->active_mask); - cpumask_set_cpu(i, that->active_mask); - that->cpu = i; - break; - } + *per_cpu_ptr(uncore->info, cpu) = info; +} + +static +int amd_uncore_df_ctx_init(struct amd_uncore *uncore, unsigned int cpu) +{ + struct attribute **df_attr = amd_uncore_df_format_attr; + struct amd_uncore_pmu *pmu; + + /* Run just once */ + if (uncore->init_done) + return amd_uncore_ctx_init(uncore, cpu); + + /* No grouping, single instance for a system */ + uncore->pmus = kzalloc(sizeof(*uncore->pmus), GFP_KERNEL); + if (!uncore->pmus) { + uncore->num_pmus = 0; + goto done; } + + /* + * For Family 17h and above, the Northbridge counters are repurposed + * as Data Fabric counters. The PMUs are exported based on family as + * either NB or DF. + */ + pmu = &uncore->pmus[0]; + strscpy(pmu->name, boot_cpu_data.x86 >= 0x17 ? "amd_df" : "amd_nb", + sizeof(pmu->name)); + pmu->num_counters = amd_uncore_ctx_num_pmcs(uncore, cpu); + pmu->msr_base = MSR_F15H_NB_PERF_CTL; + pmu->rdpmc_base = RDPMC_BASE_NB; + pmu->group = amd_uncore_ctx_gid(uncore, cpu); + + if (pmu_version >= 2) { + *df_attr++ = &format_attr_event14v2.attr; + *df_attr++ = &format_attr_umask12.attr; + } else if (boot_cpu_data.x86 >= 0x17) { + *df_attr = &format_attr_event14.attr; + } + + pmu->ctx = alloc_percpu(struct amd_uncore_ctx *); + if (!pmu->ctx) + goto done; + + pmu->pmu = (struct pmu) { + .task_ctx_nr = perf_invalid_context, + .attr_groups = amd_uncore_df_attr_groups, + .name = pmu->name, + .event_init = amd_uncore_df_event_init, + .add = amd_uncore_df_add, + .del = amd_uncore_del, + .start = amd_uncore_start, + .stop = amd_uncore_stop, + .read = amd_uncore_read, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT, + .module = THIS_MODULE, + }; + + if (perf_pmu_register(&pmu->pmu, pmu->pmu.name, -1)) { + free_percpu(pmu->ctx); + pmu->ctx = NULL; + goto done; + } + + pr_info("%d %s%s counters detected\n", pmu->num_counters, + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? "HYGON " : "", + pmu->pmu.name); + + uncore->num_pmus = 1; + +done: + uncore->init_done = true; + + return amd_uncore_ctx_init(uncore, cpu); } -static int amd_uncore_cpu_down_prepare(unsigned int cpu) +static int amd_uncore_l3_event_init(struct perf_event *event) { - if (amd_uncore_nb) - uncore_down_prepare(cpu, amd_uncore_nb); + int ret = amd_uncore_event_init(event); + struct hw_perf_event *hwc = &event->hw; + u64 config = event->attr.config; + u64 mask; - if (amd_uncore_llc) - uncore_down_prepare(cpu, amd_uncore_llc); + hwc->config = config & AMD64_RAW_EVENT_MASK_NB; + + /* + * SliceMask and ThreadMask need to be set for certain L3 events. + * For other events, the two fields do not affect the count. + */ + if (ret || boot_cpu_data.x86 < 0x17) + return ret; + + mask = config & (AMD64_L3_F19H_THREAD_MASK | AMD64_L3_SLICEID_MASK | + AMD64_L3_EN_ALL_CORES | AMD64_L3_EN_ALL_SLICES | + AMD64_L3_COREID_MASK); + + if (boot_cpu_data.x86 <= 0x18) + mask = ((config & AMD64_L3_SLICE_MASK) ? : AMD64_L3_SLICE_MASK) | + ((config & AMD64_L3_THREAD_MASK) ? : AMD64_L3_THREAD_MASK); + + /* + * If the user doesn't specify a ThreadMask, they're not trying to + * count core 0, so we enable all cores & threads. + * We'll also assume that they want to count slice 0 if they specify + * a ThreadMask and leave SliceId and EnAllSlices unpopulated. + */ + else if (!(config & AMD64_L3_F19H_THREAD_MASK)) + mask = AMD64_L3_F19H_THREAD_MASK | AMD64_L3_EN_ALL_SLICES | + AMD64_L3_EN_ALL_CORES; + + hwc->config |= mask; return 0; } -static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores) +static +void amd_uncore_l3_ctx_scan(struct amd_uncore *uncore, unsigned int cpu) { - struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu); + union amd_uncore_info info; - if (cpu == uncore->cpu) - cpumask_clear_cpu(cpu, uncore->active_mask); + if (!boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) + return; + + info.split.aux_data = 0; + info.split.num_pmcs = NUM_COUNTERS_L2; + info.split.gid = 0; + info.split.cid = per_cpu_llc_id(cpu); - if (!--uncore->refcnt) { - kfree(uncore->events); - kfree(uncore); + if (boot_cpu_data.x86 >= 0x17) + info.split.num_pmcs = NUM_COUNTERS_L3; + + *per_cpu_ptr(uncore->info, cpu) = info; +} + +static +int amd_uncore_l3_ctx_init(struct amd_uncore *uncore, unsigned int cpu) +{ + struct attribute **l3_attr = amd_uncore_l3_format_attr; + struct amd_uncore_pmu *pmu; + + /* Run just once */ + if (uncore->init_done) + return amd_uncore_ctx_init(uncore, cpu); + + /* No grouping, single instance for a system */ + uncore->pmus = kzalloc(sizeof(*uncore->pmus), GFP_KERNEL); + if (!uncore->pmus) { + uncore->num_pmus = 0; + goto done; } - *per_cpu_ptr(uncores, cpu) = NULL; + /* + * For Family 17h and above, L3 cache counters are available instead + * of L2 cache counters. The PMUs are exported based on family as + * either L2 or L3. + */ + pmu = &uncore->pmus[0]; + strscpy(pmu->name, boot_cpu_data.x86 >= 0x17 ? "amd_l3" : "amd_l2", + sizeof(pmu->name)); + pmu->num_counters = amd_uncore_ctx_num_pmcs(uncore, cpu); + pmu->msr_base = MSR_F16H_L2I_PERF_CTL; + pmu->rdpmc_base = RDPMC_BASE_LLC; + pmu->group = amd_uncore_ctx_gid(uncore, cpu); + + if (boot_cpu_data.x86 >= 0x17) { + *l3_attr++ = &format_attr_event8.attr; + *l3_attr++ = &format_attr_umask8.attr; + *l3_attr++ = boot_cpu_data.x86 >= 0x19 ? + &format_attr_threadmask2.attr : + &format_attr_threadmask8.attr; + } + + pmu->ctx = alloc_percpu(struct amd_uncore_ctx *); + if (!pmu->ctx) + goto done; + + pmu->pmu = (struct pmu) { + .task_ctx_nr = perf_invalid_context, + .attr_groups = amd_uncore_l3_attr_groups, + .attr_update = amd_uncore_l3_attr_update, + .name = pmu->name, + .event_init = amd_uncore_l3_event_init, + .add = amd_uncore_add, + .del = amd_uncore_del, + .start = amd_uncore_start, + .stop = amd_uncore_stop, + .read = amd_uncore_read, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT, + .module = THIS_MODULE, + }; + + if (perf_pmu_register(&pmu->pmu, pmu->pmu.name, -1)) { + free_percpu(pmu->ctx); + pmu->ctx = NULL; + goto done; + } + + pr_info("%d %s%s counters detected\n", pmu->num_counters, + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? "HYGON " : "", + pmu->pmu.name); + + uncore->num_pmus = 1; + +done: + uncore->init_done = true; + + return amd_uncore_ctx_init(uncore, cpu); } -static int amd_uncore_cpu_dead(unsigned int cpu) +static int amd_uncore_umc_event_init(struct perf_event *event) { - if (amd_uncore_nb) - uncore_dead(cpu, amd_uncore_nb); + struct hw_perf_event *hwc = &event->hw; + int ret = amd_uncore_event_init(event); + + if (ret) + return ret; - if (amd_uncore_llc) - uncore_dead(cpu, amd_uncore_llc); + hwc->config = event->attr.config & AMD64_PERFMON_V2_RAW_EVENT_MASK_UMC; return 0; } -static int __init amd_uncore_init(void) +static void amd_uncore_umc_start(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + if (flags & PERF_EF_RELOAD) + wrmsrl(hwc->event_base, (u64)local64_read(&hwc->prev_count)); + + hwc->state = 0; + wrmsrl(hwc->config_base, (hwc->config | AMD64_PERFMON_V2_ENABLE_UMC)); + perf_event_update_userpage(event); +} + +static +void amd_uncore_umc_ctx_scan(struct amd_uncore *uncore, unsigned int cpu) { - struct attribute **df_attr = amd_uncore_df_format_attr; - struct attribute **l3_attr = amd_uncore_l3_format_attr; union cpuid_0x80000022_ebx ebx; + union amd_uncore_info info; + unsigned int eax, ecx, edx; + + if (pmu_version < 2) + return; + + cpuid(EXT_PERFMON_DEBUG_FEATURES, &eax, &ebx.full, &ecx, &edx); + info.split.aux_data = ecx; /* stash active mask */ + info.split.num_pmcs = ebx.split.num_umc_pmc; + info.split.gid = topology_die_id(cpu); + info.split.cid = topology_die_id(cpu); + *per_cpu_ptr(uncore->info, cpu) = info; +} + +static +int amd_uncore_umc_ctx_init(struct amd_uncore *uncore, unsigned int cpu) +{ + DECLARE_BITMAP(gmask, UNCORE_GROUP_MAX) = { 0 }; + u8 group_num_pmus[UNCORE_GROUP_MAX] = { 0 }; + u8 group_num_pmcs[UNCORE_GROUP_MAX] = { 0 }; + union amd_uncore_info info; + struct amd_uncore_pmu *pmu; + int index = 0, gid, i; + + if (pmu_version < 2) + return 0; + + /* Run just once */ + if (uncore->init_done) + return amd_uncore_ctx_init(uncore, cpu); + + /* Find unique groups */ + for_each_online_cpu(i) { + info = *per_cpu_ptr(uncore->info, i); + gid = info.split.gid; + if (test_bit(gid, gmask)) + continue; + + __set_bit(gid, gmask); + group_num_pmus[gid] = hweight32(info.split.aux_data); + group_num_pmcs[gid] = info.split.num_pmcs; + uncore->num_pmus += group_num_pmus[gid]; + } + + uncore->pmus = kzalloc(sizeof(*uncore->pmus) * uncore->num_pmus, + GFP_KERNEL); + if (!uncore->pmus) { + uncore->num_pmus = 0; + goto done; + } + + for_each_set_bit(gid, gmask, UNCORE_GROUP_MAX) { + for (i = 0; i < group_num_pmus[gid]; i++) { + pmu = &uncore->pmus[index]; + snprintf(pmu->name, sizeof(pmu->name), "amd_umc_%d", index); + pmu->num_counters = group_num_pmcs[gid] / group_num_pmus[gid]; + pmu->msr_base = MSR_F19H_UMC_PERF_CTL + i * pmu->num_counters * 2; + pmu->rdpmc_base = -1; + pmu->group = gid; + + pmu->ctx = alloc_percpu(struct amd_uncore_ctx *); + if (!pmu->ctx) + goto done; + + pmu->pmu = (struct pmu) { + .task_ctx_nr = perf_invalid_context, + .attr_groups = amd_uncore_umc_attr_groups, + .name = pmu->name, + .event_init = amd_uncore_umc_event_init, + .add = amd_uncore_add, + .del = amd_uncore_del, + .start = amd_uncore_umc_start, + .stop = amd_uncore_stop, + .read = amd_uncore_read, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT, + .module = THIS_MODULE, + }; + + if (perf_pmu_register(&pmu->pmu, pmu->pmu.name, -1)) { + free_percpu(pmu->ctx); + pmu->ctx = NULL; + goto done; + } + + pr_info("%d %s counters detected\n", pmu->num_counters, + pmu->pmu.name); + + index++; + } + } + +done: + uncore->num_pmus = index; + uncore->init_done = true; + + return amd_uncore_ctx_init(uncore, cpu); +} + +static struct amd_uncore uncores[UNCORE_TYPE_MAX] = { + /* UNCORE_TYPE_DF */ + { + .scan = amd_uncore_df_ctx_scan, + .init = amd_uncore_df_ctx_init, + .move = amd_uncore_ctx_move, + .free = amd_uncore_ctx_free, + }, + /* UNCORE_TYPE_L3 */ + { + .scan = amd_uncore_l3_ctx_scan, + .init = amd_uncore_l3_ctx_init, + .move = amd_uncore_ctx_move, + .free = amd_uncore_ctx_free, + }, + /* UNCORE_TYPE_UMC */ + { + .scan = amd_uncore_umc_ctx_scan, + .init = amd_uncore_umc_ctx_init, + .move = amd_uncore_ctx_move, + .free = amd_uncore_ctx_free, + }, +}; + +static int __init amd_uncore_init(void) +{ + struct amd_uncore *uncore; int ret = -ENODEV; + int i; if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) @@ -660,125 +1022,91 @@ static int __init amd_uncore_init(void) if (boot_cpu_has(X86_FEATURE_PERFMON_V2)) pmu_version = 2; - num_counters_nb = NUM_COUNTERS_NB; - num_counters_llc = NUM_COUNTERS_L2; - if (boot_cpu_data.x86 >= 0x17) { - /* - * For F17h and above, the Northbridge counters are - * repurposed as Data Fabric counters. Also, L3 - * counters are supported too. The PMUs are exported - * based on family as either L2 or L3 and NB or DF. - */ - num_counters_llc = NUM_COUNTERS_L3; - amd_nb_pmu.name = "amd_df"; - amd_llc_pmu.name = "amd_l3"; - l3_mask = true; - } + for (i = 0; i < UNCORE_TYPE_MAX; i++) { + uncore = &uncores[i]; - if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) { - if (pmu_version >= 2) { - *df_attr++ = &format_attr_event14v2.attr; - *df_attr++ = &format_attr_umask12.attr; - } else if (boot_cpu_data.x86 >= 0x17) { - *df_attr = &format_attr_event14.attr; - } + BUG_ON(!uncore->scan); + BUG_ON(!uncore->init); + BUG_ON(!uncore->move); + BUG_ON(!uncore->free); - amd_uncore_nb = alloc_percpu(struct amd_uncore *); - if (!amd_uncore_nb) { + uncore->info = alloc_percpu(union amd_uncore_info); + if (!uncore->info) { ret = -ENOMEM; - goto fail_nb; - } - ret = perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1); - if (ret) - goto fail_nb; - - if (pmu_version >= 2) { - ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES); - num_counters_nb = ebx.split.num_df_pmc; - } - - pr_info("%d %s %s counters detected\n", num_counters_nb, - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? "HYGON" : "", - amd_nb_pmu.name); - - ret = 0; - } - - if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) { - if (boot_cpu_data.x86 >= 0x19) { - *l3_attr++ = &format_attr_event8.attr; - *l3_attr++ = &format_attr_umask8.attr; - *l3_attr++ = &format_attr_threadmask2.attr; - } else if (boot_cpu_data.x86 >= 0x17) { - *l3_attr++ = &format_attr_event8.attr; - *l3_attr++ = &format_attr_umask8.attr; - *l3_attr++ = &format_attr_threadmask8.attr; - } - - amd_uncore_llc = alloc_percpu(struct amd_uncore *); - if (!amd_uncore_llc) { - ret = -ENOMEM; - goto fail_llc; + goto fail; } - ret = perf_pmu_register(&amd_llc_pmu, amd_llc_pmu.name, -1); - if (ret) - goto fail_llc; - - pr_info("%d %s %s counters detected\n", num_counters_llc, - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? "HYGON" : "", - amd_llc_pmu.name); - ret = 0; - } + }; /* * Install callbacks. Core will call them for each online cpu. */ - if (cpuhp_setup_state(CPUHP_PERF_X86_AMD_UNCORE_PREP, - "perf/x86/amd/uncore:prepare", - amd_uncore_cpu_up_prepare, amd_uncore_cpu_dead)) - goto fail_llc; - - if (cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING, - "perf/x86/amd/uncore:starting", - amd_uncore_cpu_starting, NULL)) + ret = cpuhp_setup_state(CPUHP_PERF_X86_AMD_UNCORE_PREP, + "perf/x86/amd/uncore:prepare", + NULL, amd_uncore_cpu_dead); + if (ret) + goto fail; + + ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING, + "perf/x86/amd/uncore:starting", + amd_uncore_cpu_starting, NULL); + if (ret) goto fail_prep; - if (cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE, - "perf/x86/amd/uncore:online", - amd_uncore_cpu_online, - amd_uncore_cpu_down_prepare)) + + ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE, + "perf/x86/amd/uncore:online", + amd_uncore_cpu_online, + amd_uncore_cpu_down_prepare); + if (ret) goto fail_start; + return 0; fail_start: cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING); fail_prep: cpuhp_remove_state(CPUHP_PERF_X86_AMD_UNCORE_PREP); -fail_llc: - if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) - perf_pmu_unregister(&amd_nb_pmu); - free_percpu(amd_uncore_llc); -fail_nb: - free_percpu(amd_uncore_nb); +fail: + for (i = 0; i < UNCORE_TYPE_MAX; i++) { + uncore = &uncores[i]; + if (uncore->info) { + free_percpu(uncore->info); + uncore->info = NULL; + } + } return ret; } static void __exit amd_uncore_exit(void) { + struct amd_uncore *uncore; + struct amd_uncore_pmu *pmu; + int i, j; + cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE); cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING); cpuhp_remove_state(CPUHP_PERF_X86_AMD_UNCORE_PREP); - if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) { - perf_pmu_unregister(&amd_llc_pmu); - free_percpu(amd_uncore_llc); - amd_uncore_llc = NULL; - } + for (i = 0; i < UNCORE_TYPE_MAX; i++) { + uncore = &uncores[i]; + if (!uncore->info) + continue; + + free_percpu(uncore->info); + uncore->info = NULL; + + for (j = 0; j < uncore->num_pmus; j++) { + pmu = &uncore->pmus[j]; + if (!pmu->ctx) + continue; + + perf_pmu_unregister(&pmu->pmu); + free_percpu(pmu->ctx); + pmu->ctx = NULL; + } - if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) { - perf_pmu_unregister(&amd_nb_pmu); - free_percpu(amd_uncore_nb); - amd_uncore_nb = NULL; + kfree(uncore->pmus); + uncore->pmus = NULL; } } diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 185f902e5f28..40ad1425ffa2 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1887,9 +1887,9 @@ ssize_t events_hybrid_sysfs_show(struct device *dev, str = pmu_attr->event_str; for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { - if (!(x86_pmu.hybrid_pmu[i].cpu_type & pmu_attr->pmu_type)) + if (!(x86_pmu.hybrid_pmu[i].pmu_type & pmu_attr->pmu_type)) continue; - if (x86_pmu.hybrid_pmu[i].cpu_type & pmu->cpu_type) { + if (x86_pmu.hybrid_pmu[i].pmu_type & pmu->pmu_type) { next_str = strchr(str, ';'); if (next_str) return snprintf(page, next_str - str + 1, "%s", str); @@ -2169,7 +2169,7 @@ static int __init init_hw_perf_events(void) hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_EXTENDED_HW_TYPE; err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name, - (hybrid_pmu->cpu_type == hybrid_big) ? PERF_TYPE_RAW : -1); + (hybrid_pmu->pmu_type == hybrid_big) ? PERF_TYPE_RAW : -1); if (err) break; } diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index fa355d3658a6..a08f794a0e79 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -211,6 +211,14 @@ static struct event_constraint intel_slm_event_constraints[] __read_mostly = EVENT_CONSTRAINT_END }; +static struct event_constraint intel_grt_event_constraints[] __read_mostly = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF_TSC_P */ + EVENT_CONSTRAINT_END +}; + static struct event_constraint intel_skl_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -299,7 +307,7 @@ static struct extra_reg intel_icl_extra_regs[] __read_mostly = { EVENT_EXTRA_END }; -static struct extra_reg intel_spr_extra_regs[] __read_mostly = { +static struct extra_reg intel_glc_extra_regs[] __read_mostly = { INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0), INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1), INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), @@ -309,11 +317,12 @@ static struct extra_reg intel_spr_extra_regs[] __read_mostly = { EVENT_EXTRA_END }; -static struct event_constraint intel_spr_event_constraints[] = { +static struct event_constraint intel_glc_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x0100, 0), /* INST_RETIRED.PREC_DIST */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF_TSC_P */ FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_RETIRING, 0), METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BAD_SPEC, 1), @@ -349,7 +358,7 @@ static struct event_constraint intel_spr_event_constraints[] = { EVENT_CONSTRAINT_END }; -static struct extra_reg intel_gnr_extra_regs[] __read_mostly = { +static struct extra_reg intel_rwc_extra_regs[] __read_mostly = { INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0), INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1), INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), @@ -473,7 +482,7 @@ static u64 intel_pmu_event_map(int hw_event) return intel_perfmon_event_map[hw_event]; } -static __initconst const u64 spr_hw_cache_event_ids +static __initconst const u64 glc_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -552,7 +561,7 @@ static __initconst const u64 spr_hw_cache_event_ids }, }; -static __initconst const u64 spr_hw_cache_extra_regs +static __initconst const u64 glc_hw_cache_extra_regs [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -2556,16 +2565,6 @@ static int icl_set_topdown_event_period(struct perf_event *event) return 0; } -static int adl_set_topdown_event_period(struct perf_event *event) -{ - struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); - - if (pmu->cpu_type != hybrid_big) - return 0; - - return icl_set_topdown_event_period(event); -} - DEFINE_STATIC_CALL(intel_pmu_set_topdown_event_period, x86_perf_event_set_period); static inline u64 icl_get_metrics_event_value(u64 metric, u64 slots, int idx) @@ -2708,16 +2707,6 @@ static u64 icl_update_topdown_event(struct perf_event *event) x86_pmu.num_topdown_events - 1); } -static u64 adl_update_topdown_event(struct perf_event *event) -{ - struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); - - if (pmu->cpu_type != hybrid_big) - return 0; - - return icl_update_topdown_event(event); -} - DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, x86_perf_event_update); static void intel_pmu_read_topdown_event(struct perf_event *event) @@ -3869,7 +3858,7 @@ static inline bool require_mem_loads_aux_event(struct perf_event *event) return false; if (is_hybrid()) - return hybrid_pmu(event->pmu)->cpu_type == hybrid_big; + return hybrid_pmu(event->pmu)->pmu_type == hybrid_big; return true; } @@ -4273,7 +4262,7 @@ icl_get_event_constraints(struct cpu_hw_events *cpuc, int idx, } static struct event_constraint * -spr_get_event_constraints(struct cpu_hw_events *cpuc, int idx, +glc_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) { struct event_constraint *c; @@ -4361,9 +4350,9 @@ adl_get_event_constraints(struct cpu_hw_events *cpuc, int idx, { struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); - if (pmu->cpu_type == hybrid_big) - return spr_get_event_constraints(cpuc, idx, event); - else if (pmu->cpu_type == hybrid_small) + if (pmu->pmu_type == hybrid_big) + return glc_get_event_constraints(cpuc, idx, event); + else if (pmu->pmu_type == hybrid_small) return tnt_get_event_constraints(cpuc, idx, event); WARN_ON(1); @@ -4409,7 +4398,7 @@ rwc_get_event_constraints(struct cpu_hw_events *cpuc, int idx, { struct event_constraint *c; - c = spr_get_event_constraints(cpuc, idx, event); + c = glc_get_event_constraints(cpuc, idx, event); /* The Retire Latency is not supported by the fixed counter 0. */ if (event->attr.precise_ip && @@ -4433,9 +4422,9 @@ mtl_get_event_constraints(struct cpu_hw_events *cpuc, int idx, { struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); - if (pmu->cpu_type == hybrid_big) + if (pmu->pmu_type == hybrid_big) return rwc_get_event_constraints(cpuc, idx, event); - if (pmu->cpu_type == hybrid_small) + if (pmu->pmu_type == hybrid_small) return cmt_get_event_constraints(cpuc, idx, event); WARN_ON(1); @@ -4446,18 +4435,18 @@ static int adl_hw_config(struct perf_event *event) { struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); - if (pmu->cpu_type == hybrid_big) + if (pmu->pmu_type == hybrid_big) return hsw_hw_config(event); - else if (pmu->cpu_type == hybrid_small) + else if (pmu->pmu_type == hybrid_small) return intel_pmu_hw_config(event); WARN_ON(1); return -EOPNOTSUPP; } -static u8 adl_get_hybrid_cpu_type(void) +static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void) { - return hybrid_big; + return HYBRID_INTEL_CORE; } /* @@ -4490,7 +4479,7 @@ static void nhm_limit_period(struct perf_event *event, s64 *left) *left = max(*left, 32LL); } -static void spr_limit_period(struct perf_event *event, s64 *left) +static void glc_limit_period(struct perf_event *event, s64 *left) { if (event->attr.precise_ip == 3) *left = max(*left, 128LL); @@ -4618,6 +4607,23 @@ static void intel_pmu_check_num_counters(int *num_counters, int *num_counters_fixed, u64 *intel_ctrl, u64 fixed_mask); +static void intel_pmu_check_event_constraints(struct event_constraint *event_constraints, + int num_counters, + int num_counters_fixed, + u64 intel_ctrl); + +static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs); + +static inline bool intel_pmu_broken_perf_cap(void) +{ + /* The Perf Metric (Bit 15) is always cleared */ + if ((boot_cpu_data.x86_model == INTEL_FAM6_METEORLAKE) || + (boot_cpu_data.x86_model == INTEL_FAM6_METEORLAKE_L)) + return true; + + return false; +} + static void update_pmu_cap(struct x86_hybrid_pmu *pmu) { unsigned int sub_bitmaps = cpuid_eax(ARCH_PERFMON_EXT_LEAF); @@ -4628,27 +4634,83 @@ static void update_pmu_cap(struct x86_hybrid_pmu *pmu) &eax, &ebx, &ecx, &edx); pmu->num_counters = fls(eax); pmu->num_counters_fixed = fls(ebx); - intel_pmu_check_num_counters(&pmu->num_counters, &pmu->num_counters_fixed, - &pmu->intel_ctrl, ebx); + } + + + if (!intel_pmu_broken_perf_cap()) { + /* Perf Metric (Bit 15) and PEBS via PT (Bit 16) are hybrid enumeration */ + rdmsrl(MSR_IA32_PERF_CAPABILITIES, pmu->intel_cap.capabilities); } } -static bool init_hybrid_pmu(int cpu) +static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu) +{ + intel_pmu_check_num_counters(&pmu->num_counters, &pmu->num_counters_fixed, + &pmu->intel_ctrl, (1ULL << pmu->num_counters_fixed) - 1); + pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters); + pmu->unconstrained = (struct event_constraint) + __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, + 0, pmu->num_counters, 0, 0); + + if (pmu->intel_cap.perf_metrics) + pmu->intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; + else + pmu->intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS); + + if (pmu->intel_cap.pebs_output_pt_available) + pmu->pmu.capabilities |= PERF_PMU_CAP_AUX_OUTPUT; + else + pmu->pmu.capabilities |= ~PERF_PMU_CAP_AUX_OUTPUT; + + intel_pmu_check_event_constraints(pmu->event_constraints, + pmu->num_counters, + pmu->num_counters_fixed, + pmu->intel_ctrl); + + intel_pmu_check_extra_regs(pmu->extra_regs); +} + +static struct x86_hybrid_pmu *find_hybrid_pmu_for_cpu(void) { - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); u8 cpu_type = get_this_hybrid_cpu_type(); - struct x86_hybrid_pmu *pmu = NULL; int i; - if (!cpu_type && x86_pmu.get_hybrid_cpu_type) - cpu_type = x86_pmu.get_hybrid_cpu_type(); + /* + * This is running on a CPU model that is known to have hybrid + * configurations. But the CPU told us it is not hybrid, shame + * on it. There should be a fixup function provided for these + * troublesome CPUs (->get_hybrid_cpu_type). + */ + if (cpu_type == HYBRID_INTEL_NONE) { + if (x86_pmu.get_hybrid_cpu_type) + cpu_type = x86_pmu.get_hybrid_cpu_type(); + else + return NULL; + } + /* + * This essentially just maps between the 'hybrid_cpu_type' + * and 'hybrid_pmu_type' enums: + */ for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { - if (x86_pmu.hybrid_pmu[i].cpu_type == cpu_type) { - pmu = &x86_pmu.hybrid_pmu[i]; - break; - } + enum hybrid_pmu_type pmu_type = x86_pmu.hybrid_pmu[i].pmu_type; + + if (cpu_type == HYBRID_INTEL_CORE && + pmu_type == hybrid_big) + return &x86_pmu.hybrid_pmu[i]; + if (cpu_type == HYBRID_INTEL_ATOM && + pmu_type == hybrid_small) + return &x86_pmu.hybrid_pmu[i]; } + + return NULL; +} + +static bool init_hybrid_pmu(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + struct x86_hybrid_pmu *pmu = find_hybrid_pmu_for_cpu(); + if (WARN_ON_ONCE(!pmu || (pmu->pmu.type == -1))) { cpuc->pmu = NULL; return false; @@ -4661,6 +4723,8 @@ static bool init_hybrid_pmu(int cpu) if (this_cpu_has(X86_FEATURE_ARCH_PERFMON_EXT)) update_pmu_cap(pmu); + intel_pmu_check_hybrid_pmus(pmu); + if (!check_hw_exists(&pmu->pmu, pmu->num_counters, pmu->num_counters_fixed)) return false; @@ -5337,14 +5401,14 @@ static struct attribute *icl_tsx_events_attrs[] = { EVENT_ATTR_STR(mem-stores, mem_st_spr, "event=0xcd,umask=0x2"); EVENT_ATTR_STR(mem-loads-aux, mem_ld_aux, "event=0x03,umask=0x82"); -static struct attribute *spr_events_attrs[] = { +static struct attribute *glc_events_attrs[] = { EVENT_PTR(mem_ld_hsw), EVENT_PTR(mem_st_spr), EVENT_PTR(mem_ld_aux), NULL, }; -static struct attribute *spr_td_events_attrs[] = { +static struct attribute *glc_td_events_attrs[] = { EVENT_PTR(slots), EVENT_PTR(td_retiring), EVENT_PTR(td_bad_spec), @@ -5357,7 +5421,7 @@ static struct attribute *spr_td_events_attrs[] = { NULL, }; -static struct attribute *spr_tsx_events_attrs[] = { +static struct attribute *glc_tsx_events_attrs[] = { EVENT_PTR(tx_start), EVENT_PTR(tx_abort), EVENT_PTR(tx_commit), @@ -5699,7 +5763,7 @@ static bool is_attr_for_this_pmu(struct kobject *kobj, struct attribute *attr) struct perf_pmu_events_hybrid_attr *pmu_attr = container_of(attr, struct perf_pmu_events_hybrid_attr, attr.attr); - return pmu->cpu_type & pmu_attr->pmu_type; + return pmu->pmu_type & pmu_attr->pmu_type; } static umode_t hybrid_events_is_visible(struct kobject *kobj, @@ -5736,7 +5800,7 @@ static umode_t hybrid_format_is_visible(struct kobject *kobj, container_of(attr, struct perf_pmu_format_hybrid_attr, attr.attr); int cpu = hybrid_find_supported_cpu(pmu); - return (cpu >= 0) && (pmu->cpu_type & pmu_attr->pmu_type) ? attr->mode : 0; + return (cpu >= 0) && (pmu->pmu_type & pmu_attr->pmu_type) ? attr->mode : 0; } static struct attribute_group hybrid_group_events_td = { @@ -5880,40 +5944,105 @@ static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs) } } -static void intel_pmu_check_hybrid_pmus(u64 fixed_mask) +static const struct { enum hybrid_pmu_type id; char *name; } intel_hybrid_pmu_type_map[] __initconst = { + { hybrid_small, "cpu_atom" }, + { hybrid_big, "cpu_core" }, +}; + +static __always_inline int intel_pmu_init_hybrid(enum hybrid_pmu_type pmus) { + unsigned long pmus_mask = pmus; struct x86_hybrid_pmu *pmu; - int i; + int idx = 0, bit; - for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { - pmu = &x86_pmu.hybrid_pmu[i]; + x86_pmu.num_hybrid_pmus = hweight_long(pmus_mask); + x86_pmu.hybrid_pmu = kcalloc(x86_pmu.num_hybrid_pmus, + sizeof(struct x86_hybrid_pmu), + GFP_KERNEL); + if (!x86_pmu.hybrid_pmu) + return -ENOMEM; - intel_pmu_check_num_counters(&pmu->num_counters, - &pmu->num_counters_fixed, - &pmu->intel_ctrl, - fixed_mask); + static_branch_enable(&perf_is_hybrid); + x86_pmu.filter = intel_pmu_filter; - if (pmu->intel_cap.perf_metrics) { - pmu->intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; - pmu->intel_ctrl |= INTEL_PMC_MSK_FIXED_SLOTS; + for_each_set_bit(bit, &pmus_mask, ARRAY_SIZE(intel_hybrid_pmu_type_map)) { + pmu = &x86_pmu.hybrid_pmu[idx++]; + pmu->pmu_type = intel_hybrid_pmu_type_map[bit].id; + pmu->name = intel_hybrid_pmu_type_map[bit].name; + + pmu->num_counters = x86_pmu.num_counters; + pmu->num_counters_fixed = x86_pmu.num_counters_fixed; + pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters); + pmu->unconstrained = (struct event_constraint) + __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, + 0, pmu->num_counters, 0, 0); + + pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities; + if (pmu->pmu_type & hybrid_small) { + pmu->intel_cap.perf_metrics = 0; + pmu->intel_cap.pebs_output_pt_available = 1; + pmu->mid_ack = true; + } else if (pmu->pmu_type & hybrid_big) { + pmu->intel_cap.perf_metrics = 1; + pmu->intel_cap.pebs_output_pt_available = 0; + pmu->late_ack = true; } + } - if (pmu->intel_cap.pebs_output_pt_available) - pmu->pmu.capabilities |= PERF_PMU_CAP_AUX_OUTPUT; + return 0; +} - intel_pmu_check_event_constraints(pmu->event_constraints, - pmu->num_counters, - pmu->num_counters_fixed, - pmu->intel_ctrl); +static __always_inline void intel_pmu_ref_cycles_ext(void) +{ + if (!(x86_pmu.events_maskl & (INTEL_PMC_MSK_FIXED_REF_CYCLES >> INTEL_PMC_IDX_FIXED))) + intel_perfmon_event_map[PERF_COUNT_HW_REF_CPU_CYCLES] = 0x013c; +} - intel_pmu_check_extra_regs(pmu->extra_regs); - } +static __always_inline void intel_pmu_init_glc(struct pmu *pmu) +{ + x86_pmu.late_ack = true; + x86_pmu.limit_period = glc_limit_period; + x86_pmu.pebs_aliases = NULL; + x86_pmu.pebs_prec_dist = true; + x86_pmu.pebs_block = true; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + x86_pmu.flags |= PMU_FL_INSTR_LATENCY; + x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04); + x86_pmu.lbr_pt_coexist = true; + x86_pmu.num_topdown_events = 8; + static_call_update(intel_pmu_update_topdown_event, + &icl_update_topdown_event); + static_call_update(intel_pmu_set_topdown_event_period, + &icl_set_topdown_event_period); + + memcpy(hybrid_var(pmu, hw_cache_event_ids), glc_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hybrid_var(pmu, hw_cache_extra_regs), glc_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + hybrid(pmu, event_constraints) = intel_glc_event_constraints; + hybrid(pmu, pebs_constraints) = intel_glc_pebs_event_constraints; + + intel_pmu_ref_cycles_ext(); } -static __always_inline bool is_mtl(u8 x86_model) +static __always_inline void intel_pmu_init_grt(struct pmu *pmu) { - return (x86_model == INTEL_FAM6_METEORLAKE) || - (x86_model == INTEL_FAM6_METEORLAKE_L); + x86_pmu.mid_ack = true; + x86_pmu.limit_period = glc_limit_period; + x86_pmu.pebs_aliases = NULL; + x86_pmu.pebs_prec_dist = true; + x86_pmu.pebs_block = true; + x86_pmu.lbr_pt_coexist = true; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_INSTR_LATENCY; + + memcpy(hybrid_var(pmu, hw_cache_event_ids), glp_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hybrid_var(pmu, hw_cache_extra_regs), tnt_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + hybrid_var(pmu, hw_cache_event_ids)[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1; + hybrid(pmu, event_constraints) = intel_grt_event_constraints; + hybrid(pmu, pebs_constraints) = intel_grt_pebs_event_constraints; + hybrid(pmu, extra_regs) = intel_grt_extra_regs; + + intel_pmu_ref_cycles_ext(); } __init int intel_pmu_init(void) @@ -6194,28 +6323,10 @@ __init int intel_pmu_init(void) break; case INTEL_FAM6_ATOM_GRACEMONT: - x86_pmu.mid_ack = true; - memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, tnt_hw_cache_extra_regs, - sizeof(hw_cache_extra_regs)); - hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1; - - x86_pmu.event_constraints = intel_slm_event_constraints; - x86_pmu.pebs_constraints = intel_grt_pebs_event_constraints; - x86_pmu.extra_regs = intel_grt_extra_regs; - - x86_pmu.pebs_aliases = NULL; - x86_pmu.pebs_prec_dist = true; - x86_pmu.pebs_block = true; - x86_pmu.lbr_pt_coexist = true; - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.flags |= PMU_FL_INSTR_LATENCY; - + intel_pmu_init_grt(NULL); intel_pmu_pebs_data_source_grt(); x86_pmu.pebs_latency_data = adl_latency_data_small; x86_pmu.get_event_constraints = tnt_get_event_constraints; - x86_pmu.limit_period = spr_limit_period; td_attr = tnt_events_attrs; mem_attr = grt_mem_attrs; extra_attr = nhm_format_attr; @@ -6225,28 +6336,11 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ATOM_CRESTMONT: case INTEL_FAM6_ATOM_CRESTMONT_X: - x86_pmu.mid_ack = true; - memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, tnt_hw_cache_extra_regs, - sizeof(hw_cache_extra_regs)); - hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1; - - x86_pmu.event_constraints = intel_slm_event_constraints; - x86_pmu.pebs_constraints = intel_grt_pebs_event_constraints; + intel_pmu_init_grt(NULL); x86_pmu.extra_regs = intel_cmt_extra_regs; - - x86_pmu.pebs_aliases = NULL; - x86_pmu.pebs_prec_dist = true; - x86_pmu.lbr_pt_coexist = true; - x86_pmu.pebs_block = true; - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.flags |= PMU_FL_INSTR_LATENCY; - intel_pmu_pebs_data_source_cmt(); x86_pmu.pebs_latency_data = mtl_latency_data_small; x86_pmu.get_event_constraints = cmt_get_event_constraints; - x86_pmu.limit_period = spr_limit_period; td_attr = cmt_events_attrs; mem_attr = grt_mem_attrs; extra_attr = cmt_format_attr; @@ -6563,44 +6657,23 @@ __init int intel_pmu_init(void) case INTEL_FAM6_SAPPHIRERAPIDS_X: case INTEL_FAM6_EMERALDRAPIDS_X: x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX; - x86_pmu.extra_regs = intel_spr_extra_regs; + x86_pmu.extra_regs = intel_glc_extra_regs; fallthrough; case INTEL_FAM6_GRANITERAPIDS_X: case INTEL_FAM6_GRANITERAPIDS_D: - pmem = true; - x86_pmu.late_ack = true; - memcpy(hw_cache_event_ids, spr_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, spr_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); - - x86_pmu.event_constraints = intel_spr_event_constraints; - x86_pmu.pebs_constraints = intel_spr_pebs_event_constraints; + intel_pmu_init_glc(NULL); if (!x86_pmu.extra_regs) - x86_pmu.extra_regs = intel_gnr_extra_regs; - x86_pmu.limit_period = spr_limit_period; + x86_pmu.extra_regs = intel_rwc_extra_regs; x86_pmu.pebs_ept = 1; - x86_pmu.pebs_aliases = NULL; - x86_pmu.pebs_prec_dist = true; - x86_pmu.pebs_block = true; - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - x86_pmu.flags |= PMU_FL_INSTR_LATENCY; - x86_pmu.hw_config = hsw_hw_config; - x86_pmu.get_event_constraints = spr_get_event_constraints; + x86_pmu.get_event_constraints = glc_get_event_constraints; extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? hsw_format_attr : nhm_format_attr; extra_skl_attr = skl_format_attr; - mem_attr = spr_events_attrs; - td_attr = spr_td_events_attrs; - tsx_attr = spr_tsx_events_attrs; - x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04); - x86_pmu.lbr_pt_coexist = true; - intel_pmu_pebs_data_source_skl(pmem); - x86_pmu.num_topdown_events = 8; - static_call_update(intel_pmu_update_topdown_event, - &icl_update_topdown_event); - static_call_update(intel_pmu_set_topdown_event_period, - &icl_set_topdown_event_period); + mem_attr = glc_events_attrs; + td_attr = glc_td_events_attrs; + tsx_attr = glc_tsx_events_attrs; + intel_pmu_pebs_data_source_skl(true); pr_cont("Sapphire Rapids events, "); name = "sapphire_rapids"; break; @@ -6610,47 +6683,17 @@ __init int intel_pmu_init(void) case INTEL_FAM6_RAPTORLAKE: case INTEL_FAM6_RAPTORLAKE_P: case INTEL_FAM6_RAPTORLAKE_S: - case INTEL_FAM6_METEORLAKE: - case INTEL_FAM6_METEORLAKE_L: /* * Alder Lake has 2 types of CPU, core and atom. * * Initialize the common PerfMon capabilities here. */ - x86_pmu.hybrid_pmu = kcalloc(X86_HYBRID_NUM_PMUS, - sizeof(struct x86_hybrid_pmu), - GFP_KERNEL); - if (!x86_pmu.hybrid_pmu) - return -ENOMEM; - static_branch_enable(&perf_is_hybrid); - x86_pmu.num_hybrid_pmus = X86_HYBRID_NUM_PMUS; + intel_pmu_init_hybrid(hybrid_big_small); - x86_pmu.pebs_aliases = NULL; - x86_pmu.pebs_prec_dist = true; - x86_pmu.pebs_block = true; - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - x86_pmu.flags |= PMU_FL_INSTR_LATENCY; - x86_pmu.lbr_pt_coexist = true; x86_pmu.pebs_latency_data = adl_latency_data_small; - x86_pmu.num_topdown_events = 8; - static_call_update(intel_pmu_update_topdown_event, - &adl_update_topdown_event); - static_call_update(intel_pmu_set_topdown_event_period, - &adl_set_topdown_event_period); - - x86_pmu.filter = intel_pmu_filter; x86_pmu.get_event_constraints = adl_get_event_constraints; x86_pmu.hw_config = adl_hw_config; - x86_pmu.limit_period = spr_limit_period; x86_pmu.get_hybrid_cpu_type = adl_get_hybrid_cpu_type; - /* - * The rtm_abort_event is used to check whether to enable GPRs - * for the RTM abort event. Atom doesn't have the RTM abort - * event. There is no harmful to set it in the common - * x86_pmu.rtm_abort_event. - */ - x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04); td_attr = adl_hybrid_events_attrs; mem_attr = adl_hybrid_mem_attrs; @@ -6660,9 +6703,7 @@ __init int intel_pmu_init(void) /* Initialize big core specific PerfMon capabilities.*/ pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX]; - pmu->name = "cpu_core"; - pmu->cpu_type = hybrid_big; - pmu->late_ack = true; + intel_pmu_init_glc(&pmu->pmu); if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) { pmu->num_counters = x86_pmu.num_counters + 2; pmu->num_counters_fixed = x86_pmu.num_counters_fixed + 1; @@ -6687,54 +6728,45 @@ __init int intel_pmu_init(void) pmu->unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, 0, pmu->num_counters, 0, 0); - pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities; - pmu->intel_cap.perf_metrics = 1; - pmu->intel_cap.pebs_output_pt_available = 0; + pmu->extra_regs = intel_glc_extra_regs; + + /* Initialize Atom core specific PerfMon capabilities.*/ + pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX]; + intel_pmu_init_grt(&pmu->pmu); + + x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX; + intel_pmu_pebs_data_source_adl(); + pr_cont("Alderlake Hybrid events, "); + name = "alderlake_hybrid"; + break; + + case INTEL_FAM6_METEORLAKE: + case INTEL_FAM6_METEORLAKE_L: + intel_pmu_init_hybrid(hybrid_big_small); - memcpy(pmu->hw_cache_event_ids, spr_hw_cache_event_ids, sizeof(pmu->hw_cache_event_ids)); - memcpy(pmu->hw_cache_extra_regs, spr_hw_cache_extra_regs, sizeof(pmu->hw_cache_extra_regs)); - pmu->event_constraints = intel_spr_event_constraints; - pmu->pebs_constraints = intel_spr_pebs_event_constraints; - pmu->extra_regs = intel_spr_extra_regs; + x86_pmu.pebs_latency_data = mtl_latency_data_small; + x86_pmu.get_event_constraints = mtl_get_event_constraints; + x86_pmu.hw_config = adl_hw_config; + + td_attr = adl_hybrid_events_attrs; + mem_attr = mtl_hybrid_mem_attrs; + tsx_attr = adl_hybrid_tsx_attrs; + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? + mtl_hybrid_extra_attr_rtm : mtl_hybrid_extra_attr; + + /* Initialize big core specific PerfMon capabilities.*/ + pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX]; + intel_pmu_init_glc(&pmu->pmu); + pmu->extra_regs = intel_rwc_extra_regs; /* Initialize Atom core specific PerfMon capabilities.*/ pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX]; - pmu->name = "cpu_atom"; - pmu->cpu_type = hybrid_small; - pmu->mid_ack = true; - pmu->num_counters = x86_pmu.num_counters; - pmu->num_counters_fixed = x86_pmu.num_counters_fixed; - pmu->max_pebs_events = x86_pmu.max_pebs_events; - pmu->unconstrained = (struct event_constraint) - __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, - 0, pmu->num_counters, 0, 0); - pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities; - pmu->intel_cap.perf_metrics = 0; - pmu->intel_cap.pebs_output_pt_available = 1; - - memcpy(pmu->hw_cache_event_ids, glp_hw_cache_event_ids, sizeof(pmu->hw_cache_event_ids)); - memcpy(pmu->hw_cache_extra_regs, tnt_hw_cache_extra_regs, sizeof(pmu->hw_cache_extra_regs)); - pmu->hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1; - pmu->event_constraints = intel_slm_event_constraints; - pmu->pebs_constraints = intel_grt_pebs_event_constraints; - pmu->extra_regs = intel_grt_extra_regs; - if (is_mtl(boot_cpu_data.x86_model)) { - x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX].extra_regs = intel_gnr_extra_regs; - x86_pmu.pebs_latency_data = mtl_latency_data_small; - extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? - mtl_hybrid_extra_attr_rtm : mtl_hybrid_extra_attr; - mem_attr = mtl_hybrid_mem_attrs; - intel_pmu_pebs_data_source_mtl(); - x86_pmu.get_event_constraints = mtl_get_event_constraints; - pmu->extra_regs = intel_cmt_extra_regs; - pr_cont("Meteorlake Hybrid events, "); - name = "meteorlake_hybrid"; - } else { - x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX; - intel_pmu_pebs_data_source_adl(); - pr_cont("Alderlake Hybrid events, "); - name = "alderlake_hybrid"; - } + intel_pmu_init_grt(&pmu->pmu); + pmu->extra_regs = intel_cmt_extra_regs; + + intel_pmu_pebs_data_source_mtl(); + pr_cont("Meteorlake Hybrid events, "); + name = "meteorlake_hybrid"; break; default: @@ -6846,9 +6878,6 @@ __init int intel_pmu_init(void) if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics) x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; - if (is_hybrid()) - intel_pmu_check_hybrid_pmus((u64)fixed_mask); - if (x86_pmu.intel_cap.pebs_timing_info) x86_pmu.flags |= PMU_FL_RETIRE_LATENCY; diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 96fffb2d521d..cbeb6d2bf5b4 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -336,6 +336,9 @@ static int cstate_pmu_event_init(struct perf_event *event) cfg = array_index_nospec((unsigned long)cfg, PERF_CSTATE_PKG_EVENT_MAX); if (!(pkg_msr_mask & (1 << cfg))) return -EINVAL; + + event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; + event->hw.event_base = pkg_msr[cfg].msr; cpu = cpumask_any_and(&cstate_pkg_cpu_mask, topology_die_cpumask(event->cpu)); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index eb8dd8b8a1e8..bf97ab904d40 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -261,7 +261,7 @@ static u64 __adl_latency_data_small(struct perf_event *event, u64 status, { u64 val; - WARN_ON_ONCE(hybrid_pmu(event->pmu)->cpu_type == hybrid_big); + WARN_ON_ONCE(hybrid_pmu(event->pmu)->pmu_type == hybrid_big); dse &= PERF_PEBS_DATA_SOURCE_MASK; val = hybrid_var(event->pmu, pebs_data_source)[dse]; @@ -1058,7 +1058,7 @@ struct event_constraint intel_icl_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; -struct event_constraint intel_spr_pebs_event_constraints[] = { +struct event_constraint intel_glc_pebs_event_constraints[] = { INTEL_FLAGS_UEVENT_CONSTRAINT(0x100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */ INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 42a55794004a..8e2a12235e62 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -736,6 +736,7 @@ static bool topa_table_full(struct topa *topa) /** * topa_insert_pages() - create a list of ToPA tables * @buf: PT buffer being initialized. + * @cpu: CPU on which to allocate. * @gfp: Allocation flags. * * This initializes a list of ToPA tables with entries from @@ -1207,8 +1208,11 @@ static void pt_buffer_fini_topa(struct pt_buffer *buf) /** * pt_buffer_init_topa() - initialize ToPA table for pt buffer * @buf: PT buffer. - * @size: Total size of all regions within this ToPA. + * @cpu: CPU on which to allocate. + * @nr_pages: No. of pages to allocate. * @gfp: Allocation flags. + * + * Return: 0 on success or error code. */ static int pt_buffer_init_topa(struct pt_buffer *buf, int cpu, unsigned long nr_pages, gfp_t gfp) @@ -1281,7 +1285,7 @@ out: /** * pt_buffer_setup_aux() - set up topa tables for a PT buffer - * @cpu: Cpu on which to allocate, -1 means current. + * @event: Performance event * @pages: Array of pointers to buffer pages passed from perf core. * @nr_pages: Number of pages in the buffer. * @snapshot: If this is a snapshot/overwrite counter. diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 69043e02e8a7..01023aa5125b 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -74,7 +74,7 @@ int uncore_device_to_die(struct pci_dev *dev) struct cpuinfo_x86 *c = &cpu_data(cpu); if (c->initialized && cpu_to_node(cpu) == node) - return c->logical_die_id; + return c->topo.logical_die_id; } return -1; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index c8ba2be7585d..53dd5d495ba6 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -652,10 +652,29 @@ enum { #define PERF_PEBS_DATA_SOURCE_MAX 0x10 #define PERF_PEBS_DATA_SOURCE_MASK (PERF_PEBS_DATA_SOURCE_MAX - 1) +enum hybrid_cpu_type { + HYBRID_INTEL_NONE, + HYBRID_INTEL_ATOM = 0x20, + HYBRID_INTEL_CORE = 0x40, +}; + +enum hybrid_pmu_type { + not_hybrid, + hybrid_small = BIT(0), + hybrid_big = BIT(1), + + hybrid_big_small = hybrid_big | hybrid_small, /* only used for matching */ +}; + +#define X86_HYBRID_PMU_ATOM_IDX 0 +#define X86_HYBRID_PMU_CORE_IDX 1 + +#define X86_HYBRID_NUM_PMUS 2 + struct x86_hybrid_pmu { struct pmu pmu; const char *name; - u8 cpu_type; + enum hybrid_pmu_type pmu_type; cpumask_t supported_cpus; union perf_capabilities intel_cap; u64 intel_ctrl; @@ -721,18 +740,6 @@ extern struct static_key_false perf_is_hybrid; __Fp; \ }) -enum hybrid_pmu_type { - hybrid_big = 0x40, - hybrid_small = 0x20, - - hybrid_big_small = hybrid_big | hybrid_small, -}; - -#define X86_HYBRID_PMU_ATOM_IDX 0 -#define X86_HYBRID_PMU_CORE_IDX 1 - -#define X86_HYBRID_NUM_PMUS 2 - /* * struct x86_pmu - generic x86 pmu */ @@ -940,7 +947,7 @@ struct x86_pmu { */ int num_hybrid_pmus; struct x86_hybrid_pmu *hybrid_pmu; - u8 (*get_hybrid_cpu_type) (void); + enum hybrid_cpu_type (*get_hybrid_cpu_type) (void); }; struct x86_perf_task_context_opt { @@ -1521,7 +1528,7 @@ extern struct event_constraint intel_skl_pebs_event_constraints[]; extern struct event_constraint intel_icl_pebs_event_constraints[]; -extern struct event_constraint intel_spr_pebs_event_constraints[]; +extern struct event_constraint intel_glc_pebs_event_constraints[]; struct event_constraint *intel_pebs_constraints(struct perf_event *event); diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 1579429846cc..8d98d468b976 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -115,7 +115,7 @@ struct rapl_pmu { struct rapl_pmus { struct pmu pmu; unsigned int maxdie; - struct rapl_pmu *pmus[]; + struct rapl_pmu *pmus[] __counted_by(maxdie); }; enum rapl_unit_quirk { @@ -179,15 +179,11 @@ static u64 rapl_event_update(struct perf_event *event) s64 delta, sdelta; int shift = RAPL_CNTR_WIDTH; -again: prev_raw_count = local64_read(&hwc->prev_count); - rdmsrl(event->hw.event_base, new_raw_count); - - if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, - new_raw_count) != prev_raw_count) { - cpu_relax(); - goto again; - } + do { + rdmsrl(event->hw.event_base, new_raw_count); + } while (!local64_try_cmpxchg(&hwc->prev_count, + &prev_raw_count, new_raw_count)); /* * Now we have the new raw value and have updated the prev @@ -537,11 +533,11 @@ static struct perf_msr intel_rapl_spr_msrs[] = { * - want to use same event codes across both architectures */ static struct perf_msr amd_rapl_msrs[] = { - [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, 0, false, 0 }, + [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, NULL, false, 0 }, [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, - [PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, 0, false, 0 }, - [PERF_RAPL_PP1] = { 0, &rapl_events_gpu_group, 0, false, 0 }, - [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, 0, false, 0 }, + [PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, NULL, false, 0 }, + [PERF_RAPL_PP1] = { 0, &rapl_events_gpu_group, NULL, false, 0 }, + [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 }, }; static int rapl_cpu_offline(unsigned int cpu) diff --git a/arch/x86/events/utils.c b/arch/x86/events/utils.c index 76b1f8bb0fd5..dab4ed199227 100644 --- a/arch/x86/events/utils.c +++ b/arch/x86/events/utils.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <asm/insn.h> +#include <linux/mm.h> #include "perf_event.h" @@ -132,9 +133,9 @@ static int get_branch_type(unsigned long from, unsigned long to, int abort, * The LBR logs any address in the IP, even if the IP just * faulted. This means userspace can control the from address. * Ensure we don't blindly read any address by validating it is - * a known text address. + * a known text address and not a vsyscall address. */ - if (kernel_text_address(from)) { + if (kernel_text_address(from) && !in_gate_area_no_mm(from)) { addr = (void *)from; /* * Assume we can get the maximum possible size diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 783ed339f341..21556ad87f4b 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -7,6 +7,8 @@ * Author : K. Y. Srinivasan <kys@microsoft.com> */ +#define pr_fmt(fmt) "Hyper-V: " fmt + #include <linux/efi.h> #include <linux/types.h> #include <linux/bitfield.h> @@ -191,7 +193,7 @@ void set_hv_tscchange_cb(void (*cb)(void)) struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1}; if (!hv_reenlightenment_available()) { - pr_warn("Hyper-V: reenlightenment support is unavailable\n"); + pr_warn("reenlightenment support is unavailable\n"); return; } @@ -394,6 +396,7 @@ static void __init hv_get_partition_id(void) local_irq_restore(flags); } +#if IS_ENABLED(CONFIG_HYPERV_VTL_MODE) static u8 __init get_vtl(void) { u64 control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_REGISTERS; @@ -416,13 +419,16 @@ static u8 __init get_vtl(void) if (hv_result_success(ret)) { ret = output->as64.low & HV_X64_VTL_MASK; } else { - pr_err("Failed to get VTL(%lld) and set VTL to zero by default.\n", ret); - ret = 0; + pr_err("Failed to get VTL(error: %lld) exiting...\n", ret); + BUG(); } local_irq_restore(flags); return ret; } +#else +static inline u8 get_vtl(void) { return 0; } +#endif /* * This function is to be invoked early in the boot sequence after the @@ -564,7 +570,7 @@ skip_hypercall_pg_init: if (cpu_feature_enabled(X86_FEATURE_IBT) && *(u32 *)hv_hypercall_pg != gen_endbr()) { setup_clear_cpu_cap(X86_FEATURE_IBT); - pr_warn("Hyper-V: Disabling IBT because of Hyper-V bug\n"); + pr_warn("Disabling IBT because of Hyper-V bug\n"); } #endif @@ -604,8 +610,10 @@ skip_hypercall_pg_init: hv_query_ext_cap(0); /* Find the VTL */ - if (!ms_hyperv.paravisor_present && hv_isolation_type_snp()) - ms_hyperv.vtl = get_vtl(); + ms_hyperv.vtl = get_vtl(); + + if (ms_hyperv.vtl > 0) /* non default VTL */ + hv_vtl_early_init(); return; diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c index 36a562218010..96e6c51515f5 100644 --- a/arch/x86/hyperv/hv_vtl.c +++ b/arch/x86/hyperv/hv_vtl.c @@ -196,7 +196,7 @@ static int hv_vtl_apicid_to_vp_id(u32 apic_id) return ret; } -static int hv_vtl_wakeup_secondary_cpu(int apicid, unsigned long start_eip) +static int hv_vtl_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip) { int vp_id; @@ -215,7 +215,7 @@ static int hv_vtl_wakeup_secondary_cpu(int apicid, unsigned long start_eip) return hv_vtl_bringup_vcpu(vp_id, start_eip); } -static int __init hv_vtl_early_init(void) +int __init hv_vtl_early_init(void) { /* * `boot_cpu_has` returns the runtime feature support, @@ -230,4 +230,3 @@ static int __init hv_vtl_early_init(void) return 0; } -early_initcall(hv_vtl_early_init); diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c index 8c6bf07f7d2b..02e55237d919 100644 --- a/arch/x86/hyperv/ivm.c +++ b/arch/x86/hyperv/ivm.c @@ -288,7 +288,7 @@ static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa) free_page((unsigned long)vmsa); } -int hv_snp_boot_ap(int cpu, unsigned long start_ip) +int hv_snp_boot_ap(u32 cpu, unsigned long start_ip) { struct sev_es_save_area *vmsa = (struct sev_es_save_area *) __get_free_page(GFP_KERNEL | __GFP_ZERO); @@ -384,7 +384,7 @@ static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {} #ifdef CONFIG_INTEL_TDX_GUEST static void hv_tdx_msr_write(u64 msr, u64 val) { - struct tdx_hypercall_args args = { + struct tdx_module_args args = { .r10 = TDX_HYPERCALL_STANDARD, .r11 = EXIT_REASON_MSR_WRITE, .r12 = msr, @@ -398,13 +398,13 @@ static void hv_tdx_msr_write(u64 msr, u64 val) static void hv_tdx_msr_read(u64 msr, u64 *val) { - struct tdx_hypercall_args args = { + struct tdx_module_args args = { .r10 = TDX_HYPERCALL_STANDARD, .r11 = EXIT_REASON_MSR_READ, .r12 = msr, }; - u64 ret = __tdx_hypercall_ret(&args); + u64 ret = __tdx_hypercall(&args); if (WARN_ONCE(ret, "Failed to emulate MSR read: %lld\n", ret)) *val = 0; @@ -414,13 +414,13 @@ static void hv_tdx_msr_read(u64 msr, u64 *val) u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2) { - struct tdx_hypercall_args args = { }; + struct tdx_module_args args = { }; args.r10 = control; args.rdx = param1; args.r8 = param2; - (void)__tdx_hypercall_ret(&args); + (void)__tdx_hypercall(&args); return args.r11; } diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 4f1ce5fc4e19..a192bdea69e2 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -10,5 +10,4 @@ generated-y += unistd_64_x32.h generated-y += xen-hypercalls.h generic-y += early_ioremap.h -generic-y += export.h generic-y += mcs_spinlock.h diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 9c4da699e11a..65f79092c9d9 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -58,7 +58,7 @@ #define ANNOTATE_IGNORE_ALTERNATIVE \ "999:\n\t" \ ".pushsection .discard.ignore_alts\n\t" \ - ".long 999b - .\n\t" \ + ".long 999b\n\t" \ ".popsection\n\t" /* @@ -352,7 +352,7 @@ static inline int alternatives_text_reserved(void *start, void *end) .macro ANNOTATE_IGNORE_ALTERNATIVE .Lannotate_\@: .pushsection .discard.ignore_alts - .long .Lannotate_\@ - . + .long .Lannotate_\@ .popsection .endm diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 5af4ec1a0f71..b0d192f613b7 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -54,7 +54,7 @@ extern int local_apic_timer_c2_ok; extern bool apic_is_disabled; extern unsigned int lapic_timer_period; -extern int cpuid_to_apicid[]; +extern u32 cpuid_to_apicid[]; extern enum apic_intr_mode_id apic_intr_mode; enum apic_intr_mode_id { @@ -292,19 +292,19 @@ struct apic { int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); bool (*apic_id_registered)(void); - bool (*check_apicid_used)(physid_mask_t *map, int apicid); + bool (*check_apicid_used)(physid_mask_t *map, u32 apicid); void (*init_apic_ldr)(void); void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap); - int (*cpu_present_to_apicid)(int mps_cpu); - int (*phys_pkg_id)(int cpuid_apic, int index_msb); + u32 (*cpu_present_to_apicid)(int mps_cpu); + u32 (*phys_pkg_id)(u32 cpuid_apic, int index_msb); - u32 (*get_apic_id)(unsigned long x); - u32 (*set_apic_id)(unsigned int id); + u32 (*get_apic_id)(u32 id); + u32 (*set_apic_id)(u32 apicid); /* wakeup_secondary_cpu */ - int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip); + int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip); /* wakeup secondary CPU using 64-bit wakeup point */ - int (*wakeup_secondary_cpu_64)(int apicid, unsigned long start_eip); + int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip); char *name; }; @@ -322,8 +322,8 @@ struct apic_override { void (*send_IPI_self)(int vector); u64 (*icr_read)(void); void (*icr_write)(u32 low, u32 high); - int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip); - int (*wakeup_secondary_cpu_64)(int apicid, unsigned long start_eip); + int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip); + int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip); }; /* @@ -493,16 +493,6 @@ static inline bool lapic_vector_set_in_irr(unsigned int vector) return !!(irr & (1U << (vector % 32))); } -static inline unsigned default_get_apic_id(unsigned long x) -{ - unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - - if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID)) - return (x >> 24) & 0xFF; - else - return (x >> 24) & 0x0F; -} - /* * Warm reset vector position: */ @@ -517,9 +507,9 @@ extern void generic_bigsmp_probe(void); extern struct apic apic_noop; -static inline unsigned int read_apic_id(void) +static inline u32 read_apic_id(void) { - unsigned int reg = apic_read(APIC_ID); + u32 reg = apic_read(APIC_ID); return apic->get_apic_id(reg); } @@ -538,13 +528,12 @@ extern int default_apic_id_valid(u32 apicid); extern u32 apic_default_calc_apicid(unsigned int cpu); extern u32 apic_flat_calc_apicid(unsigned int cpu); -extern bool default_check_apicid_used(physid_mask_t *map, int apicid); extern void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap); -extern int default_cpu_present_to_apicid(int mps_cpu); +extern u32 default_cpu_present_to_apicid(int mps_cpu); #else /* CONFIG_X86_LOCAL_APIC */ -static inline unsigned int read_apic_id(void) { return 0; } +static inline u32 read_apic_id(void) { return 0; } #endif /* !CONFIG_X86_LOCAL_APIC */ diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 2edf68475fec..990eb686ca67 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -94,18 +94,17 @@ arch___clear_bit(unsigned long nr, volatile unsigned long *addr) asm volatile(__ASM_SIZE(btr) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } -static __always_inline bool -arch_clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr) +static __always_inline bool arch_xor_unlock_is_negative_byte(unsigned long mask, + volatile unsigned long *addr) { bool negative; - asm volatile(LOCK_PREFIX "andb %2,%1" + asm volatile(LOCK_PREFIX "xorb %2,%1" CC_SET(s) : CC_OUT(s) (negative), WBYTE_ADDR(addr) - : "ir" ((char) ~(1 << nr)) : "memory"); + : "iq" ((char)mask) : "memory"); return negative; } -#define arch_clear_bit_unlock_is_negative_byte \ - arch_clear_bit_unlock_is_negative_byte +#define arch_xor_unlock_is_negative_byte arch_xor_unlock_is_negative_byte static __always_inline void arch___clear_bit_unlock(long nr, volatile unsigned long *addr) @@ -293,6 +292,9 @@ static __always_inline unsigned long variable_ffz(unsigned long word) */ static __always_inline unsigned long __fls(unsigned long word) { + if (__builtin_constant_p(word)) + return BITS_PER_LONG - 1 - __builtin_clzl(word); + asm("bsr %1,%0" : "=r" (word) : "rm" (word)); @@ -360,6 +362,9 @@ static __always_inline int fls(unsigned int x) { int r; + if (__builtin_constant_p(x)) + return x ? 32 - __builtin_clz(x) : 0; + #ifdef CONFIG_X86_64 /* * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the @@ -401,6 +406,9 @@ static __always_inline int fls(unsigned int x) static __always_inline int fls64(__u64 x) { int bitpos = -1; + + if (__builtin_constant_p(x)) + return x ? 64 - __builtin_clzll(x) : 0; /* * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the * dest reg is undefined if x==0, but their CPU architect says its diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index b3a7cfb0d99e..a38cc0afc90a 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -85,6 +85,8 @@ extern const unsigned long kernel_total_size; unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr, void (*error)(char *x)); + +extern struct boot_params *boot_params_ptr; #endif #endif /* _ASM_X86_BOOT_H */ diff --git a/arch/x86/include/asm/cacheinfo.h b/arch/x86/include/asm/cacheinfo.h index ce9685fc78d8..5aa061199866 100644 --- a/arch/x86/include/asm/cacheinfo.h +++ b/arch/x86/include/asm/cacheinfo.h @@ -7,9 +7,6 @@ extern unsigned int memory_caching_control; #define CACHE_MTRR 0x01 #define CACHE_PAT 0x02 -void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu); -void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu); - void cache_disable(void); void cache_enable(void); void set_cache_aps_delayed_init(bool val); diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index d53636506134..5612648b0202 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -221,12 +221,18 @@ extern void __add_wrong_size(void) #define __try_cmpxchg(ptr, pold, new, size) \ __raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX) +#define __sync_try_cmpxchg(ptr, pold, new, size) \ + __raw_try_cmpxchg((ptr), (pold), (new), (size), "lock; ") + #define __try_cmpxchg_local(ptr, pold, new, size) \ __raw_try_cmpxchg((ptr), (pold), (new), (size), "") #define arch_try_cmpxchg(ptr, pold, new) \ __try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr))) +#define arch_sync_try_cmpxchg(ptr, pold, new) \ + __sync_try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr))) + #define arch_try_cmpxchg_local(ptr, pold, new) \ __try_cmpxchg_local((ptr), (pold), (new), sizeof(*(ptr))) diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 3a233ebff712..25050d953eee 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -28,8 +28,6 @@ struct x86_cpu { }; #ifdef CONFIG_HOTPLUG_CPU -extern int arch_register_cpu(int num); -extern void arch_unregister_cpu(int); extern void soft_restart_cpu(void); #endif diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 58cb9495e40f..4af140cf5719 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -443,6 +443,7 @@ /* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */ #define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* "" No Nested Data Breakpoints */ +#define X86_FEATURE_WRMSR_XX_BASE_NS (20*32+ 1) /* "" WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing */ #define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* "" LFENCE always serializing / synchronizes RDTSC */ #define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* "" Null Selector Clears Base */ #define X86_FEATURE_AUTOIBRS (20*32+ 8) /* "" Automatic IBRS */ diff --git a/arch/x86/include/asm/crash_core.h b/arch/x86/include/asm/crash_core.h new file mode 100644 index 000000000000..76af98f4e801 --- /dev/null +++ b/arch/x86/include/asm/crash_core.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _X86_CRASH_CORE_H +#define _X86_CRASH_CORE_H + +/* 16M alignment for crash kernel regions */ +#define CRASH_ALIGN SZ_16M + +/* + * Keep the crash kernel below this limit. + * + * Earlier 32-bits kernels would limit the kernel to the low 512 MB range + * due to mapping restrictions. + * + * 64-bit kdump kernels need to be restricted to be under 64 TB, which is + * the upper limit of system RAM in 4-level paging mode. Since the kdump + * jump could be from 5-level paging to 4-level paging, the jump will fail if + * the kernel is put above 64 TB, and during the 1st kernel bootup there's + * no good way to detect the paging mode of the target kernel which will be + * loaded for dumping. + */ +extern unsigned long swiotlb_size_or_default(void); + +#ifdef CONFIG_X86_32 +# define CRASH_ADDR_LOW_MAX SZ_512M +# define CRASH_ADDR_HIGH_MAX SZ_512M +#else +# define CRASH_ADDR_LOW_MAX SZ_4G +# define CRASH_ADDR_HIGH_MAX SZ_64T +#endif + +# define DEFAULT_CRASH_KERNEL_LOW_SIZE crash_low_size_default() + +static inline unsigned long crash_low_size_default(void) +{ +#ifdef CONFIG_X86_64 + return max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20); +#else + return 0; +#endif +} + +#endif /* _X86_CRASH_CORE_H */ diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 18fd06f7936a..a0234dfd1031 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -7,6 +7,7 @@ */ #include <linux/thread_info.h> +#include <asm/ia32.h> #include <asm/ptrace.h> #include <asm/user.h> #include <asm/auxvec.h> @@ -149,7 +150,7 @@ do { \ ((x)->e_machine == EM_X86_64) #define compat_elf_check_arch(x) \ - (elf_check_arch_ia32(x) || \ + ((elf_check_arch_ia32(x) && ia32_enabled()) || \ (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64)) static inline void elf_common_init(struct thread_struct *t, diff --git a/arch/x86/include/asm/fb.h b/arch/x86/include/asm/fb.h index 23873da8fb77..c3b9582de7ef 100644 --- a/arch/x86/include/asm/fb.h +++ b/arch/x86/include/asm/fb.h @@ -2,12 +2,14 @@ #ifndef _ASM_X86_FB_H #define _ASM_X86_FB_H +#include <asm/page.h> + struct fb_info; -struct file; -struct vm_area_struct; -void fb_pgprotect(struct file *file, struct vm_area_struct *vma, unsigned long off); -#define fb_pgprotect fb_pgprotect +pgprot_t pgprot_framebuffer(pgprot_t prot, + unsigned long vm_start, unsigned long vm_end, + unsigned long offset); +#define pgprot_framebuffer pgprot_framebuffer int fb_is_primary_device(struct fb_info *info); #define fb_is_primary_device fb_is_primary_device diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 31089b851c4f..a2be3aefff9f 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -157,7 +157,8 @@ static inline void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) { static inline void fpu_sync_guest_vmexit_xfd_state(void) { } #endif -extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u32 pkru); +extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, + unsigned int size, u64 xfeatures, u32 pkru); extern int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru); static inline void fpstate_set_confidential(struct fpu_guest *gfpu) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 551829884734..b02c3cd3c0f6 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -28,7 +28,7 @@ #include <asm/irq.h> #include <asm/sections.h> -#ifdef CONFIG_X86_LOCAL_APIC +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY struct irq_data; struct pci_dev; struct msi_desc; @@ -105,10 +105,10 @@ static inline void irq_complete_move(struct irq_cfg *c) { } #endif extern void apic_ack_edge(struct irq_data *data); -#else /* CONFIG_X86_LOCAL_APIC */ +#else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ static inline void lock_vector_lock(void) {} static inline void unlock_vector_lock(void) {} -#endif /* CONFIG_X86_LOCAL_APIC */ +#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ /* Statistics */ extern atomic_t irq_err_count; diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index 637fa1df3512..c715097e92fd 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -69,6 +69,8 @@ struct legacy_pic { void (*make_irq)(unsigned int irq); }; +void legacy_pic_pcat_compat(void); + extern struct legacy_pic *legacy_pic; extern struct legacy_pic null_legacy_pic; diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index fada857f0a1e..5a2ae24b1204 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -68,6 +68,20 @@ extern void ia32_pick_mmap_layout(struct mm_struct *mm); #endif -#endif /* CONFIG_IA32_EMULATION */ +extern bool __ia32_enabled; + +static inline bool ia32_enabled(void) +{ + return __ia32_enabled; +} + +#else /* !CONFIG_IA32_EMULATION */ + +static inline bool ia32_enabled(void) +{ + return IS_ENABLED(CONFIG_X86_32); +} + +#endif #endif /* _ASM_X86_IA32_H */ diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 5f1d3c421f68..cc9ccf61b6bd 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_INIT_H #define _ASM_X86_INIT_H +#define __head __section(".head.text") + struct x86_mapping_info { void *(*alloc_pgt_page)(void *); /* allocate buf for page table */ void *context; /* context for alloc_pgt_page */ diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 5fcd85fd64fd..197316121f04 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -27,6 +27,7 @@ * _X - regular server parts * _D - micro server parts * _N,_P - other mobile parts + * _H - premium mobile parts * _S - other client parts * * Historical OPTDIFFs: @@ -124,6 +125,7 @@ #define INTEL_FAM6_METEORLAKE 0xAC #define INTEL_FAM6_METEORLAKE_L 0xAA +#define INTEL_FAM6_ARROWLAKE_H 0xC5 #define INTEL_FAM6_ARROWLAKE 0xC6 #define INTEL_FAM6_LUNARLAKE_M 0xBD diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index e3054e3e46d5..26b628d84594 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -108,6 +108,7 @@ KVM_X86_OP_OPTIONAL(vcpu_blocking) KVM_X86_OP_OPTIONAL(vcpu_unblocking) KVM_X86_OP_OPTIONAL(pi_update_irte) KVM_X86_OP_OPTIONAL(pi_start_assignment) +KVM_X86_OP_OPTIONAL(apicv_pre_state_restore) KVM_X86_OP_OPTIONAL(apicv_post_state_restore) KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt) KVM_X86_OP_OPTIONAL(set_hv_timer) @@ -126,7 +127,7 @@ KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from) KVM_X86_OP_OPTIONAL(vm_move_enc_context_from) KVM_X86_OP_OPTIONAL(guest_memory_reclaimed) KVM_X86_OP(get_msr_feature) -KVM_X86_OP(can_emulate_instruction) +KVM_X86_OP(check_emulate_instruction) KVM_X86_OP(apic_init_signal_blocked) KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush) KVM_X86_OP_OPTIONAL(migrate_timers) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 17715cb8731d..d7036982332e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -39,7 +39,15 @@ #define __KVM_HAVE_ARCH_VCPU_DEBUGFS +/* + * CONFIG_KVM_MAX_NR_VCPUS is defined iff CONFIG_KVM!=n, provide a dummy max if + * KVM is disabled (arbitrarily use the default from CONFIG_KVM_MAX_NR_VCPUS). + */ +#ifdef CONFIG_KVM_MAX_NR_VCPUS +#define KVM_MAX_VCPUS CONFIG_KVM_MAX_NR_VCPUS +#else #define KVM_MAX_VCPUS 1024 +#endif /* * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs @@ -528,7 +536,6 @@ struct kvm_pmu { u64 raw_event_mask; struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC]; struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED]; - struct irq_work irq_work; /* * Overlay the bitmap with a 64-bit atomic so that all bits can be @@ -680,6 +687,7 @@ struct kvm_hypervisor_cpuid { u32 limit; }; +#ifdef CONFIG_KVM_XEN /* Xen HVM per vcpu emulation context */ struct kvm_vcpu_xen { u64 hypercall_rip; @@ -702,6 +710,7 @@ struct kvm_vcpu_xen { struct timer_list poll_timer; struct kvm_hypervisor_cpuid cpuid; }; +#endif struct kvm_queued_exception { bool pending; @@ -930,8 +939,9 @@ struct kvm_vcpu_arch { bool hyperv_enabled; struct kvm_vcpu_hv *hyperv; +#ifdef CONFIG_KVM_XEN struct kvm_vcpu_xen xen; - +#endif cpumask_var_t wbinvd_dirty_mask; unsigned long last_retry_eip; @@ -1276,7 +1286,6 @@ struct kvm_arch { */ spinlock_t mmu_unsync_pages_lock; - struct list_head assigned_dev_head; struct iommu_domain *iommu_domain; bool iommu_noncoherent; #define __KVM_HAVE_ARCH_NONCOHERENT_DMA @@ -1324,6 +1333,7 @@ struct kvm_arch { int nr_vcpus_matched_tsc; u32 default_tsc_khz; + bool user_set_tsc; seqcount_raw_spinlock_t pvclock_sc; bool use_master_clock; @@ -1692,7 +1702,7 @@ struct kvm_x86_ops { void (*request_immediate_exit)(struct kvm_vcpu *vcpu); - void (*sched_in)(struct kvm_vcpu *kvm, int cpu); + void (*sched_in)(struct kvm_vcpu *vcpu, int cpu); /* * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A zero @@ -1709,6 +1719,7 @@ struct kvm_x86_ops { int (*pi_update_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void (*pi_start_assignment)(struct kvm *kvm); + void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu); @@ -1734,8 +1745,8 @@ struct kvm_x86_ops { int (*get_msr_feature)(struct kvm_msr_entry *entry); - bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type, - void *insn, int insn_len); + int (*check_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len); bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); int (*enable_l2_tlb_flush)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index 635132a12778..73dba8b94443 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -135,28 +135,27 @@ static inline bool local_try_cmpxchg(local_t *l, long *old, long new) #define local_xchg(l, n) (xchg(&((l)->a.counter), (n))) /** - * local_add_unless - add unless the number is a given value + * local_add_unless - add unless the number is already a given value * @l: pointer of type local_t * @a: the amount to add to l... * @u: ...unless l is equal to u. * - * Atomically adds @a to @l, so long as it was not @u. - * Returns non-zero if @l was not @u, and zero otherwise. + * Atomically adds @a to @l, if @v was not already @u. + * Returns true if the addition was done. */ -#define local_add_unless(l, a, u) \ -({ \ - long c, old; \ - c = local_read((l)); \ - for (;;) { \ - if (unlikely(c == (u))) \ - break; \ - old = local_cmpxchg((l), c, c + (a)); \ - if (likely(old == c)) \ - break; \ - c = old; \ - } \ - c != (u); \ -}) +static __always_inline bool +local_add_unless(local_t *l, long a, long u) +{ + long c = local_read(l); + + do { + if (unlikely(c == u)) + return false; + } while (!local_try_cmpxchg(l, &c, c + a)); + + return true; +} + #define local_inc_not_zero(l) local_add_unless((l), 1, 0) /* On x86_32, these are no better than the atomic variants. diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 180b1cbfcc4e..6de6e1d95952 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -245,7 +245,7 @@ static inline void cmci_recheck(void) {} int mce_available(struct cpuinfo_x86 *c); bool mce_is_memory_error(struct mce *m); bool mce_is_correctable(struct mce *m); -int mce_usable_address(struct mce *m); +bool mce_usable_address(struct mce *m); DECLARE_PER_CPU(unsigned, mce_exception_count); DECLARE_PER_CPU(unsigned, mce_poll_count); diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 473b16d73b47..359ada486fa9 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -19,8 +19,10 @@ #ifdef CONFIG_X86_MEM_ENCRYPT void __init mem_encrypt_init(void); +void __init mem_encrypt_setup_arch(void); #else static inline void mem_encrypt_init(void) { } +static inline void __init mem_encrypt_setup_arch(void) { } #endif #ifdef CONFIG_AMD_MEM_ENCRYPT @@ -43,7 +45,6 @@ void __init sme_map_bootdata(char *real_mode_data); void __init sme_unmap_bootdata(char *real_mode_data); void __init sme_early_init(void); -void __init sev_setup_arch(void); void __init sme_encrypt_kernel(struct boot_params *bp); void __init sme_enable(struct boot_params *bp); @@ -73,7 +74,6 @@ static inline void __init sme_map_bootdata(char *real_mode_data) { } static inline void __init sme_unmap_bootdata(char *real_mode_data) { } static inline void __init sme_early_init(void) { } -static inline void __init sev_setup_arch(void) { } static inline void __init sme_encrypt_kernel(struct boot_params *bp) { } static inline void __init sme_enable(struct boot_params *bp) { } diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index f46df8349e86..4b0f98a8d338 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -37,7 +37,7 @@ extern int mp_bus_id_to_type[MAX_MP_BUSSES]; extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); -extern unsigned int boot_cpu_physical_apicid; +extern u32 boot_cpu_physical_apicid; extern u8 boot_cpu_apic_version; #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 033b53f993c6..ce4ce8720d55 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -276,11 +276,11 @@ int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry); #ifdef CONFIG_AMD_MEM_ENCRYPT bool hv_ghcb_negotiate_protocol(void); void __noreturn hv_ghcb_terminate(unsigned int set, unsigned int reason); -int hv_snp_boot_ap(int cpu, unsigned long start_ip); +int hv_snp_boot_ap(u32 cpu, unsigned long start_ip); #else static inline bool hv_ghcb_negotiate_protocol(void) { return false; } static inline void hv_ghcb_terminate(unsigned int set, unsigned int reason) {} -static inline int hv_snp_boot_ap(int cpu, unsigned long start_ip) { return 0; } +static inline int hv_snp_boot_ap(u32 cpu, unsigned long start_ip) { return 0; } #endif #if defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST) @@ -340,8 +340,10 @@ static inline u64 hv_get_non_nested_register(unsigned int reg) { return 0; } #ifdef CONFIG_HYPERV_VTL_MODE void __init hv_vtl_init_platform(void); +int __init hv_vtl_early_init(void); #else static inline void __init hv_vtl_init_platform(void) {} +static inline int __init hv_vtl_early_init(void) { return 0; } #endif #include <asm-generic/mshyperv.h> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1d111350197f..1d51e1850ed0 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -222,6 +222,7 @@ #define MSR_INTEGRITY_CAPS_ARRAY_BIST BIT(MSR_INTEGRITY_CAPS_ARRAY_BIST_BIT) #define MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT 4 #define MSR_INTEGRITY_CAPS_PERIODIC_BIST BIT(MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT) +#define MSR_INTEGRITY_CAPS_SAF_GEN_MASK GENMASK_ULL(10, 9) #define MSR_LBR_NHM_FROM 0x00000680 #define MSR_LBR_NHM_TO 0x000006c0 @@ -553,6 +554,7 @@ #define MSR_AMD64_CPUID_FN_1 0xc0011004 #define MSR_AMD64_LS_CFG 0xc0011020 #define MSR_AMD64_DC_CFG 0xc0011022 +#define MSR_AMD64_TW_CFG 0xc0011023 #define MSR_AMD64_DE_CFG 0xc0011029 #define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT 1 @@ -637,12 +639,21 @@ /* AMD Last Branch Record MSRs */ #define MSR_AMD64_LBR_SELECT 0xc000010e +/* Zen4 */ +#define MSR_ZEN4_BP_CFG 0xc001102e +#define MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT 5 + +/* Fam 19h MSRs */ +#define MSR_F19H_UMC_PERF_CTL 0xc0010800 +#define MSR_F19H_UMC_PERF_CTR 0xc0010801 + +/* Zen 2 */ +#define MSR_ZEN2_SPECTRAL_CHICKEN 0xc00110e3 +#define MSR_ZEN2_SPECTRAL_CHICKEN_BIT BIT_ULL(1) + /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 -#define MSR_ZEN2_SPECTRAL_CHICKEN 0xc00110e3 -#define MSR_ZEN2_SPECTRAL_CHICKEN_BIT BIT_ULL(1) - /* Fam 16h MSRs */ #define MSR_F16H_L2I_PERF_CTL 0xc0010230 #define MSR_F16H_L2I_PERF_CTR 0xc0010231 @@ -1112,12 +1123,16 @@ #define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14) #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) #define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F -/* AMD-V MSRs */ +/* AMD-V MSRs */ #define MSR_VM_CR 0xc0010114 #define MSR_VM_IGNNE 0xc0010115 #define MSR_VM_HSAVE_PA 0xc0010117 +#define SVM_VM_CR_VALID_MASK 0x001fULL +#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL +#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL + /* Hardware Feedback Interface */ #define MSR_IA32_HW_FEEDBACK_PTR 0x17d0 #define MSR_IA32_HW_FEEDBACK_CONFIG 0x17d1 diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index c55cc243592e..f93e9b96927a 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -196,7 +196,7 @@ .macro ANNOTATE_RETPOLINE_SAFE .Lhere_\@: .pushsection .discard.retpoline_safe - .long .Lhere_\@ - . + .long .Lhere_\@ .popsection .endm @@ -271,7 +271,7 @@ .Lskip_rsb_\@: .endm -#ifdef CONFIG_CPU_UNRET_ENTRY +#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO) #define CALL_UNTRAIN_RET "call entry_untrain_ret" #else #define CALL_UNTRAIN_RET "" @@ -288,38 +288,24 @@ * As such, this must be placed after every *SWITCH_TO_KERNEL_CR3 at a point * where we have a stack but before any RET instruction. */ -.macro UNTRAIN_RET -#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ - defined(CONFIG_CALL_DEPTH_TRACKING) || defined(CONFIG_CPU_SRSO) +.macro __UNTRAIN_RET ibpb_feature, call_depth_insns +#if defined(CONFIG_RETHUNK) || defined(CONFIG_CPU_IBPB_ENTRY) VALIDATE_UNRET_END ALTERNATIVE_3 "", \ CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \ - "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \ - __stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH + "call entry_ibpb", \ibpb_feature, \ + __stringify(\call_depth_insns), X86_FEATURE_CALL_DEPTH #endif .endm -.macro UNTRAIN_RET_VM -#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ - defined(CONFIG_CALL_DEPTH_TRACKING) || defined(CONFIG_CPU_SRSO) - VALIDATE_UNRET_END - ALTERNATIVE_3 "", \ - CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \ - "call entry_ibpb", X86_FEATURE_IBPB_ON_VMEXIT, \ - __stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH -#endif -.endm +#define UNTRAIN_RET \ + __UNTRAIN_RET X86_FEATURE_ENTRY_IBPB, __stringify(RESET_CALL_DEPTH) -.macro UNTRAIN_RET_FROM_CALL -#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ - defined(CONFIG_CALL_DEPTH_TRACKING) - VALIDATE_UNRET_END - ALTERNATIVE_3 "", \ - CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \ - "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \ - __stringify(RESET_CALL_DEPTH_FROM_CALL), X86_FEATURE_CALL_DEPTH -#endif -.endm +#define UNTRAIN_RET_VM \ + __UNTRAIN_RET X86_FEATURE_IBPB_ON_VMEXIT, __stringify(RESET_CALL_DEPTH) + +#define UNTRAIN_RET_FROM_CALL \ + __UNTRAIN_RET X86_FEATURE_ENTRY_IBPB, __stringify(RESET_CALL_DEPTH_FROM_CALL) .macro CALL_DEPTH_ACCOUNT @@ -334,7 +320,7 @@ #define ANNOTATE_RETPOLINE_SAFE \ "999:\n\t" \ ".pushsection .discard.retpoline_safe\n\t" \ - ".long 999b - .\n\t" \ + ".long 999b\n\t" \ ".popsection\n\t" typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE]; @@ -348,13 +334,23 @@ extern void __x86_return_thunk(void); static inline void __x86_return_thunk(void) {} #endif +#ifdef CONFIG_CPU_UNRET_ENTRY extern void retbleed_return_thunk(void); +#else +static inline void retbleed_return_thunk(void) {} +#endif + +#ifdef CONFIG_CPU_SRSO extern void srso_return_thunk(void); extern void srso_alias_return_thunk(void); +#else +static inline void srso_return_thunk(void) {} +static inline void srso_alias_return_thunk(void) {} +#endif -extern void retbleed_untrain_ret(void); -extern void srso_untrain_ret(void); -extern void srso_alias_untrain_ret(void); +extern void retbleed_return_thunk(void); +extern void srso_return_thunk(void); +extern void srso_alias_return_thunk(void); extern void entry_untrain_ret(void); extern void entry_ibpb(void); @@ -362,12 +358,7 @@ extern void entry_ibpb(void); extern void (*x86_return_thunk)(void); #ifdef CONFIG_CALL_DEPTH_TRACKING -extern void __x86_return_skl(void); - -static inline void x86_set_skl_return_thunk(void) -{ - x86_return_thunk = &__x86_return_skl; -} +extern void call_depth_return_thunk(void); #define CALL_DEPTH_ACCOUNT \ ALTERNATIVE("", \ @@ -380,12 +371,12 @@ DECLARE_PER_CPU(u64, __x86_ret_count); DECLARE_PER_CPU(u64, __x86_stuffs_count); DECLARE_PER_CPU(u64, __x86_ctxsw_count); #endif -#else -static inline void x86_set_skl_return_thunk(void) {} +#else /* !CONFIG_CALL_DEPTH_TRACKING */ +static inline void call_depth_return_thunk(void) {} #define CALL_DEPTH_ACCOUNT "" -#endif +#endif /* CONFIG_CALL_DEPTH_TRACKING */ #ifdef CONFIG_RETPOLINE diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index e3bae2b60a0d..ef2844d69173 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -12,13 +12,6 @@ #define NR_NODE_MEMBLKS (MAX_NUMNODES*2) -/* - * Too small node sizes may confuse the VM badly. Usually they - * result from BIOS bugs. So dont recognize nodes as standalone - * NUMA entities that have less than this amount of RAM listed: - */ -#define NODE_MIN_SIZE (4*1024*1024) - extern int numa_off; /* diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 34734d730463..20624b80f890 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -210,6 +210,25 @@ do { \ (typeof(_var))(unsigned long) pco_old__; \ }) +#define percpu_try_cmpxchg_op(size, qual, _var, _ovalp, _nval) \ +({ \ + bool success; \ + __pcpu_type_##size *pco_oval__ = (__pcpu_type_##size *)(_ovalp); \ + __pcpu_type_##size pco_old__ = *pco_oval__; \ + __pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval); \ + asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]", \ + __percpu_arg([var])) \ + CC_SET(z) \ + : CC_OUT(z) (success), \ + [oval] "+a" (pco_old__), \ + [var] "+m" (_var) \ + : [nval] __pcpu_reg_##size(, pco_new__) \ + : "memory"); \ + if (unlikely(!success)) \ + *pco_oval__ = pco_old__; \ + likely(success); \ +}) + #if defined(CONFIG_X86_32) && !defined(CONFIG_UML) #define percpu_cmpxchg64_op(size, qual, _var, _oval, _nval) \ ({ \ @@ -223,26 +242,63 @@ do { \ old__.var = _oval; \ new__.var = _nval; \ \ - asm qual (ALTERNATIVE("leal %P[var], %%esi; call this_cpu_cmpxchg8b_emu", \ + asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ : [var] "+m" (_var), \ "+a" (old__.low), \ "+d" (old__.high) \ : "b" (new__.low), \ - "c" (new__.high) \ - : "memory", "esi"); \ + "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ \ old__.var; \ }) #define raw_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg64_op(8, , pcp, oval, nval) #define this_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg64_op(8, volatile, pcp, oval, nval) + +#define percpu_try_cmpxchg64_op(size, qual, _var, _ovalp, _nval) \ +({ \ + bool success; \ + u64 *_oval = (u64 *)(_ovalp); \ + union { \ + u64 var; \ + struct { \ + u32 low, high; \ + }; \ + } old__, new__; \ + \ + old__.var = *_oval; \ + new__.var = _nval; \ + \ + asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ + "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ + CC_SET(z) \ + : CC_OUT(z) (success), \ + [var] "+m" (_var), \ + "+a" (old__.low), \ + "+d" (old__.high) \ + : "b" (new__.low), \ + "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ + if (unlikely(!success)) \ + *_oval = old__.var; \ + likely(success); \ +}) + +#define raw_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg64_op(8, , pcp, ovalp, nval) +#define this_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg64_op(8, volatile, pcp, ovalp, nval) #endif #ifdef CONFIG_X86_64 #define raw_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval); #define this_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval); +#define raw_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, , pcp, ovalp, nval); +#define this_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval); + #define percpu_cmpxchg128_op(size, qual, _var, _oval, _nval) \ ({ \ union { \ @@ -255,20 +311,54 @@ do { \ old__.var = _oval; \ new__.var = _nval; \ \ - asm qual (ALTERNATIVE("leaq %P[var], %%rsi; call this_cpu_cmpxchg16b_emu", \ + asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ : [var] "+m" (_var), \ "+a" (old__.low), \ "+d" (old__.high) \ : "b" (new__.low), \ - "c" (new__.high) \ - : "memory", "rsi"); \ + "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ \ old__.var; \ }) #define raw_cpu_cmpxchg128(pcp, oval, nval) percpu_cmpxchg128_op(16, , pcp, oval, nval) #define this_cpu_cmpxchg128(pcp, oval, nval) percpu_cmpxchg128_op(16, volatile, pcp, oval, nval) + +#define percpu_try_cmpxchg128_op(size, qual, _var, _ovalp, _nval) \ +({ \ + bool success; \ + u128 *_oval = (u128 *)(_ovalp); \ + union { \ + u128 var; \ + struct { \ + u64 low, high; \ + }; \ + } old__, new__; \ + \ + old__.var = *_oval; \ + new__.var = _nval; \ + \ + asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ + "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ + CC_SET(z) \ + : CC_OUT(z) (success), \ + [var] "+m" (_var), \ + "+a" (old__.low), \ + "+d" (old__.high) \ + : "b" (new__.low), \ + "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ + if (unlikely(!success)) \ + *_oval = old__.var; \ + likely(success); \ +}) + +#define raw_cpu_try_cmpxchg128(pcp, ovalp, nval) percpu_try_cmpxchg128_op(16, , pcp, ovalp, nval) +#define this_cpu_try_cmpxchg128(pcp, ovalp, nval) percpu_try_cmpxchg128_op(16, volatile, pcp, ovalp, nval) #endif /* @@ -343,6 +433,9 @@ do { \ #define raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, , pcp, oval, nval) #define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, , pcp, oval, nval) #define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, , pcp, oval, nval) +#define raw_cpu_try_cmpxchg_1(pcp, ovalp, nval) percpu_try_cmpxchg_op(1, , pcp, ovalp, nval) +#define raw_cpu_try_cmpxchg_2(pcp, ovalp, nval) percpu_try_cmpxchg_op(2, , pcp, ovalp, nval) +#define raw_cpu_try_cmpxchg_4(pcp, ovalp, nval) percpu_try_cmpxchg_op(4, , pcp, ovalp, nval) #define this_cpu_add_return_1(pcp, val) percpu_add_return_op(1, volatile, pcp, val) #define this_cpu_add_return_2(pcp, val) percpu_add_return_op(2, volatile, pcp, val) @@ -350,6 +443,9 @@ do { \ #define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, volatile, pcp, oval, nval) #define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, volatile, pcp, oval, nval) #define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, volatile, pcp, oval, nval) +#define this_cpu_try_cmpxchg_1(pcp, ovalp, nval) percpu_try_cmpxchg_op(1, volatile, pcp, ovalp, nval) +#define this_cpu_try_cmpxchg_2(pcp, ovalp, nval) percpu_try_cmpxchg_op(2, volatile, pcp, ovalp, nval) +#define this_cpu_try_cmpxchg_4(pcp, ovalp, nval) percpu_try_cmpxchg_op(4, volatile, pcp, ovalp, nval) /* * Per cpu atomic 64 bit operations are only available under 64 bit. @@ -364,6 +460,7 @@ do { \ #define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(8, , pcp, val) #define raw_cpu_xchg_8(pcp, nval) raw_percpu_xchg_op(pcp, nval) #define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval) +#define raw_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, , pcp, ovalp, nval) #define this_cpu_read_8(pcp) percpu_from_op(8, volatile, "mov", pcp) #define this_cpu_write_8(pcp, val) percpu_to_op(8, volatile, "mov", (pcp), val) @@ -373,6 +470,7 @@ do { \ #define this_cpu_add_return_8(pcp, val) percpu_add_return_op(8, volatile, pcp, val) #define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(8, volatile, pcp, nval) #define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval) +#define this_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval) #endif static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr, diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 85a9fd5a3ec3..2618ec7c3d1d 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -112,6 +112,13 @@ (AMD64_PERFMON_V2_EVENTSEL_EVENT_NB | \ AMD64_PERFMON_V2_EVENTSEL_UMASK_NB) +#define AMD64_PERFMON_V2_ENABLE_UMC BIT_ULL(31) +#define AMD64_PERFMON_V2_EVENTSEL_EVENT_UMC GENMASK_ULL(7, 0) +#define AMD64_PERFMON_V2_EVENTSEL_RDWRMASK_UMC GENMASK_ULL(9, 8) +#define AMD64_PERFMON_V2_RAW_EVENT_MASK_UMC \ + (AMD64_PERFMON_V2_EVENTSEL_EVENT_UMC | \ + AMD64_PERFMON_V2_EVENTSEL_RDWRMASK_UMC) + #define AMD64_NUM_COUNTERS 4 #define AMD64_NUM_COUNTERS_CORE 6 #define AMD64_NUM_COUNTERS_NB 4 @@ -232,6 +239,8 @@ union cpuid_0x80000022_ebx { unsigned int lbr_v2_stack_sz:6; /* Number of Data Fabric Counters */ unsigned int num_df_pmc:6; + /* Number of Unified Memory Controller Counters */ + unsigned int num_umc_pmc:6; } split; unsigned int full; }; diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index e02b179ec659..57bab91bbf50 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1716,6 +1716,14 @@ static inline bool pud_user_accessible_page(pud_t pud) } #endif +#ifdef CONFIG_X86_SGX +int arch_memory_failure(unsigned long pfn, int flags); +#define arch_memory_failure arch_memory_failure + +bool arch_is_platform_page(u64 paddr); +#define arch_is_platform_page arch_is_platform_page +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_H */ diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 2d13f25b1bd8..4527e1430c6d 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -31,11 +31,11 @@ static __always_inline void preempt_count_set(int pc) { int old, new; + old = raw_cpu_read_4(pcpu_hot.preempt_count); do { - old = raw_cpu_read_4(pcpu_hot.preempt_count); new = (old & PREEMPT_NEED_RESCHED) | (pc & ~PREEMPT_NEED_RESCHED); - } while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old); + } while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new)); } /* diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index a3669a7774ed..ae81a7191c1c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -75,11 +75,36 @@ extern u16 __read_mostly tlb_lld_4m[NR_INFO]; extern u16 __read_mostly tlb_lld_1g[NR_INFO]; /* - * CPU type and hardware bug flags. Kept separately for each CPU. - * Members of this structure are referenced in head_32.S, so think twice - * before touching them. [mj] + * CPU type and hardware bug flags. Kept separately for each CPU. */ +struct cpuinfo_topology { + // Real APIC ID read from the local APIC + u32 apicid; + // The initial APIC ID provided by CPUID + u32 initial_apicid; + + // Physical package ID + u32 pkg_id; + + // Physical die ID on AMD, Relative on Intel + u32 die_id; + + // Compute unit ID - AMD specific + u32 cu_id; + + // Core ID relative to the package + u32 core_id; + + // Logical ID mappings + u32 logical_pkg_id; + u32 logical_die_id; + + // Cache level topology IDs + u32 llc_id; + u32 l2c_id; +}; + struct cpuinfo_x86 { __u8 x86; /* CPU family */ __u8 x86_vendor; /* CPU vendor */ @@ -96,7 +121,6 @@ struct cpuinfo_x86 { __u8 x86_phys_bits; /* CPUID returned core id bits: */ __u8 x86_coreid_bits; - __u8 cu_id; /* Max extended CPUID function supported: */ __u32 extended_cpuid_level; /* Maximum supported CPUID level, -1=no CPUID: */ @@ -112,6 +136,7 @@ struct cpuinfo_x86 { }; char x86_vendor_id[16]; char x86_model_id[64]; + struct cpuinfo_topology topo; /* in KB - valid for CPUS which support this call: */ unsigned int x86_cache_size; int x86_cache_alignment; /* In bytes */ @@ -125,19 +150,9 @@ struct cpuinfo_x86 { u64 ppin; /* cpuid returned max cores value: */ u16 x86_max_cores; - u16 apicid; - u16 initial_apicid; u16 x86_clflush_size; /* number of cores as seen by the OS: */ u16 booted_cores; - /* Physical processor id: */ - u16 phys_proc_id; - /* Logical processor id: */ - u16 logical_proc_id; - /* Core id: */ - u16 cpu_core_id; - u16 cpu_die_id; - u16 logical_die_id; /* Index into per_cpu list: */ u16 cpu_index; /* Is SMT active on this core? */ @@ -399,7 +414,7 @@ static inline unsigned long cpu_kernelmode_gs_base(int cpu) return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu); } -extern asmlinkage void ignore_sysret(void); +extern asmlinkage void entry_SYSCALL32_ignore(void); /* Save actual FS/GS selectors and bases to current->thread */ void current_save_fsgs(void); @@ -678,7 +693,15 @@ extern int set_tsc_mode(unsigned int val); DECLARE_PER_CPU(u64, msr_misc_features_shadow); -extern u16 get_llc_id(unsigned int cpu); +static inline u32 per_cpu_llc_id(unsigned int cpu) +{ + return per_cpu(cpu_info.topo.llc_id, cpu); +} + +static inline u32 per_cpu_l2c_id(unsigned int cpu) +{ + return per_cpu(cpu_info.topo.l2c_id, cpu); +} #ifdef CONFIG_CPU_SUP_AMD extern u32 amd_get_nodes_per_socket(void); @@ -724,14 +747,6 @@ enum mds_mitigations { MDS_MITIGATION_VMWERV, }; -#ifdef CONFIG_X86_SGX -int arch_memory_failure(unsigned long pfn, int flags); -#define arch_memory_failure arch_memory_failure - -bool arch_is_platform_page(u64 paddr); -#define arch_is_platform_page arch_is_platform_page -#endif - extern bool gds_ucode_mitigated(void); #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index b716d291d0d4..65dee2420624 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -31,6 +31,11 @@ static inline void x86_dtb_init(void) { } #define of_ioapic 0 #endif +#ifdef CONFIG_OF_EARLY_FLATTREE +void x86_flattree_get_config(void); +#else +static inline void x86_flattree_get_config(void) { } +#endif extern char cmd_line[COMMAND_LINE_SIZE]; #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 12ef86b19910..4d84122bd643 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -36,6 +36,9 @@ void entry_INT80_compat(void); #ifdef CONFIG_XEN_PV void xen_entry_INT80_compat(void); #endif +#else /* !CONFIG_IA32_EMULATION */ +#define entry_SYSCALL_compat NULL +#define entry_SYSENTER_compat NULL #endif void x86_configure_nx(void); diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index 7513b3bb69b7..f74695dea217 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -11,11 +11,12 @@ #define TDX_IDENT "IntelTDX " /* TDX module Call Leaf IDs */ -#define TDX_GET_INFO 1 -#define TDX_GET_VEINFO 3 -#define TDX_GET_REPORT 4 -#define TDX_ACCEPT_PAGE 6 -#define TDX_WR 8 +#define TDG_VP_VMCALL 0 +#define TDG_VP_INFO 1 +#define TDG_VP_VEINFO_GET 3 +#define TDG_MR_REPORT 4 +#define TDG_MEM_PAGE_ACCEPT 6 +#define TDG_VM_WR 8 /* TDCS fields. To be used by TDG.VM.WR and TDG.VM.RD module calls */ #define TDCS_NOTIFY_ENABLES 0x9100000000000010 @@ -24,32 +25,70 @@ #define TDVMCALL_MAP_GPA 0x10001 #define TDVMCALL_REPORT_FATAL_ERROR 0x10003 +#define TDVMCALL_STATUS_RETRY 1 + +/* + * Bitmasks of exposed registers (with VMM). + */ +#define TDX_RDX BIT(2) +#define TDX_RBX BIT(3) +#define TDX_RSI BIT(6) +#define TDX_RDI BIT(7) +#define TDX_R8 BIT(8) +#define TDX_R9 BIT(9) +#define TDX_R10 BIT(10) +#define TDX_R11 BIT(11) +#define TDX_R12 BIT(12) +#define TDX_R13 BIT(13) +#define TDX_R14 BIT(14) +#define TDX_R15 BIT(15) + +/* + * These registers are clobbered to hold arguments for each + * TDVMCALL. They are safe to expose to the VMM. + * Each bit in this mask represents a register ID. Bit field + * details can be found in TDX GHCI specification, section + * titled "TDCALL [TDG.VP.VMCALL] leaf". + */ +#define TDVMCALL_EXPOSE_REGS_MASK \ + (TDX_RDX | TDX_RBX | TDX_RSI | TDX_RDI | TDX_R8 | TDX_R9 | \ + TDX_R10 | TDX_R11 | TDX_R12 | TDX_R13 | TDX_R14 | TDX_R15) + #ifndef __ASSEMBLY__ +#include <linux/compiler_attributes.h> + /* - * Used in __tdx_hypercall() to pass down and get back registers' values of - * the TDCALL instruction when requesting services from the VMM. - * - * This is a software only structure and not part of the TDX module/VMM ABI. + * Used in __tdcall*() to gather the input/output registers' values of the + * TDCALL instruction when requesting services from the TDX module. This is a + * software only structure and not part of the TDX module/VMM ABI */ -struct tdx_hypercall_args { +struct tdx_module_args { + /* callee-clobbered */ + u64 rcx; + u64 rdx; u64 r8; u64 r9; + /* extra callee-clobbered */ u64 r10; u64 r11; + /* callee-saved + rdi/rsi */ u64 r12; u64 r13; u64 r14; u64 r15; + u64 rbx; u64 rdi; u64 rsi; - u64 rbx; - u64 rdx; }; +/* Used to communicate with the TDX module */ +u64 __tdcall(u64 fn, struct tdx_module_args *args); +u64 __tdcall_ret(u64 fn, struct tdx_module_args *args); +u64 __tdcall_saved_ret(u64 fn, struct tdx_module_args *args); + /* Used to request services from the VMM */ -u64 __tdx_hypercall(struct tdx_hypercall_args *args); -u64 __tdx_hypercall_ret(struct tdx_hypercall_args *args); +u64 __tdx_hypercall(struct tdx_module_args *args); /* * Wrapper for standard use of __tdx_hypercall with no output aside from @@ -57,7 +96,7 @@ u64 __tdx_hypercall_ret(struct tdx_hypercall_args *args); */ static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15) { - struct tdx_hypercall_args args = { + struct tdx_module_args args = { .r10 = TDX_HYPERCALL_STANDARD, .r11 = fn, .r12 = r12, @@ -71,25 +110,7 @@ static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15) /* Called from __tdx_hypercall() for unrecoverable failure */ -void __tdx_hypercall_failed(void); - -/* - * Used in __tdx_module_call() to gather the output registers' values of the - * TDCALL instruction when requesting services from the TDX module. This is a - * software only structure and not part of the TDX module/VMM ABI - */ -struct tdx_module_output { - u64 rcx; - u64 rdx; - u64 r8; - u64 r9; - u64 r10; - u64 r11; -}; - -/* Used to communicate with the TDX module */ -u64 __tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, - struct tdx_module_output *out); +void __noreturn __tdx_hypercall_failed(void); bool tdx_accept_memory(phys_addr_t start, phys_addr_t end); diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index ad98dd1d9cfb..4fab2ed454f3 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -17,10 +17,8 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map); /* cpus sharing the last level cache: */ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map); -DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id); -DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id); -DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid); +DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid); DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid); struct task_struct; @@ -129,7 +127,6 @@ void native_smp_send_reschedule(int cpu); void native_send_call_func_ipi(const struct cpumask *mask); void native_send_call_func_single_ipi(int cpu); -bool smp_park_other_cpus_in_init(void); void smp_store_cpu_info(int id); asmlinkage __visible void smp_reboot_interrupt(void); diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index 64df897c0ee3..1be13b2dfe8b 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h @@ -37,6 +37,8 @@ extern int phys_to_target_node(phys_addr_t start); #define phys_to_target_node phys_to_target_node extern int memory_add_physaddr_to_nid(u64 start); #define memory_add_physaddr_to_nid memory_add_physaddr_to_nid +extern int numa_fill_memblks(u64 start, u64 end); +#define numa_fill_memblks numa_fill_memblks #endif #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h index cb0386fc4dc3..c648502e4535 100644 --- a/arch/x86/include/asm/spec-ctrl.h +++ b/arch/x86/include/asm/spec-ctrl.h @@ -4,6 +4,7 @@ #include <linux/thread_info.h> #include <asm/nospec-branch.h> +#include <asm/msr.h> /* * On VMENTER we must preserve whatever view of the SPEC_CTRL MSR @@ -76,6 +77,16 @@ static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn) return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL; } +/* + * This can be used in noinstr functions & should only be called in bare + * metal context. + */ +static __always_inline void __update_spec_ctrl(u64 val) +{ + __this_cpu_write(x86_spec_ctrl_current, val); + native_wrmsrl(MSR_IA32_SPEC_CTRL, val); +} + #ifdef CONFIG_SMP extern void speculative_store_bypass_ht_init(void); #else diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 19bf955b67e0..87a7b917d30e 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -229,10 +229,6 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT) #define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT) -#define SVM_VM_CR_VALID_MASK 0x001fULL -#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL -#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL - #define SVM_NESTED_CTL_NP_ENABLE BIT(0) #define SVM_NESTED_CTL_SEV_ENABLE BIT(1) #define SVM_NESTED_CTL_SEV_ES_ENABLE BIT(2) @@ -268,6 +264,7 @@ enum avic_ipi_failure_cause { AVIC_IPI_FAILURE_TARGET_NOT_RUNNING, AVIC_IPI_FAILURE_INVALID_TARGET, AVIC_IPI_FAILURE_INVALID_BACKING_PAGE, + AVIC_IPI_FAILURE_INVALID_IPI_VECTOR, }; #define AVIC_PHYSICAL_MAX_INDEX_MASK GENMASK_ULL(8, 0) @@ -571,8 +568,6 @@ struct vmcb { #define SVM_CPUID_FUNC 0x8000000a -#define SVM_VM_CR_SVM_DISABLE 4 - #define SVM_SELECTOR_S_SHIFT 4 #define SVM_SELECTOR_DPL_SHIFT 5 #define SVM_SELECTOR_P_SHIFT 7 diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 4fb36fba4b5a..f44e2f9ab65d 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -126,12 +126,12 @@ static inline int syscall_get_arch(struct task_struct *task) ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; } -void do_syscall_64(struct pt_regs *regs, int nr); +bool do_syscall_64(struct pt_regs *regs, int nr); #endif /* CONFIG_X86_32 */ void do_int80_syscall_32(struct pt_regs *regs); -long do_fast_syscall_32(struct pt_regs *regs); -long do_SYSENTER_32(struct pt_regs *regs); +bool do_fast_syscall_32(struct pt_regs *regs); +bool do_SYSENTER_32(struct pt_regs *regs); #endif /* _ASM_X86_SYSCALL_H */ diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 603e6d1e9d4a..adcbe3f1de30 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -8,6 +8,7 @@ #include <asm/errno.h> #include <asm/ptrace.h> +#include <asm/trapnr.h> #include <asm/shared/tdx.h> /* @@ -20,6 +21,9 @@ #define TDX_SW_ERROR (TDX_ERROR | GENMASK_ULL(47, 40)) #define TDX_SEAMCALL_VMFAILINVALID (TDX_SW_ERROR | _UL(0xFFFF0000)) +#define TDX_SEAMCALL_GP (TDX_SW_ERROR | X86_TRAP_GP) +#define TDX_SEAMCALL_UD (TDX_SW_ERROR | X86_TRAP_UD) + #ifndef __ASSEMBLY__ /* @@ -72,5 +76,12 @@ static inline long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, return -ENODEV; } #endif /* CONFIG_INTEL_TDX_GUEST && CONFIG_KVM_GUEST */ + +#ifdef CONFIG_INTEL_TDX_HOST +u64 __seamcall(u64 fn, struct tdx_module_args *args); +u64 __seamcall_ret(u64 fn, struct tdx_module_args *args); +u64 __seamcall_saved_ret(u64 fn, struct tdx_module_args *args); +#endif /* CONFIG_INTEL_TDX_HOST */ + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_TDX_H */ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 3235ba1e5b06..5f87f6b9b09e 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -105,17 +105,17 @@ static inline void setup_node_to_cpumask_map(void) { } extern const struct cpumask *cpu_coregroup_mask(int cpu); extern const struct cpumask *cpu_clustergroup_mask(int cpu); -#define topology_logical_package_id(cpu) (cpu_data(cpu).logical_proc_id) -#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id) -#define topology_logical_die_id(cpu) (cpu_data(cpu).logical_die_id) -#define topology_die_id(cpu) (cpu_data(cpu).cpu_die_id) -#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) +#define topology_logical_package_id(cpu) (cpu_data(cpu).topo.logical_pkg_id) +#define topology_physical_package_id(cpu) (cpu_data(cpu).topo.pkg_id) +#define topology_logical_die_id(cpu) (cpu_data(cpu).topo.logical_die_id) +#define topology_die_id(cpu) (cpu_data(cpu).topo.die_id) +#define topology_core_id(cpu) (cpu_data(cpu).topo.core_id) #define topology_ppin(cpu) (cpu_data(cpu).ppin) extern unsigned int __max_die_per_package; #ifdef CONFIG_SMP -#define topology_cluster_id(cpu) (per_cpu(cpu_l2c_id, cpu)) +#define topology_cluster_id(cpu) (cpu_data(cpu).topo.l2c_id) #define topology_die_cpumask(cpu) (per_cpu(cpu_die_map, cpu)) #define topology_cluster_cpumask(cpu) (cpu_clustergroup_mask(cpu)) #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 8bae40a66282..5c367c1290c3 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -496,7 +496,7 @@ copy_mc_to_kernel(void *to, const void *from, unsigned len); #define copy_mc_to_kernel copy_mc_to_kernel unsigned long __must_check -copy_mc_to_user(void *to, const void *from, unsigned len); +copy_mc_to_user(void __user *to, const void *from, unsigned len); #endif /* diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 5240d88db52a..c878616a18b8 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -177,7 +177,7 @@ struct x86_init_ops { * struct x86_cpuinit_ops - platform specific cpu hotplug setups * @setup_percpu_clockev: set up the per cpu clock event device * @early_percpu_clock_init: early init of the per cpu clock event device - * @fixup_cpu_id: fixup function for cpuinfo_x86::phys_proc_id + * @fixup_cpu_id: fixup function for cpuinfo_x86::topo.pkg_id * @parallel_bringup: Parallel bringup control */ struct x86_cpuinit_ops { diff --git a/arch/x86/include/uapi/asm/amd_hsmp.h b/arch/x86/include/uapi/asm/amd_hsmp.h index 769b939444ae..fce22686c834 100644 --- a/arch/x86/include/uapi/asm/amd_hsmp.h +++ b/arch/x86/include/uapi/asm/amd_hsmp.h @@ -47,6 +47,9 @@ enum hsmp_message_ids { HSMP_SET_PCI_RATE, /* 20h Control link rate on PCIe devices */ HSMP_SET_POWER_MODE, /* 21h Select power efficiency profile policy */ HSMP_SET_PSTATE_MAX_MIN, /* 22h Set the max and min DF P-State */ + HSMP_GET_METRIC_TABLE_VER, /* 23h Get metrics table version */ + HSMP_GET_METRIC_TABLE, /* 24h Get metrics table */ + HSMP_GET_METRIC_TABLE_DRAM_ADDR,/* 25h Get metrics table dram address */ HSMP_MSG_ID_MAX, }; @@ -64,6 +67,14 @@ enum hsmp_msg_type { HSMP_GET = 1, }; +enum hsmp_proto_versions { + HSMP_PROTO_VER2 = 2, + HSMP_PROTO_VER3, + HSMP_PROTO_VER4, + HSMP_PROTO_VER5, + HSMP_PROTO_VER6 +}; + struct hsmp_msg_desc { int num_args; int response_sz; @@ -295,6 +306,104 @@ static const struct hsmp_msg_desc hsmp_msg_desc_table[] = { * input: args[0] = min df pstate[15:8] + max df pstate[7:0] */ {1, 0, HSMP_SET}, + + /* + * HSMP_GET_METRIC_TABLE_VER, num_args = 0, response_sz = 1 + * output: args[0] = metrics table version + */ + {0, 1, HSMP_GET}, + + /* + * HSMP_GET_METRIC_TABLE, num_args = 0, response_sz = 0 + */ + {0, 0, HSMP_GET}, + + /* + * HSMP_GET_METRIC_TABLE_DRAM_ADDR, num_args = 0, response_sz = 2 + * output: args[0] = lower 32 bits of the address + * output: args[1] = upper 32 bits of the address + */ + {0, 2, HSMP_GET}, +}; + +/* Metrics table (supported only with proto version 6) */ +struct hsmp_metric_table { + __u32 accumulation_counter; + + /* TEMPERATURE */ + __u32 max_socket_temperature; + __u32 max_vr_temperature; + __u32 max_hbm_temperature; + __u64 max_socket_temperature_acc; + __u64 max_vr_temperature_acc; + __u64 max_hbm_temperature_acc; + + /* POWER */ + __u32 socket_power_limit; + __u32 max_socket_power_limit; + __u32 socket_power; + + /* ENERGY */ + __u64 timestamp; + __u64 socket_energy_acc; + __u64 ccd_energy_acc; + __u64 xcd_energy_acc; + __u64 aid_energy_acc; + __u64 hbm_energy_acc; + + /* FREQUENCY */ + __u32 cclk_frequency_limit; + __u32 gfxclk_frequency_limit; + __u32 fclk_frequency; + __u32 uclk_frequency; + __u32 socclk_frequency[4]; + __u32 vclk_frequency[4]; + __u32 dclk_frequency[4]; + __u32 lclk_frequency[4]; + __u64 gfxclk_frequency_acc[8]; + __u64 cclk_frequency_acc[96]; + + /* FREQUENCY RANGE */ + __u32 max_cclk_frequency; + __u32 min_cclk_frequency; + __u32 max_gfxclk_frequency; + __u32 min_gfxclk_frequency; + __u32 fclk_frequency_table[4]; + __u32 uclk_frequency_table[4]; + __u32 socclk_frequency_table[4]; + __u32 vclk_frequency_table[4]; + __u32 dclk_frequency_table[4]; + __u32 lclk_frequency_table[4]; + __u32 max_lclk_dpm_range; + __u32 min_lclk_dpm_range; + + /* XGMI */ + __u32 xgmi_width; + __u32 xgmi_bitrate; + __u64 xgmi_read_bandwidth_acc[8]; + __u64 xgmi_write_bandwidth_acc[8]; + + /* ACTIVITY */ + __u32 socket_c0_residency; + __u32 socket_gfx_busy; + __u32 dram_bandwidth_utilization; + __u64 socket_c0_residency_acc; + __u64 socket_gfx_busy_acc; + __u64 dram_bandwidth_acc; + __u32 max_dram_bandwidth; + __u64 dram_bandwidth_utilization_acc; + __u64 pcie_bandwidth_acc[4]; + + /* THROTTLERS */ + __u32 prochot_residency_acc; + __u32 ppt_residency_acc; + __u32 socket_thm_residency_acc; + __u32 vr_thm_residency_acc; + __u32 hbm_thm_residency_acc; + __u32 spare; + + /* New items at the end to maintain driver compatibility */ + __u32 gfxclk_frequency[8]; }; /* Reset to default packing */ diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 2a0ea38955df..d0918a75cb00 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -148,6 +148,9 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) pr_debug("Local APIC address 0x%08x\n", madt->address); } + if (madt->flags & ACPI_MADT_PCAT_COMPAT) + legacy_pic_pcat_compat(); + /* ACPI 6.3 and newer support the online capable bit. */ if (acpi_gbl_FADT.header.revision > 6 || (acpi_gbl_FADT.header.revision == 6 && @@ -359,7 +362,7 @@ acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long e } #ifdef CONFIG_X86_64 -static int acpi_wakeup_cpu(int apicid, unsigned long start_ip) +static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip) { /* * Remap mailbox memory only for the first call to acpi_wakeup_cpu(). @@ -856,7 +859,7 @@ int acpi_unmap_cpu(int cpu) set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE); #endif - per_cpu(x86_cpu_to_apicid, cpu) = -1; + per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID; set_cpu_present(cpu, false); num_processors--; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 517ee01503be..73be3931e4f0 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -403,6 +403,17 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, u8 insn_buff[MAX_PATCH_LEN]; DPRINTK(ALT, "alt table %px, -> %px", start, end); + + /* + * In the case CONFIG_X86_5LEVEL=y, KASAN_SHADOW_START is defined using + * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here. + * During the process, KASAN becomes confused seeing partial LA57 + * conversion and triggers a false-positive out-of-bound report. + * + * Disable KASAN until the patching is complete. + */ + kasan_disable_current(); + /* * The scan order should be from start to end. A later scanned * alternative code can overwrite previously scanned alternative code. @@ -452,6 +463,8 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, text_poke_early(instr, insn_buff, insn_buff_sz); } + + kasan_enable_current(); } static inline bool is_jcc32(struct insn *insn) diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 356de955e78d..053f6dcc6b2c 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -27,6 +27,7 @@ #define PCI_DEVICE_ID_AMD_1AH_M00H_ROOT 0x153a #define PCI_DEVICE_ID_AMD_1AH_M20H_ROOT 0x1507 #define PCI_DEVICE_ID_AMD_MI200_ROOT 0x14bb +#define PCI_DEVICE_ID_AMD_MI300_ROOT 0x14f8 #define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 #define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec @@ -43,6 +44,7 @@ #define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4 0x12fc #define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4 0x12c4 #define PCI_DEVICE_ID_AMD_MI200_DF_F4 0x14d4 +#define PCI_DEVICE_ID_AMD_MI300_DF_F4 0x152c /* Protect the PCI config register pairs used for SMN. */ static DEFINE_MUTEX(smn_mutex); @@ -62,6 +64,7 @@ static const struct pci_device_id amd_root_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_ROOT) }, {} }; @@ -93,6 +96,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F3) }, {} }; @@ -112,9 +116,13 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F4) }, {} }; @@ -386,7 +394,7 @@ int amd_get_subcaches(int cpu) pci_read_config_dword(link, 0x1d4, &mask); - return (mask >> (4 * cpu_data(cpu).cpu_core_id)) & 0xf; + return (mask >> (4 * cpu_data(cpu).topo.core_id)) & 0xf; } int amd_set_subcaches(int cpu, unsigned long mask) @@ -412,7 +420,7 @@ int amd_set_subcaches(int cpu, unsigned long mask) pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000); } - cuid = cpu_data(cpu).cpu_core_id; + cuid = cpu_data(cpu).topo.core_id; mask <<= 4 * cuid; mask |= (0xf ^ (1 << cuid)) << 26; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 760adac3d1a8..41093cf20acd 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -36,6 +36,8 @@ #include <linux/smp.h> #include <linux/mm.h> +#include <xen/xen.h> + #include <asm/trace/irq_vectors.h> #include <asm/irq_remapping.h> #include <asm/pc-conf-reg.h> @@ -70,7 +72,7 @@ unsigned int num_processors; unsigned disabled_cpus; /* Processor that is doing the boot up */ -unsigned int boot_cpu_physical_apicid __ro_after_init = -1U; +u32 boot_cpu_physical_apicid __ro_after_init = BAD_APICID; EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid); u8 boot_cpu_apic_version __ro_after_init; @@ -85,7 +87,7 @@ physid_mask_t phys_cpu_present_map; * disable_cpu_apicid=<int>, mostly used for the kdump 2nd kernel to * avoid undefined behaviour caused by sending INIT from AP to BSP. */ -static unsigned int disabled_cpu_apicid __ro_after_init = BAD_APICID; +static u32 disabled_cpu_apicid __ro_after_init = BAD_APICID; /* * This variable controls which CPUs receive external NMIs. By default, @@ -109,7 +111,7 @@ static inline bool apic_accessible(void) /* * Map cpu index to physical APIC ID */ -DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID); +DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid, BAD_APICID); DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, U32_MAX); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid); @@ -1763,7 +1765,7 @@ static void __x2apic_enable(void) static int __init setup_nox2apic(char *str) { if (x2apic_enabled()) { - int apicid = native_apic_msr_read(APIC_ID); + u32 apicid = native_apic_msr_read(APIC_ID); if (apicid >= 255) { pr_warn("Apicid: %08x, cannot enforce nox2apic\n", @@ -2316,13 +2318,11 @@ static int nr_logical_cpuids = 1; /* * Used to store mapping between logical CPU IDs and APIC IDs. */ -int cpuid_to_apicid[] = { - [0 ... NR_CPUS - 1] = -1, -}; +u32 cpuid_to_apicid[] = { [0 ... NR_CPUS - 1] = BAD_APICID, }; bool arch_match_cpu_phys_id(int cpu, u64 phys_id) { - return phys_id == cpuid_to_apicid[cpu]; + return phys_id == (u64)cpuid_to_apicid[cpu]; } #ifdef CONFIG_SMP @@ -2344,6 +2344,15 @@ static int __init smp_init_primary_thread_mask(void) { unsigned int cpu; + /* + * XEN/PV provides either none or useless topology information. + * Pretend that all vCPUs are primary threads. + */ + if (xen_pv_domain()) { + cpumask_copy(&__cpu_primary_thread_mask, cpu_possible_mask); + return 0; + } + for (cpu = 0; cpu < nr_logical_cpuids; cpu++) cpu_mark_primary_thread(cpu, cpuid_to_apicid[cpu]); return 0; @@ -2382,7 +2391,7 @@ static int allocate_logical_cpuid(int apicid) return nr_logical_cpuids++; } -static void cpu_update_apic(int cpu, int apicid) +static void cpu_update_apic(int cpu, u32 apicid) { #if defined(CONFIG_SMP) || defined(CONFIG_X86_64) early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; @@ -2535,7 +2544,7 @@ static struct { */ int active; /* r/w apic fields */ - unsigned int apic_id; + u32 apic_id; unsigned int apic_taskpri; unsigned int apic_ldr; unsigned int apic_dfr; diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c index 7bc5d9bf59cd..8a00141073ea 100644 --- a/arch/x86/kernel/apic/apic_common.c +++ b/arch/x86/kernel/apic/apic_common.c @@ -18,7 +18,7 @@ u32 apic_flat_calc_apicid(unsigned int cpu) return 1U << cpu; } -bool default_check_apicid_used(physid_mask_t *map, int apicid) +bool default_check_apicid_used(physid_mask_t *map, u32 apicid) { return physid_isset(apicid, *map); } @@ -28,7 +28,7 @@ void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) *retmap = *phys_map; } -int default_cpu_present_to_apicid(int mps_cpu) +u32 default_cpu_present_to_apicid(int mps_cpu) { if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu)) return (int)per_cpu(x86_cpu_to_apicid, mps_cpu); diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 032a84e2c3cc..37daa3fd6819 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -56,17 +56,17 @@ flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) _flat_send_IPI_mask(mask, vector); } -static unsigned int flat_get_apic_id(unsigned long x) +static u32 flat_get_apic_id(u32 x) { return (x >> 24) & 0xFF; } -static u32 set_apic_id(unsigned int id) +static u32 set_apic_id(u32 id) { return (id & 0xFF) << 24; } -static int flat_phys_pkg_id(int initial_apic_id, int index_msb) +static u32 flat_phys_pkg_id(u32 initial_apic_id, int index_msb) { return initial_apic_id >> index_msb; } @@ -158,8 +158,6 @@ static struct apic apic_physflat __ro_after_init = { .disable_esr = 0, - .check_apicid_used = NULL, - .ioapic_phys_id_map = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .phys_pkg_id = flat_phys_pkg_id, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 966d7cf10b95..b00d52ae84fa 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -18,6 +18,8 @@ #include <asm/apic.h> +#include "local.h" + static void noop_send_IPI(int cpu, int vector) { } static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { } static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { } @@ -25,10 +27,10 @@ static void noop_send_IPI_allbutself(int vector) { } static void noop_send_IPI_all(int vector) { } static void noop_send_IPI_self(int vector) { } static void noop_apic_icr_write(u32 low, u32 id) { } -static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip) { return -1; } +static int noop_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip) { return -1; } static u64 noop_apic_icr_read(void) { return 0; } -static int noop_phys_pkg_id(int cpuid_apic, int index_msb) { return 0; } -static unsigned int noop_get_apic_id(unsigned long x) { return 0; } +static u32 noop_phys_pkg_id(u32 cpuid_apic, int index_msb) { return 0; } +static u32 noop_get_apic_id(u32 apicid) { return 0; } static void noop_apic_eoi(void) { } static u32 noop_apic_read(u32 reg) diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 63f3d7be9dc7..456a14c44f67 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -25,7 +25,7 @@ static const struct apic apic_numachip1; static const struct apic apic_numachip2; static void (*numachip_apic_icr_write)(int apicid, unsigned int val) __read_mostly; -static unsigned int numachip1_get_apic_id(unsigned long x) +static u32 numachip1_get_apic_id(u32 x) { unsigned long value; unsigned int id = (x >> 24) & 0xff; @@ -38,12 +38,12 @@ static unsigned int numachip1_get_apic_id(unsigned long x) return id; } -static u32 numachip1_set_apic_id(unsigned int id) +static u32 numachip1_set_apic_id(u32 id) { return (id & 0xff) << 24; } -static unsigned int numachip2_get_apic_id(unsigned long x) +static u32 numachip2_get_apic_id(u32 x) { u64 mcfg; @@ -51,12 +51,12 @@ static unsigned int numachip2_get_apic_id(unsigned long x) return ((mcfg >> (28 - 8)) & 0xfff00) | (x >> 24); } -static u32 numachip2_set_apic_id(unsigned int id) +static u32 numachip2_set_apic_id(u32 id) { return id << 24; } -static int numachip_phys_pkg_id(int initial_apic_id, int index_msb) +static u32 numachip_phys_pkg_id(u32 initial_apic_id, int index_msb) { return initial_apic_id >> index_msb; } @@ -71,7 +71,7 @@ static void numachip2_apic_icr_write(int apicid, unsigned int val) numachip2_write32_lcsr(NUMACHIP2_APIC_ICR, (apicid << 12) | val); } -static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) +static int numachip_wakeup_secondary(u32 phys_apicid, unsigned long start_rip) { numachip_apic_icr_write(phys_apicid, APIC_DM_INIT); numachip_apic_icr_write(phys_apicid, APIC_DM_STARTUP | @@ -161,7 +161,7 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) u64 val; u32 nodes = 1; - this_cpu_write(cpu_llc_id, node); + c->topo.llc_id = node; /* Account for nodes per socket in multi-core-module processors */ if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { @@ -169,7 +169,7 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) nodes = ((val >> 3) & 7) + 1; } - c->phys_proc_id = node / nodes; + c->topo.pkg_id = node / nodes; } static int __init numachip_system_init(void) diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 0e5535add4b5..7ee3c486cb33 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -13,12 +13,12 @@ #include "local.h" -static unsigned bigsmp_get_apic_id(unsigned long x) +static u32 bigsmp_get_apic_id(u32 x) { return (x >> 24) & 0xFF; } -static bool bigsmp_check_apicid_used(physid_mask_t *map, int apicid) +static bool bigsmp_check_apicid_used(physid_mask_t *map, u32 apicid) { return false; } @@ -29,7 +29,7 @@ static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *re physids_promote(0xFFL, retmap); } -static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) +static u32 bigsmp_phys_pkg_id(u32 cpuid_apic, int index_msb) { return cpuid_apic >> index_msb; } diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index a44ba7209ef3..0078730a512e 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -281,7 +281,7 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) } #ifdef CONFIG_SMP -static int convert_apicid_to_cpu(int apic_id) +static int convert_apicid_to_cpu(u32 apic_id) { int i; @@ -294,7 +294,8 @@ static int convert_apicid_to_cpu(int apic_id) int safe_smp_processor_id(void) { - int apicid, cpuid; + u32 apicid; + int cpuid; if (!boot_cpu_has(X86_FEATURE_APIC)) return 0; diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h index ec219c659c7d..9ea6186ea88c 100644 --- a/arch/x86/kernel/apic/local.h +++ b/arch/x86/kernel/apic/local.h @@ -15,9 +15,9 @@ /* X2APIC */ void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest); -unsigned int x2apic_get_apic_id(unsigned long id); -u32 x2apic_set_apic_id(unsigned int id); -int x2apic_phys_pkg_id(int initial_apicid, int index_msb); +u32 x2apic_get_apic_id(u32 id); +u32 x2apic_set_apic_id(u32 id); +u32 x2apic_phys_pkg_id(u32 initial_apicid, int index_msb); void x2apic_send_IPI_all(int vector); void x2apic_send_IPI_allbutself(int vector); @@ -64,6 +64,7 @@ void default_send_IPI_all(int vector); void default_send_IPI_self(int vector); bool default_apic_id_registered(void); +bool default_check_apicid_used(physid_mask_t *map, u32 apicid); #ifdef CONFIG_X86_32 void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector); diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 6b6b711678fe..d9651f15ae4f 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -55,14 +55,14 @@ msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force) * caused by the non-atomic update of the address/data pair. * * Direct update is possible when: - * - The MSI is maskable (remapped MSI does not use this code path)). - * The quirk bit is not set in this case. + * - The MSI is maskable (remapped MSI does not use this code path). + * The reservation mode bit is set in this case. * - The new vector is the same as the old vector * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up) * - The interrupt is not yet started up * - The new destination CPU is the same as the old destination CPU */ - if (!irqd_msi_nomask_quirk(irqd) || + if (!irqd_can_reserve(irqd) || cfg->vector == old_cfg.vector || old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR || !irqd_is_started(irqd) || @@ -215,8 +215,6 @@ static bool x86_init_dev_msi_info(struct device *dev, struct irq_domain *domain, if (WARN_ON_ONCE(domain != real_parent)) return false; info->chip->irq_set_affinity = msi_set_affinity; - /* See msi_set_affinity() for the gory details */ - info->flags |= MSI_FLAG_NOMASK_QUIRK; break; case DOMAIN_BUS_DMAR: case DOMAIN_BUS_AMDVI: diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 9a06df6cdd68..5eb3fbe472da 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -18,11 +18,21 @@ #include "local.h" -static int default_phys_pkg_id(int cpuid_apic, int index_msb) +static u32 default_phys_pkg_id(u32 cpuid_apic, int index_msb) { return cpuid_apic >> index_msb; } +static u32 default_get_apic_id(u32 x) +{ + unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); + + if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID)) + return (x >> 24) & 0xFF; + else + return (x >> 24) & 0x0F; +} + /* should be called last. */ static int probe_default(void) { diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 788cdb4ee394..7c9fe28f742f 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -124,17 +124,17 @@ static int x2apic_phys_probe(void) return apic == &apic_x2apic_phys; } -unsigned int x2apic_get_apic_id(unsigned long id) +u32 x2apic_get_apic_id(u32 id) { return id; } -u32 x2apic_set_apic_id(unsigned int id) +u32 x2apic_set_apic_id(u32 id) { return id; } -int x2apic_phys_pkg_id(int initial_apicid, int index_msb) +u32 x2apic_phys_pkg_id(u32 initial_apicid, int index_msb) { return initial_apicid >> index_msb; } diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 205cee567629..1b0d7336a28f 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -110,7 +110,7 @@ static void __init early_get_pnodeid(void) } else if (UVH_RH_GAM_ADDR_MAP_CONFIG) { union uvh_rh_gam_addr_map_config_u m_n_config; - m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_ADDR_MAP_CONFIG); + m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_ADDR_MAP_CONFIG); uv_cpuid.n_skt = m_n_config.s.n_skt; if (is_uv(UV3)) uv_cpuid.m_skt = m_n_config.s3.m_skt; @@ -701,7 +701,7 @@ static __init void build_uv_gr_table(void) } } -static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) +static int uv_wakeup_secondary(u32 phys_apicid, unsigned long start_rip) { unsigned long val; int pnode; @@ -779,7 +779,7 @@ static void uv_send_IPI_all(int vector) uv_send_IPI_mask(cpu_online_mask, vector); } -static u32 set_apic_id(unsigned int id) +static u32 set_apic_id(u32 id) { return id; } @@ -789,7 +789,7 @@ static unsigned int uv_read_apic_id(void) return x2apic_get_apic_id(apic_read(APIC_ID)); } -static int uv_phys_pkg_id(int initial_apicid, int index_msb) +static u32 uv_phys_pkg_id(u32 initial_apicid, int index_msb) { return uv_read_apic_id() >> index_msb; } diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index dc3576303f1a..6913b372ccf7 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -68,26 +68,19 @@ static void __used common(void) #endif BLANK(); - OFFSET(TDX_MODULE_rcx, tdx_module_output, rcx); - OFFSET(TDX_MODULE_rdx, tdx_module_output, rdx); - OFFSET(TDX_MODULE_r8, tdx_module_output, r8); - OFFSET(TDX_MODULE_r9, tdx_module_output, r9); - OFFSET(TDX_MODULE_r10, tdx_module_output, r10); - OFFSET(TDX_MODULE_r11, tdx_module_output, r11); - - BLANK(); - OFFSET(TDX_HYPERCALL_r8, tdx_hypercall_args, r8); - OFFSET(TDX_HYPERCALL_r9, tdx_hypercall_args, r9); - OFFSET(TDX_HYPERCALL_r10, tdx_hypercall_args, r10); - OFFSET(TDX_HYPERCALL_r11, tdx_hypercall_args, r11); - OFFSET(TDX_HYPERCALL_r12, tdx_hypercall_args, r12); - OFFSET(TDX_HYPERCALL_r13, tdx_hypercall_args, r13); - OFFSET(TDX_HYPERCALL_r14, tdx_hypercall_args, r14); - OFFSET(TDX_HYPERCALL_r15, tdx_hypercall_args, r15); - OFFSET(TDX_HYPERCALL_rdi, tdx_hypercall_args, rdi); - OFFSET(TDX_HYPERCALL_rsi, tdx_hypercall_args, rsi); - OFFSET(TDX_HYPERCALL_rbx, tdx_hypercall_args, rbx); - OFFSET(TDX_HYPERCALL_rdx, tdx_hypercall_args, rdx); + OFFSET(TDX_MODULE_rcx, tdx_module_args, rcx); + OFFSET(TDX_MODULE_rdx, tdx_module_args, rdx); + OFFSET(TDX_MODULE_r8, tdx_module_args, r8); + OFFSET(TDX_MODULE_r9, tdx_module_args, r9); + OFFSET(TDX_MODULE_r10, tdx_module_args, r10); + OFFSET(TDX_MODULE_r11, tdx_module_args, r11); + OFFSET(TDX_MODULE_r12, tdx_module_args, r12); + OFFSET(TDX_MODULE_r13, tdx_module_args, r13); + OFFSET(TDX_MODULE_r14, tdx_module_args, r14); + OFFSET(TDX_MODULE_r15, tdx_module_args, r15); + OFFSET(TDX_MODULE_rbx, tdx_module_args, rbx); + OFFSET(TDX_MODULE_rdi, tdx_module_args, rdi); + OFFSET(TDX_MODULE_rsi, tdx_module_args, rsi); BLANK(); OFFSET(BP_scratch, boot_params, scratch); diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index faa9f2299848..e9ad518a5003 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -48,11 +48,6 @@ EXPORT_SYMBOL_GPL(__x86_call_count); extern s32 __call_sites[], __call_sites_end[]; -struct thunk_desc { - void *template; - unsigned int template_size; -}; - struct core_text { unsigned long base; unsigned long end; diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 4350f6bfc064..93eabf544031 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -54,6 +54,8 @@ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o obj-$(CONFIG_ACRN_GUEST) += acrn.o +obj-$(CONFIG_DEBUG_FS) += debugfs.o + quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $@ $^ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 03ef962a6992..a7eab05e5f29 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -80,6 +80,10 @@ static const int amd_div0[] = AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x00, 0x0, 0x2f, 0xf), AMD_MODEL_RANGE(0x17, 0x50, 0x0, 0x5f, 0xf)); +static const int amd_erratum_1485[] = + AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x19, 0x10, 0x0, 0x1f, 0xf), + AMD_MODEL_RANGE(0x19, 0x60, 0x0, 0xaf, 0xf)); + static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) { int osvw_id = *erratum++; @@ -378,7 +382,7 @@ static int nearby_node(int apicid) #endif /* - * Fix up cpu_core_id for pre-F17h systems to be in the + * Fix up topo::core_id for pre-F17h systems to be in the * [0 .. cores_per_node - 1] range. Not really needed but * kept so as not to break existing setups. */ @@ -390,7 +394,7 @@ static void legacy_fixup_core_id(struct cpuinfo_x86 *c) return; cus_per_node = c->x86_max_cores / nodes_per_socket; - c->cpu_core_id %= cus_per_node; + c->topo.core_id %= cus_per_node; } /* @@ -401,8 +405,6 @@ static void legacy_fixup_core_id(struct cpuinfo_x86 *c) */ static void amd_get_topology(struct cpuinfo_x86 *c) { - int cpu = smp_processor_id(); - /* get information required for multi-node processors */ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { int err; @@ -410,13 +412,13 @@ static void amd_get_topology(struct cpuinfo_x86 *c) cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - c->cpu_die_id = ecx & 0xff; + c->topo.die_id = ecx & 0xff; if (c->x86 == 0x15) - c->cu_id = ebx & 0xff; + c->topo.cu_id = ebx & 0xff; if (c->x86 >= 0x17) { - c->cpu_core_id = ebx & 0xff; + c->topo.core_id = ebx & 0xff; if (smp_num_siblings > 1) c->x86_max_cores /= smp_num_siblings; @@ -430,15 +432,14 @@ static void amd_get_topology(struct cpuinfo_x86 *c) if (!err) c->x86_coreid_bits = get_count_order(c->x86_max_cores); - cacheinfo_amd_init_llc_id(c, cpu); + cacheinfo_amd_init_llc_id(c); } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; rdmsrl(MSR_FAM10H_NODE_ID, value); - c->cpu_die_id = value & 7; - - per_cpu(cpu_llc_id, cpu) = c->cpu_die_id; + c->topo.die_id = value & 7; + c->topo.llc_id = c->topo.die_id; } else return; @@ -455,15 +456,14 @@ static void amd_get_topology(struct cpuinfo_x86 *c) static void amd_detect_cmp(struct cpuinfo_x86 *c) { unsigned bits; - int cpu = smp_processor_id(); bits = c->x86_coreid_bits; /* Low order bits define the core id (index of core in socket) */ - c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); + c->topo.core_id = c->topo.initial_apicid & ((1 << bits)-1); /* Convert the initial APIC ID into the socket ID */ - c->phys_proc_id = c->initial_apicid >> bits; + c->topo.pkg_id = c->topo.initial_apicid >> bits; /* use socket ID also for last level cache */ - per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id; + c->topo.llc_id = c->topo.die_id = c->topo.pkg_id; } u32 amd_get_nodes_per_socket(void) @@ -477,11 +477,11 @@ static void srat_detect_node(struct cpuinfo_x86 *c) #ifdef CONFIG_NUMA int cpu = smp_processor_id(); int node; - unsigned apicid = c->apicid; + unsigned apicid = c->topo.apicid; node = numa_cpu_node(cpu); if (node == NUMA_NO_NODE) - node = get_llc_id(cpu); + node = per_cpu_llc_id(cpu); /* * On multi-fabric platform (e.g. Numascale NumaChip) a @@ -511,7 +511,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c) * through CPU mapping may alter the outcome, directly * access __apicid_to_node[]. */ - int ht_nodeid = c->initial_apicid; + int ht_nodeid = c->topo.initial_apicid; if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE) node = __apicid_to_node[ht_nodeid]; @@ -1010,7 +1010,6 @@ static bool cpu_has_zenbleed_microcode(void) default: return false; - break; } if (boot_cpu_data.microcode < good_rev) @@ -1040,6 +1039,8 @@ static void zenbleed_check(struct cpuinfo_x86 *c) static void init_amd(struct cpuinfo_x86 *c) { + u64 vm_cr; + early_init_amd(c); /* @@ -1056,7 +1057,7 @@ static void init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_FSRS); /* get apicid instead of initial apic id from cpuid */ - c->apicid = read_apic_id(); + c->topo.apicid = read_apic_id(); /* K6s reports MCEs but don't actually have all the MSRs */ if (c->x86 < 6) @@ -1091,6 +1092,14 @@ static void init_amd(struct cpuinfo_x86 *c) init_amd_cacheinfo(c); + if (cpu_has(c, X86_FEATURE_SVM)) { + rdmsrl(MSR_VM_CR, vm_cr); + if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) { + pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n"); + clear_cpu_cap(c, X86_FEATURE_SVM); + } + } + if (!cpu_has(c, X86_FEATURE_LFENCE_RDTSC) && cpu_has(c, X86_FEATURE_XMM2)) { /* * Use LFENCE for execution serialization. On families which @@ -1149,6 +1158,10 @@ static void init_amd(struct cpuinfo_x86 *c) pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n"); setup_force_cpu_bug(X86_BUG_DIV0); } + + if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && + cpu_has_amd_erratum(c, amd_erratum_1485)) + msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 10499bcd4e39..bb0ab8466b91 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -63,7 +63,7 @@ EXPORT_SYMBOL_GPL(x86_pred_cmd); static DEFINE_MUTEX(spec_ctrl_mutex); -void (*x86_return_thunk)(void) __ro_after_init = &__x86_return_thunk; +void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk; /* Update SPEC_CTRL MSR and its cached copy unconditionally */ static void update_spec_ctrl(u64 val) @@ -717,7 +717,7 @@ void update_gds_msr(void) case GDS_MITIGATION_UCODE_NEEDED: case GDS_MITIGATION_HYPERVISOR: return; - }; + } wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); @@ -1019,7 +1019,6 @@ static void __init retbleed_select_mitigation(void) do_cmd_auto: case RETBLEED_CMD_AUTO: - default: if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) @@ -1042,8 +1041,7 @@ do_cmd_auto: setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_UNRET); - if (IS_ENABLED(CONFIG_RETHUNK)) - x86_return_thunk = retbleed_return_thunk; + x86_return_thunk = retbleed_return_thunk; if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) @@ -1061,7 +1059,8 @@ do_cmd_auto: case RETBLEED_MITIGATION_STUFF: setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH); - x86_set_skl_return_thunk(); + + x86_return_thunk = call_depth_return_thunk; break; default: @@ -1290,6 +1289,8 @@ spectre_v2_user_select_mitigation(void) spectre_v2_user_ibpb = mode; switch (cmd) { + case SPECTRE_V2_USER_CMD_NONE: + break; case SPECTRE_V2_USER_CMD_FORCE: case SPECTRE_V2_USER_CMD_PRCTL_IBPB: case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: @@ -1301,8 +1302,6 @@ spectre_v2_user_select_mitigation(void) case SPECTRE_V2_USER_CMD_SECCOMP: static_branch_enable(&switch_mm_cond_ibpb); break; - default: - break; } pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n", @@ -2160,6 +2159,10 @@ static int l1d_flush_prctl_get(struct task_struct *task) static int ssb_prctl_get(struct task_struct *task) { switch (ssb_mode) { + case SPEC_STORE_BYPASS_NONE: + if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + return PR_SPEC_ENABLE; + return PR_SPEC_NOT_AFFECTED; case SPEC_STORE_BYPASS_DISABLE: return PR_SPEC_DISABLE; case SPEC_STORE_BYPASS_SECCOMP: @@ -2171,11 +2174,8 @@ static int ssb_prctl_get(struct task_struct *task) if (task_spec_ssb_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_DISABLE; return PR_SPEC_PRCTL | PR_SPEC_ENABLE; - default: - if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) - return PR_SPEC_ENABLE; - return PR_SPEC_NOT_AFFECTED; } + BUG(); } static int ib_prctl_get(struct task_struct *task) @@ -2353,6 +2353,8 @@ early_param("l1tf", l1tf_cmdline); enum srso_mitigation { SRSO_MITIGATION_NONE, + SRSO_MITIGATION_UCODE_NEEDED, + SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED, SRSO_MITIGATION_MICROCODE, SRSO_MITIGATION_SAFE_RET, SRSO_MITIGATION_IBPB, @@ -2368,11 +2370,13 @@ enum srso_mitigation_cmd { }; static const char * const srso_strings[] = { - [SRSO_MITIGATION_NONE] = "Vulnerable", - [SRSO_MITIGATION_MICROCODE] = "Mitigation: microcode", - [SRSO_MITIGATION_SAFE_RET] = "Mitigation: safe RET", - [SRSO_MITIGATION_IBPB] = "Mitigation: IBPB", - [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only" + [SRSO_MITIGATION_NONE] = "Vulnerable", + [SRSO_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", + [SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED] = "Vulnerable: Safe RET, no microcode", + [SRSO_MITIGATION_MICROCODE] = "Vulnerable: Microcode, no safe RET", + [SRSO_MITIGATION_SAFE_RET] = "Mitigation: Safe RET", + [SRSO_MITIGATION_IBPB] = "Mitigation: IBPB", + [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only" }; static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE; @@ -2406,34 +2410,44 @@ static void __init srso_select_mitigation(void) { bool has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE); - if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off()) - goto pred_cmd; + if (cpu_mitigations_off()) + return; - if (!has_microcode) { - pr_warn("IBPB-extending microcode not applied!\n"); - pr_warn(SRSO_NOTICE); - } else { + if (!boot_cpu_has_bug(X86_BUG_SRSO)) { + if (boot_cpu_has(X86_FEATURE_SBPB)) + x86_pred_cmd = PRED_CMD_SBPB; + return; + } + + if (has_microcode) { /* * Zen1/2 with SMT off aren't vulnerable after the right * IBPB microcode has been applied. + * + * Zen1/2 don't have SBPB, no need to try to enable it here. */ if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) { setup_force_cpu_cap(X86_FEATURE_SRSO_NO); return; } - } - if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { - if (has_microcode) { - pr_err("Retbleed IBPB mitigation enabled, using same for SRSO\n"); + if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { srso_mitigation = SRSO_MITIGATION_IBPB; - goto pred_cmd; + goto out; } + } else { + pr_warn("IBPB-extending microcode not applied!\n"); + pr_warn(SRSO_NOTICE); + + /* may be overwritten by SRSO_CMD_SAFE_RET below */ + srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED; } switch (srso_cmd) { case SRSO_CMD_OFF: - goto pred_cmd; + if (boot_cpu_has(X86_FEATURE_SBPB)) + x86_pred_cmd = PRED_CMD_SBPB; + return; case SRSO_CMD_MICROCODE: if (has_microcode) { @@ -2458,10 +2472,12 @@ static void __init srso_select_mitigation(void) setup_force_cpu_cap(X86_FEATURE_SRSO); x86_return_thunk = srso_return_thunk; } - srso_mitigation = SRSO_MITIGATION_SAFE_RET; + if (has_microcode) + srso_mitigation = SRSO_MITIGATION_SAFE_RET; + else + srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED; } else { pr_err("WARNING: kernel not compiled with CPU_SRSO.\n"); - goto pred_cmd; } break; @@ -2473,7 +2489,6 @@ static void __init srso_select_mitigation(void) } } else { pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n"); - goto pred_cmd; } break; @@ -2485,20 +2500,12 @@ static void __init srso_select_mitigation(void) } } else { pr_err("WARNING: kernel not compiled with CPU_SRSO.\n"); - goto pred_cmd; } break; - - default: - break; } - pr_info("%s%s\n", srso_strings[srso_mitigation], (has_microcode ? "" : ", no microcode")); - -pred_cmd: - if ((boot_cpu_has(X86_FEATURE_SRSO_NO) || srso_cmd == SRSO_CMD_OFF) && - boot_cpu_has(X86_FEATURE_SBPB)) - x86_pred_cmd = PRED_CMD_SBPB; +out: + pr_info("%s\n", srso_strings[srso_mitigation]); } #undef pr_fmt @@ -2704,9 +2711,7 @@ static ssize_t srso_show_state(char *buf) if (boot_cpu_has(X86_FEATURE_SRSO_NO)) return sysfs_emit(buf, "Mitigation: SMT disabled\n"); - return sysfs_emit(buf, "%s%s\n", - srso_strings[srso_mitigation], - boot_cpu_has(X86_FEATURE_IBPB_BRTYPE) ? "" : ", no microcode"); + return sysfs_emit(buf, "%s\n", srso_strings[srso_mitigation]); } static ssize_t gds_show_state(char *buf) diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index 8f86eacf69f7..c131c412db89 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -661,7 +661,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c) return i; } -void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu) +void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c) { /* * We may have multiple LLCs if L3 caches exist, so check if we @@ -672,13 +672,13 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu) if (c->x86 < 0x17) { /* LLC is at the node level. */ - per_cpu(cpu_llc_id, cpu) = c->cpu_die_id; + c->topo.llc_id = c->topo.die_id; } else if (c->x86 == 0x17 && c->x86_model <= 0x1F) { /* * LLC is at the core complex level. * Core complex ID is ApicId[3] for these processors. */ - per_cpu(cpu_llc_id, cpu) = c->apicid >> 3; + c->topo.llc_id = c->topo.apicid >> 3; } else { /* * LLC ID is calculated from the number of threads sharing the @@ -694,12 +694,12 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu) if (num_sharing_cache) { int bits = get_count_order(num_sharing_cache); - per_cpu(cpu_llc_id, cpu) = c->apicid >> bits; + c->topo.llc_id = c->topo.apicid >> bits; } } } -void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu) +void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c) { /* * We may have multiple LLCs if L3 caches exist, so check if we @@ -712,7 +712,7 @@ void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu) * LLC is at the core complex level. * Core complex ID is ApicId[3] for these processors. */ - per_cpu(cpu_llc_id, cpu) = c->apicid >> 3; + c->topo.llc_id = c->topo.apicid >> 3; } void init_amd_cacheinfo(struct cpuinfo_x86 *c) @@ -740,9 +740,6 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; -#ifdef CONFIG_SMP - unsigned int cpu = c->cpu_index; -#endif if (c->cpuid_level > 3) { static int is_initialized; @@ -776,13 +773,13 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) new_l2 = this_leaf.size/1024; num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; index_msb = get_count_order(num_threads_sharing); - l2_id = c->apicid & ~((1 << index_msb) - 1); + l2_id = c->topo.apicid & ~((1 << index_msb) - 1); break; case 3: new_l3 = this_leaf.size/1024; num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; index_msb = get_count_order(num_threads_sharing); - l3_id = c->apicid & ~((1 << index_msb) - 1); + l3_id = c->topo.apicid & ~((1 << index_msb) - 1); break; default: break; @@ -856,30 +853,24 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) if (new_l2) { l2 = new_l2; -#ifdef CONFIG_SMP - per_cpu(cpu_llc_id, cpu) = l2_id; - per_cpu(cpu_l2c_id, cpu) = l2_id; -#endif + c->topo.llc_id = l2_id; + c->topo.l2c_id = l2_id; } if (new_l3) { l3 = new_l3; -#ifdef CONFIG_SMP - per_cpu(cpu_llc_id, cpu) = l3_id; -#endif + c->topo.llc_id = l3_id; } -#ifdef CONFIG_SMP /* - * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in + * If llc_id is not yet set, this means cpuid_level < 4 which in * turns means that the only possibility is SMT (as indicated in * cpuid1). Since cpuid2 doesn't specify shared caches, and we know * that SMT shares all caches, we can unconditionally set cpu_llc_id to - * c->phys_proc_id. + * c->topo.pkg_id. */ - if (per_cpu(cpu_llc_id, cpu) == BAD_APICID) - per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; -#endif + if (c->topo.llc_id == BAD_APICID) + c->topo.llc_id = c->topo.pkg_id; c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d)); @@ -915,7 +906,7 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, unsigned int apicid, nshared, first, last; nshared = base->eax.split.num_threads_sharing + 1; - apicid = cpu_data(cpu).apicid; + apicid = cpu_data(cpu).topo.apicid; first = apicid - (apicid % nshared); last = first + nshared - 1; @@ -924,14 +915,14 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, if (!this_cpu_ci->info_list) continue; - apicid = cpu_data(i).apicid; + apicid = cpu_data(i).topo.apicid; if ((apicid < first) || (apicid > last)) continue; this_leaf = this_cpu_ci->info_list + index; for_each_online_cpu(sibling) { - apicid = cpu_data(sibling).apicid; + apicid = cpu_data(sibling).topo.apicid; if ((apicid < first) || (apicid > last)) continue; cpumask_set_cpu(sibling, @@ -969,7 +960,7 @@ static void __cache_cpumap_setup(unsigned int cpu, int index, index_msb = get_count_order(num_threads_sharing); for_each_online_cpu(i) - if (cpu_data(i).apicid >> index_msb == c->apicid >> index_msb) { + if (cpu_data(i).topo.apicid >> index_msb == c->topo.apicid >> index_msb) { struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i); if (i == cpu || !sib_cpu_ci->info_list) @@ -1024,7 +1015,7 @@ static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs) num_threads_sharing = 1 + id4_regs->eax.split.num_threads_sharing; index_msb = get_count_order(num_threads_sharing); - id4_regs->id = c->apicid >> index_msb; + id4_regs->id = c->topo.apicid >> index_msb; } int populate_cache_leaves(unsigned int cpu) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4e5ffc8b0e46..5d9591146244 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -62,6 +62,7 @@ #include <asm/intel-family.h> #include <asm/cpu_device_id.h> #include <asm/uv/uv.h> +#include <asm/ia32.h> #include <asm/set_memory.h> #include <asm/traps.h> #include <asm/sev.h> @@ -74,18 +75,6 @@ u32 elf_hwcap2 __read_mostly; int smp_num_siblings = 1; EXPORT_SYMBOL(smp_num_siblings); -/* Last level cache ID of each logical CPU */ -DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID; - -u16 get_llc_id(unsigned int cpu) -{ - return per_cpu(cpu_llc_id, cpu); -} -EXPORT_SYMBOL_GPL(get_llc_id); - -/* L2 cache ID of each logical CPU */ -DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id) = BAD_APICID; - static struct ppin_info { int feature; int msr_ppin_ctl; @@ -914,7 +903,7 @@ void detect_ht(struct cpuinfo_x86 *c) return; index_msb = get_count_order(smp_num_siblings); - c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb); + c->topo.pkg_id = apic->phys_pkg_id(c->topo.initial_apicid, index_msb); smp_num_siblings = smp_num_siblings / c->x86_max_cores; @@ -922,8 +911,8 @@ void detect_ht(struct cpuinfo_x86 *c) core_bits = get_count_order(c->x86_max_cores); - c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) & - ((1 << core_bits) - 1); + c->topo.core_id = apic->phys_pkg_id(c->topo.initial_apicid, index_msb) & + ((1 << core_bits) - 1); #endif } @@ -1114,18 +1103,34 @@ void get_cpu_cap(struct cpuinfo_x86 *c) void get_cpu_address_sizes(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; + bool vp_bits_from_cpuid = true; - if (c->extended_cpuid_level >= 0x80000008) { + if (!cpu_has(c, X86_FEATURE_CPUID) || + (c->extended_cpuid_level < 0x80000008)) + vp_bits_from_cpuid = false; + + if (vp_bits_from_cpuid) { cpuid(0x80000008, &eax, &ebx, &ecx, &edx); c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; + } else { + if (IS_ENABLED(CONFIG_X86_64)) { + c->x86_clflush_size = 64; + c->x86_phys_bits = 36; + c->x86_virt_bits = 48; + } else { + c->x86_clflush_size = 32; + c->x86_virt_bits = 32; + c->x86_phys_bits = 32; + + if (cpu_has(c, X86_FEATURE_PAE) || + cpu_has(c, X86_FEATURE_PSE36)) + c->x86_phys_bits = 36; + } } -#ifdef CONFIG_X86_32 - else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36)) - c->x86_phys_bits = 36; -#endif c->x86_cache_bits = c->x86_phys_bits; + c->x86_cache_alignment = c->x86_clflush_size; } static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) @@ -1579,17 +1584,6 @@ static void __init cpu_parse_early_param(void) */ static void __init early_identify_cpu(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_64 - c->x86_clflush_size = 64; - c->x86_phys_bits = 36; - c->x86_virt_bits = 48; -#else - c->x86_clflush_size = 32; - c->x86_phys_bits = 32; - c->x86_virt_bits = 32; -#endif - c->x86_cache_alignment = c->x86_clflush_size; - memset(&c->x86_capability, 0, sizeof(c->x86_capability)); c->extended_cpuid_level = 0; @@ -1601,7 +1595,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) cpu_detect(c); get_cpu_vendor(c); get_cpu_cap(c); - get_cpu_address_sizes(c); setup_force_cpu_cap(X86_FEATURE_CPUID); cpu_parse_early_param(); @@ -1617,6 +1610,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_clear_cpu_cap(X86_FEATURE_CPUID); } + get_cpu_address_sizes(c); + setup_force_cpu_cap(X86_FEATURE_ALWAYS); cpu_set_bug_bits(c); @@ -1761,15 +1756,15 @@ static void generic_identify(struct cpuinfo_x86 *c) get_cpu_address_sizes(c); if (c->cpuid_level >= 0x00000001) { - c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; + c->topo.initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; #ifdef CONFIG_X86_32 # ifdef CONFIG_SMP - c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); + c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0); # else - c->apicid = c->initial_apicid; + c->topo.apicid = c->topo.initial_apicid; # endif #endif - c->phys_proc_id = c->initial_apicid; + c->topo.pkg_id = c->topo.initial_apicid; } get_model_name(c); /* Default name */ @@ -1799,18 +1794,19 @@ static void generic_identify(struct cpuinfo_x86 *c) static void validate_apic_and_package_id(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP - unsigned int apicid, cpu = smp_processor_id(); + unsigned int cpu = smp_processor_id(); + u32 apicid; apicid = apic->cpu_present_to_apicid(cpu); - if (apicid != c->apicid) { + if (apicid != c->topo.apicid) { pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x APIC: %x\n", - cpu, apicid, c->initial_apicid); + cpu, apicid, c->topo.initial_apicid); } - BUG_ON(topology_update_package_map(c->phys_proc_id, cpu)); - BUG_ON(topology_update_die_map(c->cpu_die_id, cpu)); + BUG_ON(topology_update_package_map(c->topo.pkg_id, cpu)); + BUG_ON(topology_update_die_map(c->topo.die_id, cpu)); #else - c->logical_proc_id = 0; + c->topo.logical_pkg_id = 0; #endif } @@ -1829,7 +1825,9 @@ static void identify_cpu(struct cpuinfo_x86 *c) c->x86_model_id[0] = '\0'; /* Unset */ c->x86_max_cores = 1; c->x86_coreid_bits = 0; - c->cu_id = 0xff; + c->topo.cu_id = 0xff; + c->topo.llc_id = BAD_APICID; + c->topo.l2c_id = BAD_APICID; #ifdef CONFIG_X86_64 c->x86_clflush_size = 64; c->x86_phys_bits = 36; @@ -1855,7 +1853,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) apply_forced_caps(c); #ifdef CONFIG_X86_64 - c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); + c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0); #endif /* @@ -2074,24 +2072,24 @@ void syscall_init(void) wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); -#ifdef CONFIG_IA32_EMULATION - wrmsrl_cstar((unsigned long)entry_SYSCALL_compat); - /* - * This only works on Intel CPUs. - * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. - * This does not cause SYSENTER to jump to the wrong location, because - * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). - */ - wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, - (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1)); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); -#else - wrmsrl_cstar((unsigned long)ignore_sysret); - wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); -#endif + if (ia32_enabled()) { + wrmsrl_cstar((unsigned long)entry_SYSCALL_compat); + /* + * This only works on Intel CPUs. + * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. + * This does not cause SYSENTER to jump to the wrong location, because + * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). + */ + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, + (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1)); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); + } else { + wrmsrl_cstar((unsigned long)entry_SYSCALL32_ignore); + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); + } /* * Flags to clear on syscall; clear as much as possible diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 1dcd7d4e38ef..885281ae79a5 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -78,6 +78,9 @@ extern int detect_ht_early(struct cpuinfo_x86 *c); extern void detect_ht(struct cpuinfo_x86 *c); extern void check_null_seg_clears_base(struct cpuinfo_x86 *c); +void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c); +void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c); + unsigned int aperfmperf_get_khz(int cpu); void cpu_select_mitigations(void); diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c new file mode 100644 index 000000000000..0c179d684b3b --- /dev/null +++ b/arch/x86/kernel/cpu/debugfs.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/debugfs.h> + +#include <asm/apic.h> +#include <asm/processor.h> + +static int cpu_debug_show(struct seq_file *m, void *p) +{ + unsigned long cpu = (unsigned long)m->private; + struct cpuinfo_x86 *c = per_cpu_ptr(&cpu_info, cpu); + + seq_printf(m, "online: %d\n", cpu_online(cpu)); + if (!c->initialized) + return 0; + + seq_printf(m, "initial_apicid: %x\n", c->topo.initial_apicid); + seq_printf(m, "apicid: %x\n", c->topo.apicid); + seq_printf(m, "pkg_id: %u\n", c->topo.pkg_id); + seq_printf(m, "die_id: %u\n", c->topo.die_id); + seq_printf(m, "cu_id: %u\n", c->topo.cu_id); + seq_printf(m, "core_id: %u\n", c->topo.core_id); + seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id); + seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id); + seq_printf(m, "llc_id: %u\n", c->topo.llc_id); + seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id); + seq_printf(m, "max_cores: %u\n", c->x86_max_cores); + seq_printf(m, "max_die_per_pkg: %u\n", __max_die_per_package); + seq_printf(m, "smp_num_siblings: %u\n", smp_num_siblings); + return 0; +} + +static int cpu_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, cpu_debug_show, inode->i_private); +} + +static const struct file_operations dfs_cpu_ops = { + .open = cpu_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static __init int cpu_init_debugfs(void) +{ + struct dentry *dir, *base = debugfs_create_dir("topo", arch_debugfs_dir); + unsigned long id; + char name[24]; + + dir = debugfs_create_dir("cpus", base); + for_each_possible_cpu(id) { + sprintf(name, "%lu", id); + debugfs_create_file(name, 0444, dir, (void *)id, &dfs_cpu_ops); + } + return 0; +} +late_initcall(cpu_init_debugfs); diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index defdc594be14..6f247d66758d 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -63,8 +63,6 @@ static void hygon_get_topology_early(struct cpuinfo_x86 *c) */ static void hygon_get_topology(struct cpuinfo_x86 *c) { - int cpu = smp_processor_id(); - /* get information required for multi-node processors */ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { int err; @@ -72,9 +70,9 @@ static void hygon_get_topology(struct cpuinfo_x86 *c) cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - c->cpu_die_id = ecx & 0xff; + c->topo.die_id = ecx & 0xff; - c->cpu_core_id = ebx & 0xff; + c->topo.core_id = ebx & 0xff; if (smp_num_siblings > 1) c->x86_max_cores /= smp_num_siblings; @@ -87,17 +85,20 @@ static void hygon_get_topology(struct cpuinfo_x86 *c) if (!err) c->x86_coreid_bits = get_count_order(c->x86_max_cores); - /* Socket ID is ApicId[6] for these processors. */ - c->phys_proc_id = c->apicid >> APICID_SOCKET_ID_BIT; + /* + * Socket ID is ApicId[6] for the processors with model <= 0x3 + * when running on host. + */ + if (!boot_cpu_has(X86_FEATURE_HYPERVISOR) && c->x86_model <= 0x3) + c->topo.pkg_id = c->topo.apicid >> APICID_SOCKET_ID_BIT; - cacheinfo_hygon_init_llc_id(c, cpu); + cacheinfo_hygon_init_llc_id(c); } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; rdmsrl(MSR_FAM10H_NODE_ID, value); - c->cpu_die_id = value & 7; - - per_cpu(cpu_llc_id, cpu) = c->cpu_die_id; + c->topo.die_id = value & 7; + c->topo.llc_id = c->topo.die_id; } else return; @@ -112,15 +113,14 @@ static void hygon_get_topology(struct cpuinfo_x86 *c) static void hygon_detect_cmp(struct cpuinfo_x86 *c) { unsigned int bits; - int cpu = smp_processor_id(); bits = c->x86_coreid_bits; /* Low order bits define the core id (index of core in socket) */ - c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); + c->topo.core_id = c->topo.initial_apicid & ((1 << bits)-1); /* Convert the initial APIC ID into the socket ID */ - c->phys_proc_id = c->initial_apicid >> bits; - /* use socket ID also for last level cache */ - per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id; + c->topo.pkg_id = c->topo.initial_apicid >> bits; + /* Use package ID also for last level cache */ + c->topo.llc_id = c->topo.die_id = c->topo.pkg_id; } static void srat_detect_node(struct cpuinfo_x86 *c) @@ -128,11 +128,11 @@ static void srat_detect_node(struct cpuinfo_x86 *c) #ifdef CONFIG_NUMA int cpu = smp_processor_id(); int node; - unsigned int apicid = c->apicid; + unsigned int apicid = c->topo.apicid; node = numa_cpu_node(cpu); if (node == NUMA_NO_NODE) - node = per_cpu(cpu_llc_id, cpu); + node = c->topo.llc_id; /* * On multi-fabric platform (e.g. Numascale NumaChip) a @@ -161,7 +161,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c) * through CPU mapping may alter the outcome, directly * access __apicid_to_node[]. */ - int ht_nodeid = c->initial_apicid; + int ht_nodeid = c->topo.initial_apicid; if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE) node = __apicid_to_node[ht_nodeid]; @@ -290,6 +290,8 @@ static void early_init_hygon(struct cpuinfo_x86 *c) static void init_hygon(struct cpuinfo_x86 *c) { + u64 vm_cr; + early_init_hygon(c); /* @@ -301,7 +303,7 @@ static void init_hygon(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_REP_GOOD); /* get apicid instead of initial apic id from cpuid */ - c->apicid = read_apic_id(); + c->topo.apicid = read_apic_id(); /* * XXX someone from Hygon needs to confirm this DTRT @@ -320,6 +322,14 @@ static void init_hygon(struct cpuinfo_x86 *c) init_hygon_cacheinfo(c); + if (cpu_has(c, X86_FEATURE_SVM)) { + rdmsrl(MSR_VM_CR, vm_cr); + if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) { + pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n"); + clear_cpu_cap(c, X86_FEATURE_SVM); + } + } + if (cpu_has(c, X86_FEATURE_XMM2)) { /* * Use LFENCE for execution serialization. On families which diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index be4045628fd3..a927a8fc9624 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -314,19 +314,6 @@ static void early_init_intel(struct cpuinfo_x86 *c) setup_clear_cpu_cap(X86_FEATURE_PGE); } - if (c->cpuid_level >= 0x00000001) { - u32 eax, ebx, ecx, edx; - - cpuid(0x00000001, &eax, &ebx, &ecx, &edx); - /* - * If HTT (EDX[28]) is set EBX[16:23] contain the number of - * apicids which are reserved per package. Store the resulting - * shift value for the package management code. - */ - if (edx & (1U << 28)) - c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff); - } - check_memory_type_self_snoop_errata(c); /* @@ -1016,7 +1003,6 @@ static struct ctl_table sld_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static int __init sld_mitigate_sysctl_init(void) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index c267f43de39e..f3517b8a8e91 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -713,17 +713,75 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) deferred_error_interrupt_enable(c); } -bool amd_mce_is_memory_error(struct mce *m) +/* + * DRAM ECC errors are reported in the Northbridge (bank 4) with + * Extended Error Code 8. + */ +static bool legacy_mce_is_memory_error(struct mce *m) +{ + return m->bank == 4 && XEC(m->status, 0x1f) == 8; +} + +/* + * DRAM ECC errors are reported in Unified Memory Controllers with + * Extended Error Code 0. + */ +static bool smca_mce_is_memory_error(struct mce *m) { enum smca_bank_types bank_type; - /* ErrCodeExt[20:16] */ - u8 xec = (m->status >> 16) & 0x1f; + + if (XEC(m->status, 0x3f)) + return false; bank_type = smca_get_bank_type(m->extcpu, m->bank); + + return bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2; +} + +bool amd_mce_is_memory_error(struct mce *m) +{ if (mce_flags.smca) - return (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) && xec == 0x0; + return smca_mce_is_memory_error(m); + else + return legacy_mce_is_memory_error(m); +} + +/* + * AMD systems do not have an explicit indicator that the value in MCA_ADDR is + * a system physical address. Therefore, individual cases need to be detected. + * Future cases and checks will be added as needed. + * + * 1) General case + * a) Assume address is not usable. + * 2) Poison errors + * a) Indicated by MCA_STATUS[43]: poison. Defined for all banks except legacy + * northbridge (bank 4). + * b) Refers to poison consumption in the core. Does not include "no action", + * "action optional", or "deferred" error severities. + * c) Will include a usable address so that immediate action can be taken. + * 3) Northbridge DRAM ECC errors + * a) Reported in legacy bank 4 with extended error code (XEC) 8. + * b) MCA_STATUS[43] is *not* defined as poison in legacy bank 4. Therefore, + * this bit should not be checked. + * + * NOTE: SMCA UMC memory errors fall into case #1. + */ +bool amd_mce_usable_address(struct mce *m) +{ + /* Check special northbridge case 3) first. */ + if (!mce_flags.smca) { + if (legacy_mce_is_memory_error(m)) + return true; + else if (m->bank == 4) + return false; + } - return m->bank == 4 && xec == 0x8; + /* Check poison bit for all other bank types. */ + if (m->status & MCI_STATUS_POISON) + return true; + + /* Assume address is not usable for all others. */ + return false; } static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index 8ed341714686..7f7309ff67d0 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -103,9 +103,9 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) m.socketid = -1; for_each_possible_cpu(cpu) { - if (cpu_data(cpu).initial_apicid == lapic_id) { + if (cpu_data(cpu).topo.initial_apicid == lapic_id) { m.extcpu = cpu; - m.socketid = cpu_data(m.extcpu).phys_proc_id; + m.socketid = cpu_data(m.extcpu).topo.pkg_id; break; } } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 6f35f724cc14..7b397370b4d6 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -123,8 +123,8 @@ void mce_setup(struct mce *m) m->time = __ktime_get_real_seconds(); m->cpuvendor = boot_cpu_data.x86_vendor; m->cpuid = cpuid_eax(1); - m->socketid = cpu_data(m->extcpu).phys_proc_id; - m->apicid = cpu_data(m->extcpu).initial_apicid; + m->socketid = cpu_data(m->extcpu).topo.pkg_id; + m->apicid = cpu_data(m->extcpu).topo.initial_apicid; m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); m->ppin = cpu_data(m->extcpu).ppin; m->microcode = boot_cpu_data.microcode; @@ -453,32 +453,22 @@ static void mce_irq_work_cb(struct irq_work *entry) mce_schedule_work(); } -/* - * Check if the address reported by the CPU is in a format we can parse. - * It would be possible to add code for most other cases, but all would - * be somewhat complicated (e.g. segment offset would require an instruction - * parser). So only support physical addresses up to page granularity for now. - */ -int mce_usable_address(struct mce *m) +bool mce_usable_address(struct mce *m) { if (!(m->status & MCI_STATUS_ADDRV)) - return 0; - - /* Checks after this one are Intel/Zhaoxin-specific: */ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && - boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) - return 1; - - if (!(m->status & MCI_STATUS_MISCV)) - return 0; + return false; - if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) - return 0; + switch (m->cpuvendor) { + case X86_VENDOR_AMD: + return amd_mce_usable_address(m); - if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) - return 0; + case X86_VENDOR_INTEL: + case X86_VENDOR_ZHAOXIN: + return intel_mce_usable_address(m); - return 1; + default: + return true; + } } EXPORT_SYMBOL_GPL(mce_usable_address); diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index f5323551c1a9..52bce533ddcc 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -536,3 +536,23 @@ bool intel_filter_mce(struct mce *m) return false; } + +/* + * Check if the address reported by the CPU is in a format we can parse. + * It would be possible to add code for most other cases, but all would + * be somewhat complicated (e.g. segment offset would require an instruction + * parser). So only support physical addresses up to page granularity for now. + */ +bool intel_mce_usable_address(struct mce *m) +{ + if (!(m->status & MCI_STATUS_MISCV)) + return false; + + if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) + return false; + + if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) + return false; + + return true; +} diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index bcf1b3c66c9c..e13a26c9c0ac 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -49,6 +49,7 @@ void intel_init_cmci(void); void intel_init_lmce(void); void intel_clear_lmce(void); bool intel_filter_mce(struct mce *m); +bool intel_mce_usable_address(struct mce *m); #else # define cmci_intel_adjust_timer mce_adjust_timer_default static inline bool mce_intel_cmci_poll(void) { return false; } @@ -58,6 +59,7 @@ static inline void intel_init_cmci(void) { } static inline void intel_init_lmce(void) { } static inline void intel_clear_lmce(void) { } static inline bool intel_filter_mce(struct mce *m) { return false; } +static inline bool intel_mce_usable_address(struct mce *m) { return false; } #endif void mce_timer_kick(unsigned long interval); @@ -210,6 +212,7 @@ extern bool filter_mce(struct mce *m); #ifdef CONFIG_X86_MCE_AMD extern bool amd_filter_mce(struct mce *m); +bool amd_mce_usable_address(struct mce *m); /* * If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits @@ -237,6 +240,7 @@ static __always_inline void smca_extract_err_addr(struct mce *m) #else static inline bool amd_filter_mce(struct mce *m) { return false; } +static inline bool amd_mce_usable_address(struct mce *m) { return false; } static inline void smca_extract_err_addr(struct mce *m) { } #endif diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 31c0e68f6227..e65fae63660e 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -20,13 +20,13 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c, unsigned int cpu) { #ifdef CONFIG_SMP - seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); + seq_printf(m, "physical id\t: %d\n", c->topo.pkg_id); seq_printf(m, "siblings\t: %d\n", cpumask_weight(topology_core_cpumask(cpu))); - seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); + seq_printf(m, "core id\t\t: %d\n", c->topo.core_id); seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); - seq_printf(m, "apicid\t\t: %d\n", c->apicid); - seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid); + seq_printf(m, "apicid\t\t: %d\n", c->topo.apicid); + seq_printf(m, "initial apicid\t: %d\n", c->topo.initial_apicid); #endif } diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 030d3b409768..19e0681f0435 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -152,6 +152,7 @@ static inline void cache_alloc_hsw_probe(void) r->cache.cbm_len = 20; r->cache.shareable_bits = 0xc0000; r->cache.min_cbm_bits = 2; + r->cache.arch_has_sparse_bitmasks = false; r->alloc_capable = true; rdt_alloc_capable = true; @@ -267,15 +268,18 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); union cpuid_0x10_1_eax eax; + union cpuid_0x10_x_ecx ecx; union cpuid_0x10_x_edx edx; - u32 ebx, ecx; + u32 ebx; - cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full); + cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx.full, &edx.full); hw_res->num_closid = edx.split.cos_max + 1; r->cache.cbm_len = eax.split.cbm_len + 1; r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; r->cache.shareable_bits = ebx & r->default_ctrl; r->data_width = (r->cache.cbm_len + 3) / 4; + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + r->cache.arch_has_sparse_bitmasks = ecx.split.noncont; r->alloc_capable = true; } @@ -872,7 +876,6 @@ static __init void rdt_init_res_defs_intel(void) if (r->rid == RDT_RESOURCE_L3 || r->rid == RDT_RESOURCE_L2) { - r->cache.arch_has_sparse_bitmaps = false; r->cache.arch_has_per_cpu_cfg = false; r->cache.min_cbm_bits = 1; } else if (r->rid == RDT_RESOURCE_MBA) { @@ -892,7 +895,7 @@ static __init void rdt_init_res_defs_amd(void) if (r->rid == RDT_RESOURCE_L3 || r->rid == RDT_RESOURCE_L2) { - r->cache.arch_has_sparse_bitmaps = true; + r->cache.arch_has_sparse_bitmasks = true; r->cache.arch_has_per_cpu_cfg = true; r->cache.min_cbm_bits = 0; } else if (r->rid == RDT_RESOURCE_MBA) { diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index b44c487727d4..beccb0e87ba7 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -87,10 +87,12 @@ int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, /* * Check whether a cache bit mask is valid. - * For Intel the SDM says: - * Please note that all (and only) contiguous '1' combinations - * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.). - * Additionally Haswell requires at least two bits set. + * On Intel CPUs, non-contiguous 1s value support is indicated by CPUID: + * - CPUID.0x10.1:ECX[3]: L3 non-contiguous 1s value supported if 1 + * - CPUID.0x10.2:ECX[3]: L2 non-contiguous 1s value supported if 1 + * + * Haswell does not support a non-contiguous 1s value and additionally + * requires at least two bits set. * AMD allows non-contiguous bitmasks. */ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) @@ -113,8 +115,8 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) first_bit = find_first_bit(&val, cbm_len); zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - /* Are non-contiguous bitmaps allowed? */ - if (!r->cache.arch_has_sparse_bitmaps && + /* Are non-contiguous bitmasks allowed? */ + if (!r->cache.arch_has_sparse_bitmasks && (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) { rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val); return false; diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 85ceaf9a31ac..a4f1aa15f0a2 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -59,6 +59,7 @@ struct rdt_fs_context { bool enable_cdpl2; bool enable_cdpl3; bool enable_mba_mbps; + bool enable_debug; }; static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) @@ -243,18 +244,17 @@ struct rdtgroup { */ #define RFTYPE_INFO BIT(0) #define RFTYPE_BASE BIT(1) -#define RF_CTRLSHIFT 4 -#define RF_MONSHIFT 5 -#define RF_TOPSHIFT 6 -#define RFTYPE_CTRL BIT(RF_CTRLSHIFT) -#define RFTYPE_MON BIT(RF_MONSHIFT) -#define RFTYPE_TOP BIT(RF_TOPSHIFT) +#define RFTYPE_CTRL BIT(4) +#define RFTYPE_MON BIT(5) +#define RFTYPE_TOP BIT(6) #define RFTYPE_RES_CACHE BIT(8) #define RFTYPE_RES_MB BIT(9) -#define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) -#define RF_MON_INFO (RFTYPE_INFO | RFTYPE_MON) -#define RF_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) -#define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) +#define RFTYPE_DEBUG BIT(10) +#define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) +#define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) +#define RFTYPE_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) +#define RFTYPE_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) +#define RFTYPE_MON_BASE (RFTYPE_BASE | RFTYPE_MON) /* List of all resource groups */ extern struct list_head rdt_all_groups; @@ -270,7 +270,7 @@ void __exit rdtgroup_exit(void); * @mode: Access mode * @kf_ops: File operations * @flags: File specific RFTYPE_FLAGS_* flags - * @fflags: File specific RF_* or RFTYPE_* flags + * @fflags: File specific RFTYPE_* flags * @seq_show: Show content of the file * @write: Write to the file */ @@ -492,6 +492,15 @@ union cpuid_0x10_3_eax { unsigned int full; }; +/* CPUID.(EAX=10H, ECX=ResID).ECX */ +union cpuid_0x10_x_ecx { + struct { + unsigned int reserved:3; + unsigned int noncont:1; + } split; + unsigned int full; +}; + /* CPUID.(EAX=10H, ECX=ResID).EDX */ union cpuid_0x10_x_edx { struct { diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index ded1fc7cb7cb..f136ac046851 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -30,15 +30,15 @@ struct rmid_entry { struct list_head list; }; -/** - * @rmid_free_lru A least recently used list of free RMIDs +/* + * @rmid_free_lru - A least recently used list of free RMIDs * These RMIDs are guaranteed to have an occupancy less than the * threshold occupancy */ static LIST_HEAD(rmid_free_lru); -/** - * @rmid_limbo_count count of currently unused but (potentially) +/* + * @rmid_limbo_count - count of currently unused but (potentially) * dirty RMIDs. * This counts RMIDs that no one is currently using but that * may have a occupancy value > resctrl_rmid_realloc_threshold. User can @@ -46,7 +46,7 @@ static LIST_HEAD(rmid_free_lru); */ static unsigned int rmid_limbo_count; -/** +/* * @rmid_entry - The entry in the limbo and free lists. */ static struct rmid_entry *rmid_ptrs; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 725344048f85..69a1de92384a 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -54,8 +54,13 @@ static struct kernfs_node *kn_mondata; static struct seq_buf last_cmd_status; static char last_cmd_status_buf[512]; +static int rdtgroup_setup_root(struct rdt_fs_context *ctx); +static void rdtgroup_destroy_root(void); + struct dentry *debugfs_resctrl; +static bool resctrl_debug; + void rdt_last_cmd_clear(void) { lockdep_assert_held(&rdtgroup_mutex); @@ -696,11 +701,10 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct rdtgroup *rdtgrp; + char *pid_str; int ret = 0; pid_t pid; - if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) - return -EINVAL; rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { rdtgroup_kn_unlock(of->kn); @@ -715,7 +719,27 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, goto unlock; } - ret = rdtgroup_move_task(pid, rdtgrp, of); + while (buf && buf[0] != '\0' && buf[0] != '\n') { + pid_str = strim(strsep(&buf, ",")); + + if (kstrtoint(pid_str, 0, &pid)) { + rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str); + ret = -EINVAL; + break; + } + + if (pid < 0) { + rdt_last_cmd_printf("Invalid pid %d\n", pid); + ret = -EINVAL; + break; + } + + ret = rdtgroup_move_task(pid, rdtgrp, of); + if (ret) { + rdt_last_cmd_printf("Error while processing task %d\n", pid); + break; + } + } unlock: rdtgroup_kn_unlock(of->kn); @@ -755,6 +779,38 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of, return ret; } +static int rdtgroup_closid_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) + seq_printf(s, "%u\n", rdtgrp->closid); + else + ret = -ENOENT; + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +static int rdtgroup_rmid_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) + seq_printf(s, "%u\n", rdtgrp->mon.rmid); + else + ret = -ENOENT; + rdtgroup_kn_unlock(of->kn); + + return ret; +} + #ifdef CONFIG_PROC_CPU_RESCTRL /* @@ -895,7 +951,7 @@ static int rdt_shareable_bits_show(struct kernfs_open_file *of, return 0; } -/** +/* * rdt_bit_usage_show - Display current usage of resources * * A domain is a shared resource that can now be allocated differently. Here @@ -1117,12 +1173,24 @@ static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type) } } +static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; + + seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks); + + return 0; +} + /** * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other * @r: Resource to which domain instance @d belongs. * @d: The domain instance for which @closid is being tested. * @cbm: Capacity bitmask being tested. * @closid: Intended closid for @cbm. + * @type: CDP type of @r. * @exclusive: Only check if overlaps with exclusive resource groups * * Checks if provided @cbm intended to be used for @closid on domain @@ -1209,6 +1277,7 @@ bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d, /** * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive + * @rdtgrp: Resource group identified through its closid. * * An exclusive resource group implies that there should be no sharing of * its allocated resources. At the time this group is considered to be @@ -1251,9 +1320,8 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) return true; } -/** +/* * rdtgroup_mode_write - Modify the resource group's mode - * */ static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) @@ -1357,12 +1425,11 @@ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, return size; } -/** +/* * rdtgroup_size_show - Display size in bytes of allocated regions * * The "size" file mirrors the layout of the "schemata" file, printing the * size in bytes of each region instead of the capacity bitmask. - * */ static int rdtgroup_size_show(struct kernfs_open_file *of, struct seq_file *s, void *v) @@ -1686,77 +1753,77 @@ static struct rftype res_common_files[] = { .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_last_cmd_status_show, - .fflags = RF_TOP_INFO, + .fflags = RFTYPE_TOP_INFO, }, { .name = "num_closids", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_num_closids_show, - .fflags = RF_CTRL_INFO, + .fflags = RFTYPE_CTRL_INFO, }, { .name = "mon_features", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_mon_features_show, - .fflags = RF_MON_INFO, + .fflags = RFTYPE_MON_INFO, }, { .name = "num_rmids", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_num_rmids_show, - .fflags = RF_MON_INFO, + .fflags = RFTYPE_MON_INFO, }, { .name = "cbm_mask", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_default_ctrl_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "min_cbm_bits", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_min_cbm_bits_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "shareable_bits", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_shareable_bits_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "bit_usage", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_bit_usage_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "min_bandwidth", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_min_bw_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, }, { .name = "bandwidth_gran", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_bw_gran_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, }, { .name = "delay_linear", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_delay_linear_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, }, /* * Platform specific which (if any) capabilities are provided by @@ -1775,7 +1842,7 @@ static struct rftype res_common_files[] = { .kf_ops = &rdtgroup_kf_single_ops, .write = max_threshold_occ_write, .seq_show = max_threshold_occ_show, - .fflags = RF_MON_INFO | RFTYPE_RES_CACHE, + .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE, }, { .name = "mbm_total_bytes_config", @@ -1817,12 +1884,19 @@ static struct rftype res_common_files[] = { .fflags = RFTYPE_BASE, }, { + .name = "mon_hw_id", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdtgroup_rmid_show, + .fflags = RFTYPE_MON_BASE | RFTYPE_DEBUG, + }, + { .name = "schemata", .mode = 0644, .kf_ops = &rdtgroup_kf_single_ops, .write = rdtgroup_schemata_write, .seq_show = rdtgroup_schemata_show, - .fflags = RF_CTRL_BASE, + .fflags = RFTYPE_CTRL_BASE, }, { .name = "mode", @@ -1830,14 +1904,28 @@ static struct rftype res_common_files[] = { .kf_ops = &rdtgroup_kf_single_ops, .write = rdtgroup_mode_write, .seq_show = rdtgroup_mode_show, - .fflags = RF_CTRL_BASE, + .fflags = RFTYPE_CTRL_BASE, }, { .name = "size", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdtgroup_size_show, - .fflags = RF_CTRL_BASE, + .fflags = RFTYPE_CTRL_BASE, + }, + { + .name = "sparse_masks", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_has_sparse_bitmasks_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "ctrl_hw_id", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdtgroup_closid_show, + .fflags = RFTYPE_CTRL_BASE | RFTYPE_DEBUG, }, }; @@ -1852,6 +1940,9 @@ static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) lockdep_assert_held(&rdtgroup_mutex); + if (resctrl_debug) + fflags |= RFTYPE_DEBUG; + for (rft = rfts; rft < rfts + len; rft++) { if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) { ret = rdtgroup_add_file(kn, rft); @@ -1894,7 +1985,7 @@ void __init thread_throttle_mode_init(void) if (!rft) return; - rft->fflags = RF_CTRL_INFO | RFTYPE_RES_MB; + rft->fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB; } void __init mbm_config_rftype_init(const char *config) @@ -1903,7 +1994,7 @@ void __init mbm_config_rftype_init(const char *config) rft = rdtgroup_get_rftype_by_name(config); if (rft) - rft->fflags = RF_MON_INFO | RFTYPE_RES_CACHE; + rft->fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE; } /** @@ -2038,21 +2129,21 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) if (IS_ERR(kn_info)) return PTR_ERR(kn_info); - ret = rdtgroup_add_files(kn_info, RF_TOP_INFO); + ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO); if (ret) goto out_destroy; /* loop over enabled controls, these are all alloc_capable */ list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; - fflags = r->fflags | RF_CTRL_INFO; + fflags = r->fflags | RFTYPE_CTRL_INFO; ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags); if (ret) goto out_destroy; } for_each_mon_capable_rdt_resource(r) { - fflags = r->fflags | RF_MON_INFO; + fflags = r->fflags | RFTYPE_MON_INFO; sprintf(name, "%s_MON", r->name); ret = rdtgroup_mkdir_info_resdir(r, name, fflags); if (ret) @@ -2271,14 +2362,6 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable) return 0; } -static void cdp_disable_all(void) -{ - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3)) - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); -} - /* * We don't allow rdtgroup directories to be created anywhere * except the root directory. Thus when looking for the rdtgroup @@ -2358,19 +2441,47 @@ static int mkdir_mondata_all(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, struct kernfs_node **mon_data_kn); +static void rdt_disable_ctx(void) +{ + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); + set_mba_sc(false); + + resctrl_debug = false; +} + static int rdt_enable_ctx(struct rdt_fs_context *ctx) { int ret = 0; - if (ctx->enable_cdpl2) + if (ctx->enable_cdpl2) { ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true); + if (ret) + goto out_done; + } - if (!ret && ctx->enable_cdpl3) + if (ctx->enable_cdpl3) { ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true); + if (ret) + goto out_cdpl2; + } - if (!ret && ctx->enable_mba_mbps) + if (ctx->enable_mba_mbps) { ret = set_mba_sc(true); + if (ret) + goto out_cdpl3; + } + + if (ctx->enable_debug) + resctrl_debug = true; + return 0; + +out_cdpl3: + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); +out_cdpl2: + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); +out_done: return ret; } @@ -2463,6 +2574,7 @@ static void schemata_list_destroy(void) static int rdt_get_tree(struct fs_context *fc) { struct rdt_fs_context *ctx = rdt_fc2context(fc); + unsigned long flags = RFTYPE_CTRL_BASE; struct rdt_domain *dom; struct rdt_resource *r; int ret; @@ -2477,18 +2589,31 @@ static int rdt_get_tree(struct fs_context *fc) goto out; } + ret = rdtgroup_setup_root(ctx); + if (ret) + goto out; + ret = rdt_enable_ctx(ctx); - if (ret < 0) - goto out_cdp; + if (ret) + goto out_root; ret = schemata_list_create(); if (ret) { schemata_list_destroy(); - goto out_mba; + goto out_ctx; } closid_init(); + if (rdt_mon_capable) + flags |= RFTYPE_MON; + + ret = rdtgroup_add_files(rdtgroup_default.kn, flags); + if (ret) + goto out_schemata_free; + + kernfs_activate(rdtgroup_default.kn); + ret = rdtgroup_create_info_dir(rdtgroup_default.kn); if (ret < 0) goto out_schemata_free; @@ -2543,11 +2668,10 @@ out_info: kernfs_remove(kn_info); out_schemata_free: schemata_list_destroy(); -out_mba: - if (ctx->enable_mba_mbps) - set_mba_sc(false); -out_cdp: - cdp_disable_all(); +out_ctx: + rdt_disable_ctx(); +out_root: + rdtgroup_destroy_root(); out: rdt_last_cmd_clear(); mutex_unlock(&rdtgroup_mutex); @@ -2559,6 +2683,7 @@ enum rdt_param { Opt_cdp, Opt_cdpl2, Opt_mba_mbps, + Opt_debug, nr__rdt_params }; @@ -2566,6 +2691,7 @@ static const struct fs_parameter_spec rdt_fs_parameters[] = { fsparam_flag("cdp", Opt_cdp), fsparam_flag("cdpl2", Opt_cdpl2), fsparam_flag("mba_MBps", Opt_mba_mbps), + fsparam_flag("debug", Opt_debug), {} }; @@ -2591,6 +2717,9 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) return -EINVAL; ctx->enable_mba_mbps = true; return 0; + case Opt_debug: + ctx->enable_debug = true; + return 0; } return -EINVAL; @@ -2618,7 +2747,6 @@ static int rdt_init_fs_context(struct fs_context *fc) if (!ctx) return -ENOMEM; - ctx->kfc.root = rdt_root; ctx->kfc.magic = RDTGROUP_SUPER_MAGIC; fc->fs_private = &ctx->kfc; fc->ops = &rdt_fs_context_ops; @@ -2779,16 +2907,16 @@ static void rdt_kill_sb(struct super_block *sb) cpus_read_lock(); mutex_lock(&rdtgroup_mutex); - set_mba_sc(false); + rdt_disable_ctx(); /*Put everything back to default values. */ for_each_alloc_capable_rdt_resource(r) reset_all_ctrls(r); - cdp_disable_all(); rmdir_all_sub(); rdt_pseudo_lock_release(); rdtgroup_default.mode = RDT_MODE_SHAREABLE; schemata_list_destroy(); + rdtgroup_destroy_root(); static_branch_disable_cpuslocked(&rdt_alloc_enable_key); static_branch_disable_cpuslocked(&rdt_mon_enable_key); static_branch_disable_cpuslocked(&rdt_enable_key); @@ -3170,8 +3298,8 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, enum rdt_group_type rtype, struct rdtgroup **r) { struct rdtgroup *prdtgrp, *rdtgrp; + unsigned long files = 0; struct kernfs_node *kn; - uint files = 0; int ret; prdtgrp = rdtgroup_kn_lock_live(parent_kn); @@ -3223,7 +3351,14 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, goto out_destroy; } - files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype); + if (rtype == RDTCTRL_GROUP) { + files = RFTYPE_BASE | RFTYPE_CTRL; + if (rdt_mon_capable) + files |= RFTYPE_MON; + } else { + files = RFTYPE_BASE | RFTYPE_MON; + } + ret = rdtgroup_add_files(kn, files); if (ret) { rdt_last_cmd_puts("kernfs fill error\n"); @@ -3656,6 +3791,9 @@ static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl)) seq_puts(seq, ",mba_MBps"); + if (resctrl_debug) + seq_puts(seq, ",debug"); + return 0; } @@ -3666,10 +3804,8 @@ static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { .show_options = rdtgroup_show_options, }; -static int __init rdtgroup_setup_root(void) +static int rdtgroup_setup_root(struct rdt_fs_context *ctx) { - int ret; - rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, KERNFS_ROOT_CREATE_DEACTIVATED | KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, @@ -3677,6 +3813,20 @@ static int __init rdtgroup_setup_root(void) if (IS_ERR(rdt_root)) return PTR_ERR(rdt_root); + ctx->kfc.root = rdt_root; + rdtgroup_default.kn = kernfs_root_to_node(rdt_root); + + return 0; +} + +static void rdtgroup_destroy_root(void) +{ + kernfs_destroy_root(rdt_root); + rdtgroup_default.kn = NULL; +} + +static void __init rdtgroup_setup_default(void) +{ mutex_lock(&rdtgroup_mutex); rdtgroup_default.closid = 0; @@ -3686,19 +3836,7 @@ static int __init rdtgroup_setup_root(void) list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); - ret = rdtgroup_add_files(kernfs_root_to_node(rdt_root), RF_CTRL_BASE); - if (ret) { - kernfs_destroy_root(rdt_root); - goto out; - } - - rdtgroup_default.kn = kernfs_root_to_node(rdt_root); - kernfs_activate(rdtgroup_default.kn); - -out: mutex_unlock(&rdtgroup_mutex); - - return ret; } static void domain_destroy_mon_state(struct rdt_domain *d) @@ -3820,13 +3958,11 @@ int __init rdtgroup_init(void) seq_buf_init(&last_cmd_status, last_cmd_status_buf, sizeof(last_cmd_status_buf)); - ret = rdtgroup_setup_root(); - if (ret) - return ret; + rdtgroup_setup_default(); ret = sysfs_create_mount_point(fs_kobj, "resctrl"); if (ret) - goto cleanup_root; + return ret; ret = register_filesystem(&rdt_fs_type); if (ret) @@ -3859,8 +3995,6 @@ int __init rdtgroup_init(void) cleanup_mountpoint: sysfs_remove_mount_point(fs_kobj, "resctrl"); -cleanup_root: - kernfs_destroy_root(rdt_root); return ret; } @@ -3870,5 +4004,4 @@ void __exit rdtgroup_exit(void) debugfs_remove_recursive(debugfs_resctrl); unregister_filesystem(&rdt_fs_type); sysfs_remove_mount_point(fs_kobj, "resctrl"); - kernfs_destroy_root(rdt_root); } diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 0270925fe013..dc136703566f 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -78,7 +78,7 @@ int detect_extended_topology_early(struct cpuinfo_x86 *c) /* * initial apic id, which also represents 32-bit extended x2apic id. */ - c->initial_apicid = edx; + c->topo.initial_apicid = edx; smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx)); #endif return 0; @@ -108,7 +108,7 @@ int detect_extended_topology(struct cpuinfo_x86 *c) * Populate HT related information from sub-leaf level 0. */ cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx); - c->initial_apicid = edx; + c->topo.initial_apicid = edx; core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx)); core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); @@ -146,20 +146,19 @@ int detect_extended_topology(struct cpuinfo_x86 *c) die_select_mask = (~(-1 << die_plus_mask_width)) >> core_plus_mask_width; - c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, + c->topo.core_id = apic->phys_pkg_id(c->topo.initial_apicid, ht_mask_width) & core_select_mask; if (die_level_present) { - c->cpu_die_id = apic->phys_pkg_id(c->initial_apicid, + c->topo.die_id = apic->phys_pkg_id(c->topo.initial_apicid, core_plus_mask_width) & die_select_mask; } - c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, - pkg_mask_width); + c->topo.pkg_id = apic->phys_pkg_id(c->topo.initial_apicid, pkg_mask_width); /* * Reinit the apicid, now that we have extended initial_apicid. */ - c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); + c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0); c->x86_max_cores = (core_level_siblings / smp_num_siblings); __max_die_per_package = (die_level_siblings / core_level_siblings); diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c index 05fa4ef63490..415564a6523b 100644 --- a/arch/x86/kernel/cpu/zhaoxin.c +++ b/arch/x86/kernel/cpu/zhaoxin.c @@ -65,20 +65,6 @@ static void early_init_zhaoxin(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); } - - if (c->cpuid_level >= 0x00000001) { - u32 eax, ebx, ecx, edx; - - cpuid(0x00000001, &eax, &ebx, &ecx, &edx); - /* - * If HTT (EDX[28]) is set EBX[16:23] contain the number of - * apicids which are reserved per package. Store the resulting - * shift value for the package management code. - */ - if (edx & (1U << 28)) - c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff); - } - } static void init_zhaoxin(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 87d38f17ff5c..afd09924094e 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -278,7 +278,7 @@ static void __init dtb_apic_setup(void) } #ifdef CONFIG_OF_EARLY_FLATTREE -static void __init x86_flattree_get_config(void) +void __init x86_flattree_get_config(void) { u32 size, map_len; void *dt; @@ -300,14 +300,10 @@ static void __init x86_flattree_get_config(void) unflatten_and_copy_device_tree(); early_memunmap(dt, map_len); } -#else -static inline void x86_flattree_get_config(void) { } #endif void __init x86_dtb_init(void) { - x86_flattree_get_config(); - if (!of_have_populated_dt()) return; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index a86d37052a64..a21a4d0ecc34 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -369,14 +369,15 @@ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate); void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, - unsigned int size, u32 pkru) + unsigned int size, u64 xfeatures, u32 pkru) { struct fpstate *kstate = gfpu->fpstate; union fpregs_state *ustate = buf; struct membuf mb = { .p = buf, .left = size }; if (cpu_feature_enabled(X86_FEATURE_XSAVE)) { - __copy_xstate_to_uabi_buf(mb, kstate, pkru, XSTATE_COPY_XSAVE); + __copy_xstate_to_uabi_buf(mb, kstate, xfeatures, pkru, + XSTATE_COPY_XSAVE); } else { memcpy(&ustate->fxsave, &kstate->regs.fxsave, sizeof(ustate->fxsave)); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index cadf68737e6b..117e74c44e75 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1049,6 +1049,7 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer * @to: membuf descriptor * @fpstate: The fpstate buffer from which to copy + * @xfeatures: The mask of xfeatures to save (XSAVE mode only) * @pkru_val: The PKRU value to store in the PKRU component * @copy_mode: The requested copy mode * @@ -1059,7 +1060,8 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, * It supports partial copy but @to.pos always starts from zero. */ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, - u32 pkru_val, enum xstate_copy_mode copy_mode) + u64 xfeatures, u32 pkru_val, + enum xstate_copy_mode copy_mode) { const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); struct xregs_state *xinit = &init_fpstate.regs.xsave; @@ -1083,7 +1085,7 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, break; case XSTATE_COPY_XSAVE: - header.xfeatures &= fpstate->user_xfeatures; + header.xfeatures &= fpstate->user_xfeatures & xfeatures; break; } @@ -1185,6 +1187,7 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode copy_mode) { __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate, + tsk->thread.fpu.fpstate->user_xfeatures, tsk->thread.pkru, copy_mode); } @@ -1536,10 +1539,7 @@ static int fpstate_realloc(u64 xfeatures, unsigned int ksize, fpregs_restore_userregs(); newfps->xfeatures = curfps->xfeatures | xfeatures; - - if (!guest_fpu) - newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; - + newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; newfps->xfd = curfps->xfd & ~xfeatures; /* Do the final updates within the locked region */ @@ -1736,7 +1736,6 @@ EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm); /** * fpu_xstate_prctl - xstate permission operations - * @tsk: Redundant pointer to current * @option: A subfunction of arch_prctl() * @arg2: option argument * Return: 0 if successful; otherwise, an error code diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index a4ecb04d8d64..3518fb26d06b 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -43,7 +43,8 @@ enum xstate_copy_mode { struct membuf; extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, - u32 pkru_val, enum xstate_copy_mode copy_mode); + u64 xfeatures, u32 pkru_val, + enum xstate_copy_mode copy_mode); extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode mode); extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru); diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index 24c1175a47e2..58d9ed50fe61 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -3,10 +3,10 @@ * Copyright (C) 2017 Steven Rostedt, VMware Inc. */ +#include <linux/export.h> #include <linux/linkage.h> #include <asm/page_types.h> #include <asm/segment.h> -#include <asm/export.h> #include <asm/ftrace.h> #include <asm/nospec-branch.h> #include <asm/frame.h> diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 945cfa5f7239..214f30e9f0c0 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -3,12 +3,12 @@ * Copyright (C) 2014 Steven Rostedt, Red Hat Inc */ +#include <linux/export.h> #include <linux/cfi_types.h> #include <linux/linkage.h> #include <asm/asm-offsets.h> #include <asm/ptrace.h> #include <asm/ftrace.h> -#include <asm/export.h> #include <asm/nospec-branch.h> #include <asm/unwind_hints.h> #include <asm/frame.h> diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 49f7629b17f7..05a110c97111 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -41,6 +41,7 @@ #include <asm/trapnr.h> #include <asm/sev.h> #include <asm/tdx.h> +#include <asm/init.h> /* * Manage page tables very early on. @@ -69,7 +70,7 @@ EXPORT_SYMBOL(vmemmap_base); /* * GDT used on the boot CPU before switching to virtual addresses. */ -static struct desc_struct startup_gdt[GDT_ENTRIES] = { +static struct desc_struct startup_gdt[GDT_ENTRIES] __initdata = { [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), @@ -79,13 +80,11 @@ static struct desc_struct startup_gdt[GDT_ENTRIES] = { * Address needs to be set at runtime because it references the startup_gdt * while the kernel still uses a direct mapping. */ -static struct desc_ptr startup_gdt_descr = { - .size = sizeof(startup_gdt), +static struct desc_ptr startup_gdt_descr __initdata = { + .size = sizeof(startup_gdt)-1, .address = 0, }; -#define __head __section(".head.text") - static void __head *fixup_pointer(void *ptr, unsigned long physaddr) { return ptr - (void *)_text + (void *)physaddr; @@ -211,7 +210,7 @@ unsigned long __head __startup_64(unsigned long physaddr, /* Fixup the physical addresses in the page table */ - pgd = fixup_pointer(&early_top_pgt, physaddr); + pgd = fixup_pointer(early_top_pgt, physaddr); p = pgd + pgd_index(__START_KERNEL_map); if (la57) *p = (unsigned long)level4_kernel_pgt; @@ -220,11 +219,11 @@ unsigned long __head __startup_64(unsigned long physaddr, *p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta; if (la57) { - p4d = fixup_pointer(&level4_kernel_pgt, physaddr); + p4d = fixup_pointer(level4_kernel_pgt, physaddr); p4d[511] += load_delta; } - pud = fixup_pointer(&level3_kernel_pgt, physaddr); + pud = fixup_pointer(level3_kernel_pgt, physaddr); pud[510] += load_delta; pud[511] += load_delta; @@ -588,7 +587,7 @@ static void set_bringup_idt_handler(gate_desc *idt, int n, void *handler) } /* This runs while still in the direct mapping */ -static void startup_64_load_idt(unsigned long physbase) +static void __head startup_64_load_idt(unsigned long physbase) { struct desc_ptr *desc = fixup_pointer(&bringup_idt_descr, physbase); gate_desc *idt = fixup_pointer(bringup_idt_table, physbase); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index c9318993f959..b6554212b7c7 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -8,6 +8,7 @@ */ .text +#include <linux/export.h> #include <linux/threads.h> #include <linux/init.h> #include <linux/linkage.h> @@ -25,7 +26,6 @@ #include <asm/nops.h> #include <asm/nospec-branch.h> #include <asm/bootparam.h> -#include <asm/export.h> #include <asm/pgtable_32.h> /* Physical address */ diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index ea6995920b7a..086a2c3aaaa0 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -9,7 +9,7 @@ * Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com> */ - +#include <linux/export.h> #include <linux/linkage.h> #include <linux/threads.h> #include <linux/init.h> @@ -22,7 +22,6 @@ #include <asm/percpu.h> #include <asm/nops.h> #include "../entry/calling.h" -#include <asm/export.h> #include <asm/nospec-branch.h> #include <asm/apicdef.h> #include <asm/fixmap.h> @@ -180,8 +179,8 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) movl $0, %ecx #endif - /* Enable PAE mode, PGE and LA57 */ - orl $(X86_CR4_PAE | X86_CR4_PGE), %ecx + /* Enable PAE mode, PSE, PGE and LA57 */ + orl $(X86_CR4_PAE | X86_CR4_PSE | X86_CR4_PGE), %ecx #ifdef CONFIG_X86_5LEVEL testl $1, __pgtable_l5_enabled(%rip) jz 1f diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 1648aa0204d9..41eecf180b7f 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -52,7 +52,7 @@ unsigned long hpet_address; u8 hpet_blockid; /* OS timer block num */ bool hpet_msi_disable; -#ifdef CONFIG_GENERIC_MSI_IRQ +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_GENERIC_MSI_IRQ) static DEFINE_PER_CPU(struct hpet_channel *, cpu_hpet_channel); static struct irq_domain *hpet_domain; #endif @@ -469,7 +469,7 @@ static void __init hpet_legacy_clockevent_register(struct hpet_channel *hc) /* * HPET MSI Support */ -#ifdef CONFIG_GENERIC_MSI_IRQ +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_GENERIC_MSI_IRQ) static void hpet_msi_unmask(struct irq_data *data) { struct hpet_channel *hc = irq_data_get_irq_handler_data(data); diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 30a55207c000..c20d1832c481 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -32,6 +32,7 @@ */ static void init_8259A(int auto_eoi); +static bool pcat_compat __ro_after_init; static int i8259A_auto_eoi; DEFINE_RAW_SPINLOCK(i8259A_lock); @@ -299,15 +300,32 @@ static void unmask_8259A(void) static int probe_8259A(void) { + unsigned char new_val, probe_val = ~(1 << PIC_CASCADE_IR); unsigned long flags; - unsigned char probe_val = ~(1 << PIC_CASCADE_IR); - unsigned char new_val; + + /* + * If MADT has the PCAT_COMPAT flag set, then do not bother probing + * for the PIC. Some BIOSes leave the PIC uninitialized and probing + * fails. + * + * Right now this causes problems as quite some code depends on + * nr_legacy_irqs() > 0 or has_legacy_pic() == true. This is silly + * when the system has an IO/APIC because then PIC is not required + * at all, except for really old machines where the timer interrupt + * must be routed through the PIC. So just pretend that the PIC is + * there and let legacy_pic->init() initialize it for nothing. + * + * Alternatively this could just try to initialize the PIC and + * repeat the probe, but for cases where there is no PIC that's + * just pointless. + */ + if (pcat_compat) + return nr_legacy_irqs(); + /* - * Check to see if we have a PIC. - * Mask all except the cascade and read - * back the value we just wrote. If we don't - * have a PIC, we will read 0xff as opposed to the - * value we wrote. + * Check to see if we have a PIC. Mask all except the cascade and + * read back the value we just wrote. If we don't have a PIC, we + * will read 0xff as opposed to the value we wrote. */ raw_spin_lock_irqsave(&i8259A_lock, flags); @@ -429,5 +447,9 @@ static int __init i8259A_init_ops(void) return 0; } - device_initcall(i8259A_init_ops); + +void __init legacy_pic_pcat_compat(void) +{ + pcat_compat = true; +} diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index b786d48f5a0f..8857abc706e4 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -10,6 +10,7 @@ #include <asm/proto.h> #include <asm/desc.h> #include <asm/hw_irq.h> +#include <asm/ia32.h> #include <asm/idtentry.h> #define DPL0 0x0 @@ -116,6 +117,9 @@ static const __initconst struct idt_data def_idts[] = { #endif SYSG(X86_TRAP_OF, asm_exc_overflow), +}; + +static const struct idt_data ia32_idt[] __initconst = { #if defined(CONFIG_IA32_EMULATION) SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat), #elif defined(CONFIG_X86_32) @@ -225,6 +229,9 @@ void __init idt_setup_early_traps(void) void __init idt_setup_traps(void) { idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts), true); + + if (ia32_enabled()) + idt_setup_from_table(idt_table, ia32_idt, ARRAY_SIZE(ia32_idt), true); } #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S index aaf9e776f323..7f542a7799cb 100644 --- a/arch/x86/kernel/irqflags.S +++ b/arch/x86/kernel/irqflags.S @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <asm/asm.h> -#include <asm/export.h> +#include <linux/export.h> #include <linux/linkage.h> /* diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index ee4fe8cdb857..9a7c03d47861 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -74,7 +74,6 @@ static struct ctl_table itmt_kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static struct ctl_table_header *itmt_sysctl_header; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index b8ab9ee5896c..0ddb3bd0f1aa 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -500,13 +500,13 @@ static bool pv_sched_yield_supported(void) static void __send_ipi_mask(const struct cpumask *mask, int vector) { unsigned long flags; - int cpu, apic_id, icr; - int min = 0, max = 0; + int cpu, min = 0, max = 0; #ifdef CONFIG_X86_64 __uint128_t ipi_bitmap = 0; #else u64 ipi_bitmap = 0; #endif + u32 apic_id, icr; long ret; if (cpumask_empty(mask)) @@ -1028,8 +1028,8 @@ arch_initcall(activate_jump_labels); /* Kick a cpu by its apicid. Used to wake up a halted vcpu */ static void kvm_kick_cpu(int cpu) { - int apicid; unsigned long flags = 0; + u32 apicid; apicid = per_cpu(x86_cpu_to_apicid, cpu); kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index a0c551846b35..4766b6bed443 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -507,12 +507,13 @@ DEFINE_IDTENTRY_RAW(exc_nmi) } this_cpu_write(nmi_state, NMI_EXECUTING); this_cpu_write(nmi_cr2, read_cr2()); + +nmi_restart: if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) { WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1); WARN_ON_ONCE(!(nsp->idt_seq & 0x1)); WRITE_ONCE(nsp->recv_jiffies, jiffies); } -nmi_restart: /* * Needs to happen before DR7 is accessed, because the hypervisor can @@ -548,16 +549,16 @@ nmi_restart: if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) write_cr2(this_cpu_read(nmi_cr2)); - if (this_cpu_dec_return(nmi_state)) - goto nmi_restart; - - if (user_mode(regs)) - mds_user_clear_cpu_buffers(); if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) { WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1); WARN_ON_ONCE(nsp->idt_seq & 0x1); WRITE_ONCE(nsp->recv_jiffies, jiffies); } + if (this_cpu_dec_return(nmi_state)) + goto nmi_restart; + + if (user_mode(regs)) + mds_user_clear_cpu_buffers(); } #if IS_ENABLED(CONFIG_KVM_INTEL) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b098b1fa2470..1526747bedf2 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -466,154 +466,29 @@ static void __init memblock_x86_reserve_range_setup_data(void) } } -/* - * --------- Crashkernel reservation ------------------------------ - */ - -/* 16M alignment for crash kernel regions */ -#define CRASH_ALIGN SZ_16M - -/* - * Keep the crash kernel below this limit. - * - * Earlier 32-bits kernels would limit the kernel to the low 512 MB range - * due to mapping restrictions. - * - * 64-bit kdump kernels need to be restricted to be under 64 TB, which is - * the upper limit of system RAM in 4-level paging mode. Since the kdump - * jump could be from 5-level paging to 4-level paging, the jump will fail if - * the kernel is put above 64 TB, and during the 1st kernel bootup there's - * no good way to detect the paging mode of the target kernel which will be - * loaded for dumping. - */ -#ifdef CONFIG_X86_32 -# define CRASH_ADDR_LOW_MAX SZ_512M -# define CRASH_ADDR_HIGH_MAX SZ_512M -#else -# define CRASH_ADDR_LOW_MAX SZ_4G -# define CRASH_ADDR_HIGH_MAX SZ_64T -#endif - -static int __init reserve_crashkernel_low(void) -{ -#ifdef CONFIG_X86_64 - unsigned long long base, low_base = 0, low_size = 0; - unsigned long low_mem_limit; - int ret; - - low_mem_limit = min(memblock_phys_mem_size(), CRASH_ADDR_LOW_MAX); - - /* crashkernel=Y,low */ - ret = parse_crashkernel_low(boot_command_line, low_mem_limit, &low_size, &base); - if (ret) { - /* - * two parts from kernel/dma/swiotlb.c: - * -swiotlb size: user-specified with swiotlb= or default. - * - * -swiotlb overflow buffer: now hardcoded to 32k. We round it - * to 8M for other buffers that may need to stay low too. Also - * make sure we allocate enough extra low memory so that we - * don't run out of DMA buffers for 32-bit devices. - */ - low_size = max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20); - } else { - /* passed with crashkernel=0,low ? */ - if (!low_size) - return 0; - } - - low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX); - if (!low_base) { - pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n", - (unsigned long)(low_size >> 20)); - return -ENOMEM; - } - - pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (low RAM limit: %ldMB)\n", - (unsigned long)(low_size >> 20), - (unsigned long)(low_base >> 20), - (unsigned long)(low_mem_limit >> 20)); - - crashk_low_res.start = low_base; - crashk_low_res.end = low_base + low_size - 1; - insert_resource(&iomem_resource, &crashk_low_res); -#endif - return 0; -} - -static void __init reserve_crashkernel(void) +static void __init arch_reserve_crashkernel(void) { - unsigned long long crash_size, crash_base, total_mem; + unsigned long long crash_base, crash_size, low_size = 0; + char *cmdline = boot_command_line; bool high = false; int ret; if (!IS_ENABLED(CONFIG_KEXEC_CORE)) return; - total_mem = memblock_phys_mem_size(); - - /* crashkernel=XM */ - ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base); - if (ret != 0 || crash_size <= 0) { - /* crashkernel=X,high */ - ret = parse_crashkernel_high(boot_command_line, total_mem, - &crash_size, &crash_base); - if (ret != 0 || crash_size <= 0) - return; - high = true; - } + ret = parse_crashkernel(cmdline, memblock_phys_mem_size(), + &crash_size, &crash_base, + &low_size, &high); + if (ret) + return; if (xen_pv_domain()) { pr_info("Ignoring crashkernel for a Xen PV domain\n"); return; } - /* 0 means: find the address automatically */ - if (!crash_base) { - /* - * Set CRASH_ADDR_LOW_MAX upper bound for crash memory, - * crashkernel=x,high reserves memory over 4G, also allocates - * 256M extra low memory for DMA buffers and swiotlb. - * But the extra memory is not required for all machines. - * So try low memory first and fall back to high memory - * unless "crashkernel=size[KMG],high" is specified. - */ - if (!high) - crash_base = memblock_phys_alloc_range(crash_size, - CRASH_ALIGN, CRASH_ALIGN, - CRASH_ADDR_LOW_MAX); - if (!crash_base) - crash_base = memblock_phys_alloc_range(crash_size, - CRASH_ALIGN, CRASH_ALIGN, - CRASH_ADDR_HIGH_MAX); - if (!crash_base) { - pr_info("crashkernel reservation failed - No suitable area found.\n"); - return; - } - } else { - unsigned long long start; - - start = memblock_phys_alloc_range(crash_size, SZ_1M, crash_base, - crash_base + crash_size); - if (start != crash_base) { - pr_info("crashkernel reservation failed - memory is in use.\n"); - return; - } - } - - if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) { - memblock_phys_free(crash_base, crash_size); - return; - } - - pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n", - (unsigned long)(crash_size >> 20), - (unsigned long)(crash_base >> 20), - (unsigned long)(total_mem >> 20)); - - crashk_res.start = crash_base; - crashk_res.end = crash_base + crash_size - 1; - insert_resource(&iomem_resource, &crashk_res); + reserve_crashkernel_generic(cmdline, crash_size, crash_base, + low_size, high); } static struct resource standard_io_resources[] = { @@ -1120,7 +995,7 @@ void __init setup_arch(char **cmdline_p) * Needs to run after memblock setup because it needs the physical * memory size. */ - sev_setup_arch(); + mem_encrypt_setup_arch(); efi_fake_memmap(); efi_find_mirror(); @@ -1217,6 +1092,8 @@ void __init setup_arch(char **cmdline_p) early_acpi_boot_init(); + x86_flattree_get_config(); + initmem_init(); dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT); @@ -1227,7 +1104,7 @@ void __init setup_arch(char **cmdline_p) * Reserve memory for crash kernel after SRAT is parsed so that it * won't consume hotpluggable memory. */ - reserve_crashkernel(); + arch_reserve_crashkernel(); memblock_find_dma_reserve(); @@ -1290,7 +1167,7 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) if (!efi_enabled(EFI_BOOT) || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) - conswitchp = &vga_con; + vgacon_register_screen(&screen_info); #endif #endif x86_init.oem.banner(); diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 2eabccde94fb..ccb0915e84e1 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -256,7 +256,7 @@ static int __sev_cpuid_hv(u32 fn, int reg_idx, u32 *reg) return 0; } -static int sev_cpuid_hv(struct cpuid_leaf *leaf) +static int __sev_cpuid_hv_msr(struct cpuid_leaf *leaf) { int ret; @@ -279,6 +279,45 @@ static int sev_cpuid_hv(struct cpuid_leaf *leaf) return ret; } +static int __sev_cpuid_hv_ghcb(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) +{ + u32 cr4 = native_read_cr4(); + int ret; + + ghcb_set_rax(ghcb, leaf->fn); + ghcb_set_rcx(ghcb, leaf->subfn); + + if (cr4 & X86_CR4_OSXSAVE) + /* Safe to read xcr0 */ + ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); + else + /* xgetbv will cause #UD - use reset value for xcr0 */ + ghcb_set_xcr0(ghcb, 1); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && + ghcb_rbx_is_valid(ghcb) && + ghcb_rcx_is_valid(ghcb) && + ghcb_rdx_is_valid(ghcb))) + return ES_VMM_ERROR; + + leaf->eax = ghcb->save.rax; + leaf->ebx = ghcb->save.rbx; + leaf->ecx = ghcb->save.rcx; + leaf->edx = ghcb->save.rdx; + + return ES_OK; +} + +static int sev_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) +{ + return ghcb ? __sev_cpuid_hv_ghcb(ghcb, ctxt, leaf) + : __sev_cpuid_hv_msr(leaf); +} + /* * This may be called early while still running on the initial identity * mapping. Use RIP-relative addressing to obtain the correct address @@ -388,19 +427,20 @@ snp_cpuid_get_validated_func(struct cpuid_leaf *leaf) return false; } -static void snp_cpuid_hv(struct cpuid_leaf *leaf) +static void snp_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) { - if (sev_cpuid_hv(leaf)) + if (sev_cpuid_hv(ghcb, ctxt, leaf)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV); } -static int snp_cpuid_postprocess(struct cpuid_leaf *leaf) +static int snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, + struct cpuid_leaf *leaf) { struct cpuid_leaf leaf_hv = *leaf; switch (leaf->fn) { case 0x1: - snp_cpuid_hv(&leaf_hv); + snp_cpuid_hv(ghcb, ctxt, &leaf_hv); /* initial APIC ID */ leaf->ebx = (leaf_hv.ebx & GENMASK(31, 24)) | (leaf->ebx & GENMASK(23, 0)); @@ -419,7 +459,7 @@ static int snp_cpuid_postprocess(struct cpuid_leaf *leaf) break; case 0xB: leaf_hv.subfn = 0; - snp_cpuid_hv(&leaf_hv); + snp_cpuid_hv(ghcb, ctxt, &leaf_hv); /* extended APIC ID */ leaf->edx = leaf_hv.edx; @@ -467,7 +507,7 @@ static int snp_cpuid_postprocess(struct cpuid_leaf *leaf) } break; case 0x8000001E: - snp_cpuid_hv(&leaf_hv); + snp_cpuid_hv(ghcb, ctxt, &leaf_hv); /* extended APIC ID */ leaf->eax = leaf_hv.eax; @@ -488,7 +528,7 @@ static int snp_cpuid_postprocess(struct cpuid_leaf *leaf) * Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value * should be treated as fatal by caller. */ -static int snp_cpuid(struct cpuid_leaf *leaf) +static int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) { const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); @@ -522,7 +562,7 @@ static int snp_cpuid(struct cpuid_leaf *leaf) return 0; } - return snp_cpuid_postprocess(leaf); + return snp_cpuid_postprocess(ghcb, ctxt, leaf); } /* @@ -544,14 +584,14 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) leaf.fn = fn; leaf.subfn = subfn; - ret = snp_cpuid(&leaf); + ret = snp_cpuid(NULL, NULL, &leaf); if (!ret) goto cpuid_done; if (ret != -EOPNOTSUPP) goto fail; - if (sev_cpuid_hv(&leaf)) + if (__sev_cpuid_hv_msr(&leaf)) goto fail; cpuid_done: @@ -592,6 +632,23 @@ fail: sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); } +static enum es_result vc_insn_string_check(struct es_em_ctxt *ctxt, + unsigned long address, + bool write) +{ + if (user_mode(ctxt->regs) && fault_in_kernel_space(address)) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_USER; + ctxt->fi.cr2 = address; + if (write) + ctxt->fi.error_code |= X86_PF_WRITE; + + return ES_EXCEPTION; + } + + return ES_OK; +} + static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, void *src, char *buf, unsigned int data_size, @@ -599,7 +656,12 @@ static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, bool backwards) { int i, b = backwards ? -1 : 1; - enum es_result ret = ES_OK; + unsigned long address = (unsigned long)src; + enum es_result ret; + + ret = vc_insn_string_check(ctxt, address, false); + if (ret != ES_OK) + return ret; for (i = 0; i < count; i++) { void *s = src + (i * data_size * b); @@ -620,7 +682,12 @@ static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt, bool backwards) { int i, s = backwards ? -1 : 1; - enum es_result ret = ES_OK; + unsigned long address = (unsigned long)dst; + enum es_result ret; + + ret = vc_insn_string_check(ctxt, address, true); + if (ret != ES_OK) + return ret; for (i = 0; i < count; i++) { void *d = dst + (i * data_size * s); @@ -656,6 +723,9 @@ static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt, static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) { struct insn *insn = &ctxt->insn; + size_t size; + u64 port; + *exitinfo = 0; switch (insn->opcode.bytes[0]) { @@ -664,7 +734,7 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) case 0x6d: *exitinfo |= IOIO_TYPE_INS; *exitinfo |= IOIO_SEG_ES; - *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + port = ctxt->regs->dx & 0xffff; break; /* OUTS opcodes */ @@ -672,41 +742,43 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) case 0x6f: *exitinfo |= IOIO_TYPE_OUTS; *exitinfo |= IOIO_SEG_DS; - *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + port = ctxt->regs->dx & 0xffff; break; /* IN immediate opcodes */ case 0xe4: case 0xe5: *exitinfo |= IOIO_TYPE_IN; - *exitinfo |= (u8)insn->immediate.value << 16; + port = (u8)insn->immediate.value & 0xffff; break; /* OUT immediate opcodes */ case 0xe6: case 0xe7: *exitinfo |= IOIO_TYPE_OUT; - *exitinfo |= (u8)insn->immediate.value << 16; + port = (u8)insn->immediate.value & 0xffff; break; /* IN register opcodes */ case 0xec: case 0xed: *exitinfo |= IOIO_TYPE_IN; - *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + port = ctxt->regs->dx & 0xffff; break; /* OUT register opcodes */ case 0xee: case 0xef: *exitinfo |= IOIO_TYPE_OUT; - *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + port = ctxt->regs->dx & 0xffff; break; default: return ES_DECODE_FAILED; } + *exitinfo |= port << 16; + switch (insn->opcode.bytes[0]) { case 0x6c: case 0x6e: @@ -716,12 +788,15 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) case 0xee: /* Single byte opcodes */ *exitinfo |= IOIO_DATA_8; + size = 1; break; default: /* Length determined by instruction parsing */ *exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16 : IOIO_DATA_32; + size = (insn->opnd_bytes == 2) ? 2 : 4; } + switch (insn->addr_bytes) { case 2: *exitinfo |= IOIO_ADDR_16; @@ -737,7 +812,7 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) if (insn_has_rep_prefix(insn)) *exitinfo |= IOIO_REP; - return ES_OK; + return vc_ioio_check(ctxt, (u16)port, size); } static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) @@ -848,14 +923,15 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) return ret; } -static int vc_handle_cpuid_snp(struct pt_regs *regs) +static int vc_handle_cpuid_snp(struct ghcb *ghcb, struct es_em_ctxt *ctxt) { + struct pt_regs *regs = ctxt->regs; struct cpuid_leaf leaf; int ret; leaf.fn = regs->ax; leaf.subfn = regs->cx; - ret = snp_cpuid(&leaf); + ret = snp_cpuid(ghcb, ctxt, &leaf); if (!ret) { regs->ax = leaf.eax; regs->bx = leaf.ebx; @@ -874,7 +950,7 @@ static enum es_result vc_handle_cpuid(struct ghcb *ghcb, enum es_result ret; int snp_cpuid_ret; - snp_cpuid_ret = vc_handle_cpuid_snp(regs); + snp_cpuid_ret = vc_handle_cpuid_snp(ghcb, ctxt); if (!snp_cpuid_ret) return ES_OK; if (snp_cpuid_ret != -EOPNOTSUPP) diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 2787826d9f60..70472eebe719 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -524,6 +524,33 @@ static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt return ES_OK; } +static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size) +{ + BUG_ON(size > 4); + + if (user_mode(ctxt->regs)) { + struct thread_struct *t = ¤t->thread; + struct io_bitmap *iobm = t->io_bitmap; + size_t idx; + + if (!iobm) + goto fault; + + for (idx = port; idx < port + size; ++idx) { + if (test_bit(idx, iobm->bitmap)) + goto fault; + } + } + + return ES_OK; + +fault: + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + + return ES_EXCEPTION; +} + /* Include code shared with pre-decompression boot stage */ #include "sev-shared.c" @@ -868,8 +895,7 @@ void snp_set_memory_private(unsigned long vaddr, unsigned long npages) void snp_accept_memory(phys_addr_t start, phys_addr_t end) { - unsigned long vaddr; - unsigned int npages; + unsigned long vaddr, npages; if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) return; @@ -940,7 +966,7 @@ static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa) free_page((unsigned long)vmsa); } -static int wakeup_cpu_via_vmgexit(int apic_id, unsigned long start_ip) +static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip) { struct sev_es_save_area *cur_vmsa, *vmsa; struct ghcb_state state; @@ -1509,6 +1535,9 @@ static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) return ES_DECODE_FAILED; } + if (user_mode(ctxt->regs)) + return ES_UNSUPPORTED; + switch (mmio) { case INSN_MMIO_WRITE: memcpy(ghcb->shared_buffer, reg_data, bytes); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 6eb06d001bcc..96a771f9f930 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -131,7 +131,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) } /* - * Disable virtualization, APIC etc. and park the CPU in a HLT loop + * this function calls the 'stop' function on all other CPUs in the system. */ DEFINE_IDTENTRY_SYSVEC(sysvec_reboot) { @@ -172,17 +172,13 @@ static void native_stop_other_cpus(int wait) * 2) Wait for all other CPUs to report that they reached the * HLT loop in stop_this_cpu() * - * 3) If the system uses INIT/STARTUP for CPU bringup, then - * send all present CPUs an INIT vector, which brings them - * completely out of the way. + * 3) If #2 timed out send an NMI to the CPUs which did not + * yet report * - * 4) If #3 is not possible and #2 timed out send an NMI to the - * CPUs which did not yet report - * - * 5) Wait for all other CPUs to report that they reached the + * 4) Wait for all other CPUs to report that they reached the * HLT loop in stop_this_cpu() * - * #4 can obviously race against a CPU reaching the HLT loop late. + * #3 can obviously race against a CPU reaching the HLT loop late. * That CPU will have reported already and the "have all CPUs * reached HLT" condition will be true despite the fact that the * other CPU is still handling the NMI. Again, there is no @@ -198,7 +194,7 @@ static void native_stop_other_cpus(int wait) /* * Don't wait longer than a second for IPI completion. The * wait request is not checked here because that would - * prevent an NMI/INIT shutdown in case that not all + * prevent an NMI shutdown attempt in case that not all * CPUs reach shutdown state. */ timeout = USEC_PER_SEC; @@ -206,27 +202,7 @@ static void native_stop_other_cpus(int wait) udelay(1); } - /* - * Park all other CPUs in INIT including "offline" CPUs, if - * possible. That's a safe place where they can't resume execution - * of HLT and then execute the HLT loop from overwritten text or - * page tables. - * - * The only downside is a broadcast MCE, but up to the point where - * the kexec() kernel brought all APs online again an MCE will just - * make HLT resume and handle the MCE. The machine crashes and burns - * due to overwritten text, page tables and data. So there is a - * choice between fire and frying pan. The result is pretty much - * the same. Chose frying pan until x86 provides a sane mechanism - * to park a CPU. - */ - if (smp_park_other_cpus_in_init()) - goto done; - - /* - * If park with INIT was not possible and the REBOOT_VECTOR didn't - * take all secondary CPUs offline, try with the NMI. - */ + /* if the REBOOT_VECTOR didn't work, try with the NMI */ if (!cpumask_empty(&cpus_stop_mask)) { /* * If NMI IPI is enabled, try to register the stop handler @@ -249,7 +225,6 @@ static void native_stop_other_cpus(int wait) udelay(1); } -done: local_irq_save(flags); disable_local_APIC(); mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 48e040618731..c4aca66f0902 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -87,6 +87,7 @@ #include <asm/hw_irq.h> #include <asm/stackprotector.h> #include <asm/sev.h> +#include <asm/spec-ctrl.h> /* representing HT siblings of each logical CPU */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); @@ -124,7 +125,20 @@ struct mwait_cpu_dead { */ static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead); -/* Logical package management. We might want to allocate that dynamically */ +/* Logical package management. */ +struct logical_maps { + u32 phys_pkg_id; + u32 phys_die_id; + u32 logical_pkg_id; + u32 logical_die_id; +}; + +/* Temporary workaround until the full topology mechanics is in place */ +static DEFINE_PER_CPU_READ_MOSTLY(struct logical_maps, logical_maps) = { + .phys_pkg_id = U32_MAX, + .phys_die_id = U32_MAX, +}; + unsigned int __max_logical_packages __read_mostly; EXPORT_SYMBOL(__max_logical_packages); static unsigned int logical_packages __read_mostly; @@ -288,7 +302,7 @@ static void notrace start_secondary(void *unused) cpu_init(); fpu__init_cpu(); - rcu_cpu_starting(raw_smp_processor_id()); + rcutree_report_cpu_starting(raw_smp_processor_id()); x86_cpuinit.early_percpu_clock_init(); ap_starting(); @@ -337,10 +351,8 @@ int topology_phys_to_logical_pkg(unsigned int phys_pkg) int cpu; for_each_possible_cpu(cpu) { - struct cpuinfo_x86 *c = &cpu_data(cpu); - - if (c->initialized && c->phys_proc_id == phys_pkg) - return c->logical_proc_id; + if (per_cpu(logical_maps.phys_pkg_id, cpu) == phys_pkg) + return per_cpu(logical_maps.logical_pkg_id, cpu); } return -1; } @@ -355,14 +367,12 @@ EXPORT_SYMBOL(topology_phys_to_logical_pkg); */ static int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu) { - int cpu, proc_id = cpu_data(cur_cpu).phys_proc_id; + int cpu, proc_id = cpu_data(cur_cpu).topo.pkg_id; for_each_possible_cpu(cpu) { - struct cpuinfo_x86 *c = &cpu_data(cpu); - - if (c->initialized && c->cpu_die_id == die_id && - c->phys_proc_id == proc_id) - return c->logical_die_id; + if (per_cpu(logical_maps.phys_pkg_id, cpu) == proc_id && + per_cpu(logical_maps.phys_die_id, cpu) == die_id) + return per_cpu(logical_maps.logical_die_id, cpu); } return -1; } @@ -387,7 +397,9 @@ int topology_update_package_map(unsigned int pkg, unsigned int cpu) cpu, pkg, new); } found: - cpu_data(cpu).logical_proc_id = new; + per_cpu(logical_maps.phys_pkg_id, cpu) = pkg; + per_cpu(logical_maps.logical_pkg_id, cpu) = new; + cpu_data(cpu).topo.logical_pkg_id = new; return 0; } /** @@ -410,7 +422,9 @@ int topology_update_die_map(unsigned int die, unsigned int cpu) cpu, die, new); } found: - cpu_data(cpu).logical_die_id = new; + per_cpu(logical_maps.phys_die_id, cpu) = die; + per_cpu(logical_maps.logical_die_id, cpu) = new; + cpu_data(cpu).topo.logical_die_id = new; return 0; } @@ -421,8 +435,8 @@ static void __init smp_store_boot_cpu_info(void) *c = boot_cpu_data; c->cpu_index = id; - topology_update_package_map(c->phys_proc_id, id); - topology_update_die_map(c->cpu_die_id, id); + topology_update_package_map(c->topo.pkg_id, id); + topology_update_die_map(c->topo.die_id, id); c->initialized = true; } @@ -476,21 +490,21 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { int cpu1 = c->cpu_index, cpu2 = o->cpu_index; - if (c->phys_proc_id == o->phys_proc_id && - c->cpu_die_id == o->cpu_die_id && - per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) { - if (c->cpu_core_id == o->cpu_core_id) + if (c->topo.pkg_id == o->topo.pkg_id && + c->topo.die_id == o->topo.die_id && + per_cpu_llc_id(cpu1) == per_cpu_llc_id(cpu2)) { + if (c->topo.core_id == o->topo.core_id) return topology_sane(c, o, "smt"); - if ((c->cu_id != 0xff) && - (o->cu_id != 0xff) && - (c->cu_id == o->cu_id)) + if ((c->topo.cu_id != 0xff) && + (o->topo.cu_id != 0xff) && + (c->topo.cu_id == o->topo.cu_id)) return topology_sane(c, o, "smt"); } - } else if (c->phys_proc_id == o->phys_proc_id && - c->cpu_die_id == o->cpu_die_id && - c->cpu_core_id == o->cpu_core_id) { + } else if (c->topo.pkg_id == o->topo.pkg_id && + c->topo.die_id == o->topo.die_id && + c->topo.core_id == o->topo.core_id) { return topology_sane(c, o, "smt"); } @@ -499,8 +513,8 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { - if (c->phys_proc_id == o->phys_proc_id && - c->cpu_die_id == o->cpu_die_id) + if (c->topo.pkg_id == o->topo.pkg_id && + c->topo.die_id == o->topo.die_id) return true; return false; } @@ -510,11 +524,11 @@ static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) int cpu1 = c->cpu_index, cpu2 = o->cpu_index; /* If the arch didn't set up l2c_id, fall back to SMT */ - if (per_cpu(cpu_l2c_id, cpu1) == BAD_APICID) + if (per_cpu_l2c_id(cpu1) == BAD_APICID) return match_smt(c, o); /* Do not match if L2 cache id does not match: */ - if (per_cpu(cpu_l2c_id, cpu1) != per_cpu(cpu_l2c_id, cpu2)) + if (per_cpu_l2c_id(cpu1) != per_cpu_l2c_id(cpu2)) return false; return topology_sane(c, o, "l2c"); @@ -527,7 +541,7 @@ static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) */ static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { - if (c->phys_proc_id == o->phys_proc_id) + if (c->topo.pkg_id == o->topo.pkg_id) return true; return false; } @@ -560,11 +574,11 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) bool intel_snc = id && id->driver_data; /* Do not match if we do not have a valid APICID for cpu: */ - if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID) + if (per_cpu_llc_id(cpu1) == BAD_APICID) return false; /* Do not match if LLC id does not match: */ - if (per_cpu(cpu_llc_id, cpu1) != per_cpu(cpu_llc_id, cpu2)) + if (per_cpu_llc_id(cpu1) != per_cpu_llc_id(cpu2)) return false; /* @@ -640,13 +654,13 @@ static void __init build_sched_topology(void) }; #endif /* - * When there is NUMA topology inside the package skip the DIE domain + * When there is NUMA topology inside the package skip the PKG domain * since the NUMA domains will auto-magically create the right spanning * domains based on the SLIT. */ if (!x86_has_numa_in_package) { x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(DIE) + cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(PKG) }; } @@ -809,7 +823,7 @@ static void __init smp_quirk_init_udelay(void) /* * Wake up AP by INIT, INIT, STARTUP sequence. */ -static void send_init_sequence(int phys_apicid) +static void send_init_sequence(u32 phys_apicid) { int maxlvt = lapic_get_maxlvt(); @@ -835,7 +849,7 @@ static void send_init_sequence(int phys_apicid) /* * Wake up AP by INIT, INIT, STARTUP sequence. */ -static int wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) +static int wakeup_secondary_cpu_via_init(u32 phys_apicid, unsigned long start_eip) { unsigned long send_status = 0, accept_status = 0; int num_starts, j, maxlvt; @@ -982,7 +996,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) * Returns zero if startup was successfully sent, else error code from * ->wakeup_secondary_cpu. */ -static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) +static int do_boot_cpu(u32 apicid, int cpu, struct task_struct *idle) { unsigned long start_ip = real_mode_header->trampoline_start; int ret; @@ -1050,7 +1064,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) int native_kick_ap(unsigned int cpu, struct task_struct *tidle) { - int apicid = apic->cpu_present_to_apicid(cpu); + u32 apicid = apic->cpu_present_to_apicid(cpu); int err; lockdep_assert_irqs_enabled(); @@ -1240,33 +1254,6 @@ void arch_thaw_secondary_cpus_end(void) cache_aps_init(); } -bool smp_park_other_cpus_in_init(void) -{ - unsigned int cpu, this_cpu = smp_processor_id(); - unsigned int apicid; - - if (apic->wakeup_secondary_cpu_64 || apic->wakeup_secondary_cpu) - return false; - - /* - * If this is a crash stop which does not execute on the boot CPU, - * then this cannot use the INIT mechanism because INIT to the boot - * CPU will reset the machine. - */ - if (this_cpu) - return false; - - for_each_cpu_and(cpu, &cpus_booted_once_mask, cpu_present_mask) { - if (cpu == this_cpu) - continue; - apicid = apic->cpu_present_to_apicid(cpu); - if (apicid == BAD_APICID) - continue; - send_init_sequence(apicid); - } - return true; -} - /* * Early setup to make printk work. */ @@ -1432,7 +1419,7 @@ static void remove_siblinginfo(int cpu) cpumask_clear(topology_sibling_cpumask(cpu)); cpumask_clear(topology_core_cpumask(cpu)); cpumask_clear(topology_die_cpumask(cpu)); - c->cpu_core_id = 0; + c->topo.core_id = 0; c->booted_cores = 0; cpumask_clear_cpu(cpu, cpu_sibling_setup_mask); recompute_smt_state(); @@ -1623,8 +1610,15 @@ void __noreturn hlt_play_dead(void) native_halt(); } +/* + * native_play_dead() is essentially a __noreturn function, but it can't + * be marked as such as the compiler may complain about it. + */ void native_play_dead(void) { + if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) + __update_spec_ctrl(0); + play_dead_common(); tboot_shutdown(TB_SHUTDOWN_WFS); diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index ca004e2e4469..0bab03130033 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -54,7 +54,7 @@ void arch_unregister_cpu(int num) EXPORT_SYMBOL(arch_unregister_cpu); #else /* CONFIG_HOTPLUG_CPU */ -static int __init arch_register_cpu(int num) +int __init arch_register_cpu(int num) { return register_cpu(&per_cpu(cpu_devices, num).cpu, num); } diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index bbc440c93e08..1123ef3ccf90 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -15,6 +15,7 @@ * ( The serial nature of the boot logic and the CPU hotplug lock * protects against more than 2 CPUs entering this code. ) */ +#include <linux/workqueue.h> #include <linux/topology.h> #include <linux/spinlock.h> #include <linux/kernel.h> @@ -342,6 +343,13 @@ static inline unsigned int loop_timeout(int cpu) return (cpumask_weight(topology_core_cpumask(cpu)) > 1) ? 2 : 20; } +static void tsc_sync_mark_tsc_unstable(struct work_struct *work) +{ + mark_tsc_unstable("check_tsc_sync_source failed"); +} + +static DECLARE_WORK(tsc_sync_work, tsc_sync_mark_tsc_unstable); + /* * The freshly booted CPU initiates this via an async SMP function call. */ @@ -395,7 +403,7 @@ retry: "turning off TSC clock.\n", max_warp); if (random_warps) pr_warn("TSC warped randomly between CPUs\n"); - mark_tsc_unstable("check_tsc_sync_source failed"); + schedule_work(&tsc_sync_work); } /* diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 7e574cf3bf8a..d00c28aaa5be 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -85,7 +85,7 @@ static struct orc_entry *__orc_find(int *ip_table, struct orc_entry *u_table, { int *first = ip_table; int *last = ip_table + num_entries - 1; - int *mid = first, *found = first; + int *mid, *found = first; if (!num_entries) return NULL; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index f15fb71f280e..54a5596adaa6 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -139,10 +139,7 @@ SECTIONS STATIC_CALL_TEXT ALIGN_ENTRY_TEXT_BEGIN -#ifdef CONFIG_CPU_SRSO *(.text..__x86.rethunk_untrain) -#endif - ENTRY_TEXT #ifdef CONFIG_CPU_SRSO @@ -520,12 +517,12 @@ INIT_PER_CPU(irq_stack_backing_store); "fixed_percpu_data is not at start of per-cpu area"); #endif -#ifdef CONFIG_RETHUNK +#ifdef CONFIG_CPU_UNRET_ENTRY . = ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned"); -. = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned"); #endif #ifdef CONFIG_CPU_SRSO +. = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned"); /* * GNU ld cannot do XOR until 2.41. * https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=f6f78318fca803c4907fb8d7f6ded8295f1947b1 diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 65e96b76c423..d3fc01770558 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -127,7 +127,7 @@ static void __init vsmp_cap_cpus(void) #endif } -static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) +static u32 apicid_phys_pkg_id(u32 initial_apic_id, int index_msb) { return read_apic_id() >> index_msb; } diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ed90f148140d..950c12868d30 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -154,4 +154,15 @@ config KVM_PROVE_MMU config KVM_EXTERNAL_WRITE_TRACKING bool +config KVM_MAX_NR_VCPUS + int "Maximum number of vCPUs per KVM guest" + depends on KVM + range 1024 4096 + default 4096 if MAXSMP + default 1024 + help + Set the maximum number of vCPUs per KVM guest. Larger values will increase + the memory footprint of each KVM guest, regardless of how many vCPUs are + created for a given VM. + endif # VIRTUALIZATION diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0544e30b4946..dda6fc4cfae8 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -360,14 +360,6 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vcpu->arch.guest_supported_xcr0 = cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent); - /* - * FP+SSE can always be saved/restored via KVM_{G,S}ET_XSAVE, even if - * XSAVE/XCRO are not exposed to the guest, and even if XSAVE isn't - * supported by the host. - */ - vcpu->arch.guest_fpu.fpstate->user_xfeatures = vcpu->arch.guest_supported_xcr0 | - XFEATURE_MASK_FPSSE; - kvm_update_pv_runtime(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); @@ -456,7 +448,9 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, vcpu->arch.cpuid_nent = nent; vcpu->arch.kvm_cpuid = kvm_get_hypervisor_cpuid(vcpu, KVM_SIGNATURE); +#ifdef CONFIG_KVM_XEN vcpu->arch.xen.cpuid = kvm_get_hypervisor_cpuid(vcpu, XEN_SIGNATURE); +#endif kvm_vcpu_after_set_cpuid(vcpu); return 0; @@ -761,11 +755,13 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_mask(CPUID_8000_0021_EAX, F(NO_NESTED_DATA_BP) | F(LFENCE_RDTSC) | 0 /* SmmPgCfgLock */ | - F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */ + F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */ | + F(WRMSR_XX_BASE_NS) ); - if (cpu_feature_enabled(X86_FEATURE_SRSO_NO)) - kvm_cpu_cap_set(X86_FEATURE_SRSO_NO); + kvm_cpu_cap_check_and_set(X86_FEATURE_SBPB); + kvm_cpu_cap_check_and_set(X86_FEATURE_IBPB_BRTYPE); + kvm_cpu_cap_check_and_set(X86_FEATURE_SRSO_NO); kvm_cpu_cap_init_kvm_defined(CPUID_8000_0022_EAX, F(PERFMON_V2) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 284fa4704553..0b90532b6e26 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -174,7 +174,8 @@ static inline bool guest_has_spec_ctrl_msr(struct kvm_vcpu *vcpu) static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu) { return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) || - guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB)); + guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB) || + guest_cpuid_has(vcpu, X86_FEATURE_SBPB)); } static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 7c2dac6824e2..238afd7335e4 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -727,10 +727,12 @@ static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count, stimer_cleanup(stimer); stimer->count = count; - if (stimer->count == 0) - stimer->config.enable = 0; - else if (stimer->config.auto_enable) - stimer->config.enable = 1; + if (!host) { + if (stimer->count == 0) + stimer->config.enable = 0; + else if (stimer->config.auto_enable) + stimer->config.enable = 1; + } if (stimer->config.enable) stimer_mark_pending(stimer, false); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index dcd60b39e794..245b20973cae 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2444,22 +2444,22 @@ EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) { struct kvm_lapic *apic = vcpu->arch.apic; - u64 val; /* - * ICR is a single 64-bit register when x2APIC is enabled. For legacy - * xAPIC, ICR writes need to go down the common (slightly slower) path - * to get the upper half from ICR2. + * ICR is a single 64-bit register when x2APIC is enabled, all others + * registers hold 32-bit values. For legacy xAPIC, ICR writes need to + * go down the common path to get the upper half from ICR2. + * + * Note, using the write helpers may incur an unnecessary write to the + * virtual APIC state, but KVM needs to conditionally modify the value + * in certain cases, e.g. to clear the ICR busy bit. The cost of extra + * conditional branches is likely a wash relative to the cost of the + * maybe-unecessary write, and both are in the noise anyways. */ - if (apic_x2apic_mode(apic) && offset == APIC_ICR) { - val = kvm_lapic_get_reg64(apic, APIC_ICR); - kvm_apic_send_ipi(apic, (u32)val, (u32)(val >> 32)); - trace_kvm_apic_write(APIC_ICR, val); - } else { - /* TODO: optimize to just emulate side effect w/o one more write */ - val = kvm_lapic_get_reg(apic, offset); - kvm_lapic_reg_write(apic, offset, (u32)val); - } + if (apic_x2apic_mode(apic) && offset == APIC_ICR) + kvm_x2apic_icr_write(apic, kvm_lapic_get_reg64(apic, APIC_ICR)); + else + kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset)); } EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode); @@ -2670,6 +2670,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) u64 msr_val; int i; + static_call_cond(kvm_x86_apicv_pre_state_restore)(vcpu); + if (!init_event) { msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_reset_bsp(vcpu)) @@ -2759,13 +2761,17 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) { u32 reg = kvm_lapic_get_reg(apic, lvt_type); int vector, mode, trig_mode; + int r; if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) { vector = reg & APIC_VECTOR_MASK; mode = reg & APIC_MODE_MASK; trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; - return __apic_accept_irq(apic, mode, vector, 1, trig_mode, - NULL); + + r = __apic_accept_irq(apic, mode, vector, 1, trig_mode, NULL); + if (r && lvt_type == APIC_LVTPC) + kvm_lapic_set_reg(apic, APIC_LVTPC, reg | APIC_LVT_MASKED); + return r; } return 0; } @@ -2977,6 +2983,8 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) struct kvm_lapic *apic = vcpu->arch.apic; int r; + static_call_cond(kvm_x86_apicv_pre_state_restore)(vcpu); + kvm_lapic_set_base(vcpu, vcpu->arch.apic_base); /* set SPIV separately to get count of SW disabled APICs right */ apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV))); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 253fb2093d5d..bb8c86eefac0 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -237,6 +237,13 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return -(u32)fault & errcode; } +bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma); + +static inline bool kvm_mmu_honors_guest_mtrrs(struct kvm *kvm) +{ + return __kvm_mmu_honors_guest_mtrrs(kvm_arch_has_noncoherent_dma(kvm)); +} + void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index f7901cb4d2fa..c57e181bba21 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3425,8 +3425,8 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_mmu_page *sp; int ret = RET_PF_INVALID; - u64 spte = 0ull; - u64 *sptep = NULL; + u64 spte; + u64 *sptep; uint retry_count = 0; if (!page_fault_can_be_fast(fault)) @@ -3442,6 +3442,14 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) else sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte); + /* + * It's entirely possible for the mapping to have been zapped + * by a different task, but the root page should always be + * available as the vCPU holds a reference to its root(s). + */ + if (WARN_ON_ONCE(!sptep)) + spte = REMOVED_SPTE; + if (!is_shadow_present_pte(spte)) break; @@ -4479,21 +4487,28 @@ out_unlock: } #endif -int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) +bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma) { /* - * If the guest's MTRRs may be used to compute the "real" memtype, - * restrict the mapping level to ensure KVM uses a consistent memtype - * across the entire mapping. If the host MTRRs are ignored by TDP - * (shadow_memtype_mask is non-zero), and the VM has non-coherent DMA - * (DMA doesn't snoop CPU caches), KVM's ABI is to honor the memtype - * from the guest's MTRRs so that guest accesses to memory that is - * DMA'd aren't cached against the guest's wishes. + * If host MTRRs are ignored (shadow_memtype_mask is non-zero), and the + * VM has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is + * to honor the memtype from the guest's MTRRs so that guest accesses + * to memory that is DMA'd aren't cached against the guest's wishes. * * Note, KVM may still ultimately ignore guest MTRRs for certain PFNs, * e.g. KVM will force UC memtype for host MMIO. */ - if (shadow_memtype_mask && kvm_arch_has_noncoherent_dma(vcpu->kvm)) { + return vm_has_noncoherent_dma && shadow_memtype_mask; +} + +int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) +{ + /* + * If the guest's MTRRs may be used to compute the "real" memtype, + * restrict the mapping level to ensure KVM uses a consistent memtype + * across the entire mapping. + */ + if (kvm_mmu_honors_guest_mtrrs(vcpu->kvm)) { for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) { int page_num = KVM_PAGES_PER_HPAGE(fault->max_level); gfn_t base = gfn_round_for_level(fault->gfn, @@ -6785,11 +6800,7 @@ static unsigned long mmu_shrink_count(struct shrinker *shrink, return percpu_counter_read_positive(&kvm_total_used_mmu_pages); } -static struct shrinker mmu_shrinker = { - .count_objects = mmu_shrink_count, - .scan_objects = mmu_shrink_scan, - .seeks = DEFAULT_SEEKS * 10, -}; +static struct shrinker *mmu_shrinker; static void mmu_destroy_caches(void) { @@ -6922,10 +6933,16 @@ int kvm_mmu_vendor_module_init(void) if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) goto out; - ret = register_shrinker(&mmu_shrinker, "x86-mmu"); - if (ret) + mmu_shrinker = shrinker_alloc(0, "x86-mmu"); + if (!mmu_shrinker) goto out_shrinker; + mmu_shrinker->count_objects = mmu_shrink_count; + mmu_shrinker->scan_objects = mmu_shrink_scan; + mmu_shrinker->seeks = DEFAULT_SEEKS * 10; + + shrinker_register(mmu_shrinker); + return 0; out_shrinker: @@ -6947,7 +6964,7 @@ void kvm_mmu_vendor_module_exit(void) { mmu_destroy_caches(); percpu_counter_destroy(&kvm_total_used_mmu_pages); - unregister_shrinker(&mmu_shrinker); + shrinker_free(mmu_shrinker); } /* diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 3eb6e7f47e96..a67c28a56417 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -320,7 +320,7 @@ static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; gfn_t start, end; - if (!tdp_enabled || !kvm_arch_has_noncoherent_dma(vcpu->kvm)) + if (!kvm_mmu_honors_guest_mtrrs(vcpu->kvm)) return; if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index edb89b51b383..9ae07db6f0f6 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -93,14 +93,6 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops) #undef __KVM_X86_PMU_OP } -static void kvm_pmi_trigger_fn(struct irq_work *irq_work) -{ - struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work); - struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); - - kvm_pmu_deliver_pmi(vcpu); -} - static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -124,20 +116,7 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); } - if (!pmc->intr || skip_pmi) - return; - - /* - * Inject PMI. If vcpu was in a guest mode during NMI PMI - * can be ejected on a guest mode re-entry. Otherwise we can't - * be sure that vcpu wasn't executing hlt instruction at the - * time of vmexit and is not going to re-enter guest mode until - * woken up. So we should wake it, but this is impossible from - * NMI context. Do it from irq work instead. - */ - if (in_pmi && !kvm_handling_nmi_from_guest(pmc->vcpu)) - irq_work_queue(&pmc_to_pmu(pmc)->irq_work); - else + if (pmc->intr && !skip_pmi) kvm_make_request(KVM_REQ_PMI, pmc->vcpu); } @@ -675,9 +654,6 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) void kvm_pmu_reset(struct kvm_vcpu *vcpu) { - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - - irq_work_sync(&pmu->irq_work); static_call(kvm_x86_pmu_reset)(vcpu); } @@ -687,7 +663,6 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) memset(pmu, 0, sizeof(*pmu)); static_call(kvm_x86_pmu_init)(vcpu); - init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn); pmu->event_count = 0; pmu->need_cleanup = false; kvm_pmu_refresh(vcpu); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 7d9ba301c090..1d64113de488 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -74,6 +74,12 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc) return counter & pmc_bitmask(pmc); } +static inline void pmc_write_counter(struct kvm_pmc *pmc, u64 val) +{ + pmc->counter += val - pmc_read_counter(pmc); + pmc->counter &= pmc_bitmask(pmc); +} + static inline void pmc_release_perf_event(struct kvm_pmc *pmc) { if (pmc->perf_event) { diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index b42111a24cc2..dc3d95fdca7d 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -324,7 +324,6 @@ void enter_smm(struct kvm_vcpu *vcpu) cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG); static_call(kvm_x86_set_cr0)(vcpu, cr0); - vcpu->arch.cr0 = cr0; static_call(kvm_x86_set_cr4)(vcpu, 0); diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 2092db892d7d..4b74ea91f4e6 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -529,8 +529,11 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE: WARN_ONCE(1, "Invalid backing page\n"); break; + case AVIC_IPI_FAILURE_INVALID_IPI_VECTOR: + /* Invalid IPI with vector < 16 */ + break; default: - pr_err("Unknown IPI interception\n"); + vcpu_unimpl(vcpu, "Unknown avic incomplete IPI interception\n"); } return 1; diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index dd496c9e5f91..3fea8c47679e 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1253,6 +1253,9 @@ void svm_leave_nested(struct kvm_vcpu *vcpu) nested_svm_uninit_mmu_context(vcpu); vmcb_mark_all_dirty(svm->vmcb); + + if (kvm_apicv_activated(vcpu->kvm)) + kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); } kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index cef5a3d0abd0..373ff6a6687b 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -160,7 +160,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* MSR_PERFCTRn */ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER); if (pmc) { - pmc->counter += data - pmc_read_counter(pmc); + pmc_write_counter(pmc, data); pmc_update_sample_period(pmc); return 0; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9507df93f410..712146312358 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -199,7 +199,7 @@ module_param_named(npt, npt_enabled, bool, 0444); /* allow nested virtualization in KVM/SVM */ static int nested = true; -module_param(nested, int, S_IRUGO); +module_param(nested, int, 0444); /* enable/disable Next RIP Save */ int nrips = true; @@ -364,8 +364,6 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK; } -static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, - void *insn, int insn_len); static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu, bool commit_side_effects) @@ -386,14 +384,6 @@ static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu, } if (!svm->next_rip) { - /* - * FIXME: Drop this when kvm_emulate_instruction() does the - * right thing and treats "can't emulate" as outright failure - * for EMULTYPE_SKIP. - */ - if (!svm_can_emulate_instruction(vcpu, EMULTYPE_SKIP, NULL, 0)) - return 0; - if (unlikely(!commit_side_effects)) old_rflags = svm->vmcb->save.rflags; @@ -531,8 +521,6 @@ static bool __kvm_is_svm_supported(void) int cpu = smp_processor_id(); struct cpuinfo_x86 *c = &cpu_data(cpu); - u64 vm_cr; - if (c->x86_vendor != X86_VENDOR_AMD && c->x86_vendor != X86_VENDOR_HYGON) { pr_err("CPU %d isn't AMD or Hygon\n", cpu); @@ -549,12 +537,6 @@ static bool __kvm_is_svm_supported(void) return false; } - rdmsrl(MSR_VM_CR, vm_cr); - if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) { - pr_err("SVM disabled (by BIOS) in MSR_VM_CR on CPU %d\n", cpu); - return false; - } - return true; } @@ -691,7 +673,7 @@ static int svm_hardware_enable(void) */ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) { struct sev_es_save_area *hostsa; - u32 msr_hi; + u32 __maybe_unused msr_hi; hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400); @@ -913,8 +895,7 @@ void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) if (intercept == svm->x2avic_msrs_intercepted) return; - if (!x2avic_enabled || - !apic_x2apic_mode(svm->vcpu.arch.apic)) + if (!x2avic_enabled) return; for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) { @@ -2203,12 +2184,6 @@ static int shutdown_interception(struct kvm_vcpu *vcpu) struct kvm_run *kvm_run = vcpu->run; struct vcpu_svm *svm = to_svm(vcpu); - /* - * The VM save area has already been encrypted so it - * cannot be reinitialized - just terminate. - */ - if (sev_es_guest(vcpu->kvm)) - return -EINVAL; /* * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put @@ -2217,9 +2192,14 @@ static int shutdown_interception(struct kvm_vcpu *vcpu) * userspace. At a platform view, INIT is acceptable behavior as * there exist bare metal platforms that automatically INIT the CPU * in response to shutdown. + * + * The VM save area for SEV-ES guests has already been encrypted so it + * cannot be reinitialized, i.e. synthesizing INIT is futile. */ - clear_page(svm->vmcb); - kvm_vcpu_reset(vcpu, true); + if (!sev_es_guest(vcpu->kvm)) { + clear_page(svm->vmcb); + kvm_vcpu_reset(vcpu, true); + } kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; @@ -4728,15 +4708,15 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu) } #endif -static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, - void *insn, int insn_len) +static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len) { bool smep, smap, is_user; u64 error_code; /* Emulation is always possible when KVM has access to all guest state. */ if (!sev_guest(vcpu->kvm)) - return true; + return X86EMUL_CONTINUE; /* #UD and #GP should never be intercepted for SEV guests. */ WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD | @@ -4748,14 +4728,14 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, * to guest register state. */ if (sev_es_guest(vcpu->kvm)) - return false; + return X86EMUL_RETRY_INSTR; /* * Emulation is possible if the instruction is already decoded, e.g. * when completing I/O after returning from userspace. */ if (emul_type & EMULTYPE_NO_DECODE) - return true; + return X86EMUL_CONTINUE; /* * Emulation is possible for SEV guests if and only if a prefilled @@ -4781,9 +4761,11 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, * success (and in practice it will work the vast majority of the time). */ if (unlikely(!insn)) { - if (!(emul_type & EMULTYPE_SKIP)) - kvm_queue_exception(vcpu, UD_VECTOR); - return false; + if (emul_type & EMULTYPE_SKIP) + return X86EMUL_UNHANDLEABLE; + + kvm_queue_exception(vcpu, UD_VECTOR); + return X86EMUL_PROPAGATE_FAULT; } /* @@ -4794,7 +4776,7 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, * table used to translate CS:RIP resides in emulated MMIO. */ if (likely(insn_len)) - return true; + return X86EMUL_CONTINUE; /* * Detect and workaround Errata 1096 Fam_17h_00_0Fh. @@ -4852,6 +4834,7 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, kvm_inject_gp(vcpu, 0); else kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + return X86EMUL_PROPAGATE_FAULT; } resume_guest: @@ -4869,7 +4852,7 @@ resume_guest: * doesn't explicitly define "ignored", i.e. doing nothing and letting * the guest spin is technically "ignoring" the access. */ - return false; + return X86EMUL_RETRY_INSTR; } static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) @@ -5029,7 +5012,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vm_copy_enc_context_from = sev_vm_copy_enc_context_from, .vm_move_enc_context_from = sev_vm_move_enc_context_from, - .can_emulate_instruction = svm_can_emulate_instruction, + .check_emulate_instruction = svm_check_emulate_instruction, .apic_init_signal_blocked = svm_apic_init_signal_blocked, diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index f2efa0bf7ae8..820d3e1f6b4f 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -436,11 +436,11 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated && !(msr & MSR_PMC_FULL_WIDTH_BIT)) data = (s64)(s32)data; - pmc->counter += data - pmc_read_counter(pmc); + pmc_write_counter(pmc, data); pmc_update_sample_period(pmc); break; } else if ((pmc = get_fixed_pmc(pmu, msr))) { - pmc->counter += data - pmc_read_counter(pmc); + pmc_write_counter(pmc, data); pmc_update_sample_period(pmc); break; } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 72e3943f3693..be20a60047b1 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -82,28 +82,28 @@ bool __read_mostly enable_vpid = 1; module_param_named(vpid, enable_vpid, bool, 0444); static bool __read_mostly enable_vnmi = 1; -module_param_named(vnmi, enable_vnmi, bool, S_IRUGO); +module_param_named(vnmi, enable_vnmi, bool, 0444); bool __read_mostly flexpriority_enabled = 1; -module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); +module_param_named(flexpriority, flexpriority_enabled, bool, 0444); bool __read_mostly enable_ept = 1; -module_param_named(ept, enable_ept, bool, S_IRUGO); +module_param_named(ept, enable_ept, bool, 0444); bool __read_mostly enable_unrestricted_guest = 1; module_param_named(unrestricted_guest, - enable_unrestricted_guest, bool, S_IRUGO); + enable_unrestricted_guest, bool, 0444); bool __read_mostly enable_ept_ad_bits = 1; -module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); +module_param_named(eptad, enable_ept_ad_bits, bool, 0444); static bool __read_mostly emulate_invalid_guest_state = true; -module_param(emulate_invalid_guest_state, bool, S_IRUGO); +module_param(emulate_invalid_guest_state, bool, 0444); static bool __read_mostly fasteoi = 1; -module_param(fasteoi, bool, S_IRUGO); +module_param(fasteoi, bool, 0444); -module_param(enable_apicv, bool, S_IRUGO); +module_param(enable_apicv, bool, 0444); bool __read_mostly enable_ipiv = true; module_param(enable_ipiv, bool, 0444); @@ -114,10 +114,10 @@ module_param(enable_ipiv, bool, 0444); * use VMX instructions. */ static bool __read_mostly nested = 1; -module_param(nested, bool, S_IRUGO); +module_param(nested, bool, 0444); bool __read_mostly enable_pml = 1; -module_param_named(pml, enable_pml, bool, S_IRUGO); +module_param_named(pml, enable_pml, bool, 0444); static bool __read_mostly error_on_inconsistent_vmcs_config = true; module_param(error_on_inconsistent_vmcs_config, bool, 0444); @@ -1657,8 +1657,8 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) return 0; } -static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, - void *insn, int insn_len) +static int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len) { /* * Emulation of instructions in SGX enclaves is impossible as RIP does @@ -1669,9 +1669,9 @@ static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, */ if (to_vmx(vcpu)->exit_reason.enclave_mode) { kvm_queue_exception(vcpu, UD_VECTOR); - return false; + return X86EMUL_PROPAGATE_FAULT; } - return true; + return X86EMUL_CONTINUE; } static int skip_emulated_instruction(struct kvm_vcpu *vcpu) @@ -5792,7 +5792,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) { gpa_t gpa; - if (!vmx_can_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0)) + if (vmx_check_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0)) return 1; /* @@ -6912,7 +6912,7 @@ static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); } -static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) +static void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -7579,8 +7579,6 @@ static int vmx_vm_init(struct kvm *kvm) static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { - u8 cache; - /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in * memory aliases with conflicting memory types and sometimes MCEs. * We have to be careful as to what are honored and when. @@ -7607,11 +7605,10 @@ static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) { if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) - cache = MTRR_TYPE_WRBACK; + return MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT; else - cache = MTRR_TYPE_UNCACHABLE; - - return (cache << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; + return (MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT) | + VMX_EPT_IPAT_BIT; } return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT; @@ -8286,7 +8283,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .set_apic_access_page_addr = vmx_set_apic_access_page_addr, .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, .load_eoi_exitmap = vmx_load_eoi_exitmap, - .apicv_post_state_restore = vmx_apicv_post_state_restore, + .apicv_pre_state_restore = vmx_apicv_pre_state_restore, .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, @@ -8341,7 +8338,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .enable_smi_window = vmx_enable_smi_window, #endif - .can_emulate_instruction = vmx_can_emulate_instruction, + .check_emulate_instruction = vmx_check_emulate_instruction, .apic_init_signal_blocked = vmx_apic_init_signal_blocked, .migrate_timers = vmx_migrate_timers, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9f18b06bbda6..2c924075f6f1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -145,21 +145,21 @@ EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits); EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg); static bool __read_mostly ignore_msrs = 0; -module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); +module_param(ignore_msrs, bool, 0644); bool __read_mostly report_ignored_msrs = true; -module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR); +module_param(report_ignored_msrs, bool, 0644); EXPORT_SYMBOL_GPL(report_ignored_msrs); unsigned int min_timer_period_us = 200; -module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); +module_param(min_timer_period_us, uint, 0644); static bool __read_mostly kvmclock_periodic_sync = true; -module_param(kvmclock_periodic_sync, bool, S_IRUGO); +module_param(kvmclock_periodic_sync, bool, 0444); /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ static u32 __read_mostly tsc_tolerance_ppm = 250; -module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); +module_param(tsc_tolerance_ppm, uint, 0644); /* * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables @@ -168,13 +168,13 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); * tuning, i.e. allows privileged userspace to set an exact advancement time. */ static int __read_mostly lapic_timer_advance_ns = -1; -module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR); +module_param(lapic_timer_advance_ns, int, 0644); static bool __read_mostly vector_hashing = true; -module_param(vector_hashing, bool, S_IRUGO); +module_param(vector_hashing, bool, 0444); bool __read_mostly enable_vmware_backdoor = false; -module_param(enable_vmware_backdoor, bool, S_IRUGO); +module_param(enable_vmware_backdoor, bool, 0444); EXPORT_SYMBOL_GPL(enable_vmware_backdoor); /* @@ -186,7 +186,7 @@ static int __read_mostly force_emulation_prefix; module_param(force_emulation_prefix, int, 0644); int __read_mostly pi_inject_timer = -1; -module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR); +module_param(pi_inject_timer, bint, 0644); /* Enable/disable PMU virtualization */ bool __read_mostly enable_pmu = true; @@ -962,7 +962,7 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon kvm_mmu_reset_context(vcpu); if (((cr0 ^ old_cr0) & X86_CR0_CD) && - kvm_arch_has_noncoherent_dma(vcpu->kvm) && + kvm_mmu_honors_guest_mtrrs(vcpu->kvm) && !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL); } @@ -2331,14 +2331,9 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_o if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version))) return; - /* - * The guest calculates current wall clock time by adding - * system time (updated by kvm_guest_time_update below) to the - * wall clock specified here. We do the reverse here. - */ - wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm); + wall_nsec = kvm_get_wall_clock_epoch(kvm); - wc.nsec = do_div(wall_nsec, 1000000000); + wc.nsec = do_div(wall_nsec, NSEC_PER_SEC); wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */ wc.version = version; @@ -2714,8 +2709,9 @@ static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc, kvm_track_tsc_matching(vcpu); } -static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) +static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value) { + u64 data = user_value ? *user_value : 0; struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; @@ -2730,25 +2726,37 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) if (vcpu->arch.virtual_tsc_khz) { if (data == 0) { /* - * detection of vcpu initialization -- need to sync - * with other vCPUs. This particularly helps to keep - * kvm_clock stable after CPU hotplug + * Force synchronization when creating a vCPU, or when + * userspace explicitly writes a zero value. */ synchronizing = true; - } else { + } else if (kvm->arch.user_set_tsc) { u64 tsc_exp = kvm->arch.last_tsc_write + nsec_to_cycles(vcpu, elapsed); u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL; /* - * Special case: TSC write with a small delta (1 second) - * of virtual cycle time against real time is - * interpreted as an attempt to synchronize the CPU. + * Here lies UAPI baggage: when a user-initiated TSC write has + * a small delta (1 second) of virtual cycle time against the + * previously set vCPU, we assume that they were intended to be + * in sync and the delta was only due to the racy nature of the + * legacy API. + * + * This trick falls down when restoring a guest which genuinely + * has been running for less time than the 1 second of imprecision + * which we allow for in the legacy API. In this case, the first + * value written by userspace (on any vCPU) should not be subject + * to this 'correction' to make it sync up with values that only + * come from the kernel's default vCPU creation. Make the 1-second + * slop hack only trigger if the user_set_tsc flag is already set. */ synchronizing = data < tsc_exp + tsc_hz && data + tsc_hz > tsc_exp; } } + if (user_value) + kvm->arch.user_set_tsc = true; + /* * For a reliable TSC, we can match TSC offsets, and for an unstable * TSC, we add elapsed time in this computation. We could let the @@ -3232,16 +3240,94 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) if (vcpu->pv_time.active) kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0); +#ifdef CONFIG_KVM_XEN if (vcpu->xen.vcpu_info_cache.active) kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache, offsetof(struct compat_vcpu_info, time)); if (vcpu->xen.vcpu_time_info_cache.active) kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0); +#endif kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); return 0; } /* + * The pvclock_wall_clock ABI tells the guest the wall clock time at + * which it started (i.e. its epoch, when its kvmclock was zero). + * + * In fact those clocks are subtly different; wall clock frequency is + * adjusted by NTP and has leap seconds, while the kvmclock is a + * simple function of the TSC without any such adjustment. + * + * Perhaps the ABI should have exposed CLOCK_TAI and a ratio between + * that and kvmclock, but even that would be subject to change over + * time. + * + * Attempt to calculate the epoch at a given moment using the *same* + * TSC reading via kvm_get_walltime_and_clockread() to obtain both + * wallclock and kvmclock times, and subtracting one from the other. + * + * Fall back to using their values at slightly different moments by + * calling ktime_get_real_ns() and get_kvmclock_ns() separately. + */ +uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm) +{ +#ifdef CONFIG_X86_64 + struct pvclock_vcpu_time_info hv_clock; + struct kvm_arch *ka = &kvm->arch; + unsigned long seq, local_tsc_khz; + struct timespec64 ts; + uint64_t host_tsc; + + do { + seq = read_seqcount_begin(&ka->pvclock_sc); + + local_tsc_khz = 0; + if (!ka->use_master_clock) + break; + + /* + * The TSC read and the call to get_cpu_tsc_khz() must happen + * on the same CPU. + */ + get_cpu(); + + local_tsc_khz = get_cpu_tsc_khz(); + + if (local_tsc_khz && + !kvm_get_walltime_and_clockread(&ts, &host_tsc)) + local_tsc_khz = 0; /* Fall back to old method */ + + put_cpu(); + + /* + * These values must be snapshotted within the seqcount loop. + * After that, it's just mathematics which can happen on any + * CPU at any time. + */ + hv_clock.tsc_timestamp = ka->master_cycle_now; + hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; + + } while (read_seqcount_retry(&ka->pvclock_sc, seq)); + + /* + * If the conditions were right, and obtaining the wallclock+TSC was + * successful, calculate the KVM clock at the corresponding time and + * subtract one from the other to get the guest's epoch in nanoseconds + * since 1970-01-01. + */ + if (local_tsc_khz) { + kvm_get_time_scale(NSEC_PER_SEC, local_tsc_khz * NSEC_PER_USEC, + &hv_clock.tsc_shift, + &hv_clock.tsc_to_system_mul); + return ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec - + __pvclock_read_cycles(&hv_clock, host_tsc); + } +#endif + return ktime_get_real_ns() - get_kvmclock_ns(kvm); +} + +/* * kvmclock updates which are isolated to a given vcpu, such as * vcpu->cpu migration, should not allow system_timestamp from * the rest of the vcpus to remain static. Otherwise ntp frequency @@ -3290,9 +3376,6 @@ static void kvmclock_sync_fn(struct work_struct *work) kvmclock_sync_work); struct kvm *kvm = container_of(ka, struct kvm, arch); - if (!kvmclock_periodic_sync) - return; - schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); schedule_delayed_work(&kvm->arch.kvmclock_sync_work, KVMCLOCK_SYNC_PERIOD); @@ -3641,6 +3724,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_AMD64_PATCH_LOADER: case MSR_AMD64_BU_CFG2: case MSR_AMD64_DC_CFG: + case MSR_AMD64_TW_CFG: case MSR_F15H_EX_CFG: break; @@ -3670,17 +3754,36 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.perf_capabilities = data; kvm_pmu_refresh(vcpu); break; - case MSR_IA32_PRED_CMD: - if (!msr_info->host_initiated && !guest_has_pred_cmd_msr(vcpu)) - return 1; + case MSR_IA32_PRED_CMD: { + u64 reserved_bits = ~(PRED_CMD_IBPB | PRED_CMD_SBPB); + + if (!msr_info->host_initiated) { + if ((!guest_has_pred_cmd_msr(vcpu))) + return 1; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && + !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB)) + reserved_bits |= PRED_CMD_IBPB; - if (!boot_cpu_has(X86_FEATURE_IBPB) || (data & ~PRED_CMD_IBPB)) + if (!guest_cpuid_has(vcpu, X86_FEATURE_SBPB)) + reserved_bits |= PRED_CMD_SBPB; + } + + if (!boot_cpu_has(X86_FEATURE_IBPB)) + reserved_bits |= PRED_CMD_IBPB; + + if (!boot_cpu_has(X86_FEATURE_SBPB)) + reserved_bits |= PRED_CMD_SBPB; + + if (data & reserved_bits) return 1; + if (!data) break; - wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); + wrmsrl(MSR_IA32_PRED_CMD, data); break; + } case MSR_IA32_FLUSH_CMD: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D)) @@ -3700,13 +3803,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) data &= ~(u64)0x100; /* ignore ignne emulation enable */ data &= ~(u64)0x8; /* ignore TLB cache disable */ - /* Handle McStatusWrEn */ - if (data == BIT_ULL(18)) { - vcpu->arch.msr_hwcr = data; - } else if (data != 0) { + /* + * Allow McStatusWrEn and TscFreqSel. (Linux guests from v3.2 + * through at least v6.6 whine if TscFreqSel is clear, + * depending on F/M/S. + */ + if (data & ~(BIT_ULL(18) | BIT_ULL(24))) { kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } + vcpu->arch.msr_hwcr = data; break; case MSR_FAM10H_MMIO_CONF_BASE: if (data != 0) { @@ -3777,7 +3883,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_TSC: if (msr_info->host_initiated) { - kvm_synchronize_tsc(vcpu, data); + kvm_synchronize_tsc(vcpu, &data); } else { u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset; adjust_tsc_offset_guest(vcpu, adj); @@ -4065,6 +4171,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_AMD64_BU_CFG2: case MSR_IA32_PERF_CTL: case MSR_AMD64_DC_CFG: + case MSR_AMD64_TW_CFG: case MSR_F15H_EX_CFG: /* * Intel Sandy Bridge CPUs must support the RAPL (running average power @@ -5382,26 +5489,37 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, return 0; } -static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, - struct kvm_xsave *guest_xsave) -{ - if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) - return; - - fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, - guest_xsave->region, - sizeof(guest_xsave->region), - vcpu->arch.pkru); -} static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu, u8 *state, unsigned int size) { + /* + * Only copy state for features that are enabled for the guest. The + * state itself isn't problematic, but setting bits in the header for + * features that are supported in *this* host but not exposed to the + * guest can result in KVM_SET_XSAVE failing when live migrating to a + * compatible host without the features that are NOT exposed to the + * guest. + * + * FP+SSE can always be saved/restored via KVM_{G,S}ET_XSAVE, even if + * XSAVE/XCRO are not exposed to the guest, and even if XSAVE isn't + * supported by the host. + */ + u64 supported_xcr0 = vcpu->arch.guest_supported_xcr0 | + XFEATURE_MASK_FPSSE; + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) return; - fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, - state, size, vcpu->arch.pkru); + fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, state, size, + supported_xcr0, vcpu->arch.pkru); +} + +static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, + struct kvm_xsave *guest_xsave) +{ + return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region, + sizeof(guest_xsave->region)); } static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, @@ -5536,6 +5654,7 @@ static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu, tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset; ns = get_kvmclock_base_ns(); + kvm->arch.user_set_tsc = true; __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); @@ -6248,6 +6367,9 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) struct kvm_vcpu *vcpu; unsigned long i; + if (!kvm_x86_ops.cpu_dirty_log_size) + return; + kvm_for_each_vcpu(i, vcpu, kvm) kvm_vcpu_kick(vcpu); } @@ -7474,11 +7596,11 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val, } EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); -static int kvm_can_emulate_insn(struct kvm_vcpu *vcpu, int emul_type, - void *insn, int insn_len) +static int kvm_check_emulate_insn(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len) { - return static_call(kvm_x86_can_emulate_instruction)(vcpu, emul_type, - insn, insn_len); + return static_call(kvm_x86_check_emulate_instruction)(vcpu, emul_type, + insn, insn_len); } int handle_ud(struct kvm_vcpu *vcpu) @@ -7488,8 +7610,10 @@ int handle_ud(struct kvm_vcpu *vcpu) int emul_type = EMULTYPE_TRAP_UD; char sig[5]; /* ud2; .ascii "kvm" */ struct x86_exception e; + int r; - if (unlikely(!kvm_can_emulate_insn(vcpu, emul_type, NULL, 0))) + r = kvm_check_emulate_insn(vcpu, emul_type, NULL, 0); + if (r != X86EMUL_CONTINUE) return 1; if (fep_flags && @@ -8871,8 +8995,14 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; bool writeback = true; - if (unlikely(!kvm_can_emulate_insn(vcpu, emulation_type, insn, insn_len))) - return 1; + r = kvm_check_emulate_insn(vcpu, emulation_type, insn, insn_len); + if (r != X86EMUL_CONTINUE) { + if (r == X86EMUL_RETRY_INSTR || r == X86EMUL_PROPAGATE_FAULT) + return 1; + + WARN_ON_ONCE(r != X86EMUL_UNHANDLEABLE); + return handle_emulation_failure(vcpu, emulation_type); + } vcpu->arch.l1tf_flush_l1d = true; @@ -10576,16 +10706,16 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) record_steal_time(vcpu); + if (kvm_check_request(KVM_REQ_PMU, vcpu)) + kvm_pmu_handle_event(vcpu); + if (kvm_check_request(KVM_REQ_PMI, vcpu)) + kvm_pmu_deliver_pmi(vcpu); #ifdef CONFIG_KVM_SMM if (kvm_check_request(KVM_REQ_SMI, vcpu)) process_smi(vcpu); #endif if (kvm_check_request(KVM_REQ_NMI, vcpu)) process_nmi(vcpu); - if (kvm_check_request(KVM_REQ_PMU, vcpu)) - kvm_pmu_handle_event(vcpu); - if (kvm_check_request(KVM_REQ_PMI, vcpu)) - kvm_pmu_deliver_pmi(vcpu); if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) { BUG_ON(vcpu->arch.pending_ioapic_eoi > 255); if (test_bit(vcpu->arch.pending_ioapic_eoi, @@ -11521,7 +11651,6 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0); - vcpu->arch.cr0 = sregs->cr0; *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4); @@ -11565,8 +11694,10 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) if (ret) return ret; - if (mmu_reset_needed) + if (mmu_reset_needed) { kvm_mmu_reset_context(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + } max_bits = KVM_NR_INTERRUPTS; pending_vec = find_first_bit( @@ -11607,8 +11738,10 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2) mmu_reset_needed = 1; vcpu->arch.pdptrs_from_userspace = true; } - if (mmu_reset_needed) + if (mmu_reset_needed) { kvm_mmu_reset_context(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + } return 0; } @@ -11959,7 +12092,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) if (mutex_lock_killable(&vcpu->mutex)) return; vcpu_load(vcpu); - kvm_synchronize_tsc(vcpu, 0); + kvm_synchronize_tsc(vcpu, NULL); vcpu_put(vcpu); /* poll control enabled by default */ @@ -12315,7 +12448,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) goto out_uninit_mmu; INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); - INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); atomic_set(&kvm->arch.noncoherent_dma_count, 0); /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ @@ -12843,6 +12975,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) return true; #endif + if (kvm_test_request(KVM_REQ_PMI, vcpu)) + return true; + if (kvm_arch_interrupt_allowed(vcpu) && (kvm_cpu_has_interrupt(vcpu) || kvm_guest_apic_has_interrupt(vcpu))) @@ -13188,15 +13323,30 @@ bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device); +static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm) +{ + /* + * Non-coherent DMA assignment and de-assignment will affect + * whether KVM honors guest MTRRs and cause changes in memtypes + * in TDP. + * So, pass %true unconditionally to indicate non-coherent DMA was, + * or will be involved, and that zapping SPTEs might be necessary. + */ + if (__kvm_mmu_honors_guest_mtrrs(true)) + kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL)); +} + void kvm_arch_register_noncoherent_dma(struct kvm *kvm) { - atomic_inc(&kvm->arch.noncoherent_dma_count); + if (atomic_inc_return(&kvm->arch.noncoherent_dma_count) == 1) + kvm_noncoherent_dma_assignment_start_or_stop(kvm); } EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma); void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm) { - atomic_dec(&kvm->arch.noncoherent_dma_count); + if (!atomic_dec_return(&kvm->arch.noncoherent_dma_count)) + kvm_noncoherent_dma_assignment_start_or_stop(kvm); } EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 1e7be1f6ab29..5184fde1dc54 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -293,6 +293,7 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); u64 get_kvmclock_ns(struct kvm *kvm); +uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm); int kvm_read_guest_virt(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 40edf4d1974c..e53fad915a62 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -59,7 +59,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) * This code mirrors kvm_write_wall_clock() except that it writes * directly through the pfn cache and doesn't mark the page dirty. */ - wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm); + wall_nsec = kvm_get_wall_clock_epoch(kvm); /* It could be invalid again already, so we need to check */ read_lock_irq(&gpc->lock); @@ -98,7 +98,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) wc_version = wc->version = (wc->version + 1) | 1; smp_wmb(); - wc->nsec = do_div(wall_nsec, 1000000000); + wc->nsec = do_div(wall_nsec, NSEC_PER_SEC); wc->sec = (u32)wall_nsec; *wc_sec_hi = wall_nsec >> 32; smp_wmb(); @@ -134,9 +134,23 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer) { struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu, arch.xen.timer); + struct kvm_xen_evtchn e; + int rc; + if (atomic_read(&vcpu->arch.xen.timer_pending)) return HRTIMER_NORESTART; + e.vcpu_id = vcpu->vcpu_id; + e.vcpu_idx = vcpu->vcpu_idx; + e.port = vcpu->arch.xen.timer_virq; + e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; + + rc = kvm_xen_set_evtchn_fast(&e, vcpu->kvm); + if (rc != -EWOULDBLOCK) { + vcpu->arch.xen.timer_expires = 0; + return HRTIMER_NORESTART; + } + atomic_inc(&vcpu->arch.xen.timer_pending); kvm_make_request(KVM_REQ_UNBLOCK, vcpu); kvm_vcpu_kick(vcpu); @@ -146,6 +160,14 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer) static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns) { + /* + * Avoid races with the old timer firing. Checking timer_expires + * to avoid calling hrtimer_cancel() will only have false positives + * so is fine. + */ + if (vcpu->arch.xen.timer_expires) + hrtimer_cancel(&vcpu->arch.xen.timer); + atomic_set(&vcpu->arch.xen.timer_pending, 0); vcpu->arch.xen.timer_expires = guest_abs; @@ -1019,9 +1041,36 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; case KVM_XEN_VCPU_ATTR_TYPE_TIMER: + /* + * Ensure a consistent snapshot of state is captured, with a + * timer either being pending, or the event channel delivered + * to the corresponding bit in the shared_info. Not still + * lurking in the timer_pending flag for deferred delivery. + * Purely as an optimisation, if the timer_expires field is + * zero, that means the timer isn't active (or even in the + * timer_pending flag) and there is no need to cancel it. + */ + if (vcpu->arch.xen.timer_expires) { + hrtimer_cancel(&vcpu->arch.xen.timer); + kvm_xen_inject_timer_irqs(vcpu); + } + data->u.timer.port = vcpu->arch.xen.timer_virq; data->u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; data->u.timer.expires_ns = vcpu->arch.xen.timer_expires; + + /* + * The hrtimer may trigger and raise the IRQ immediately, + * while the returned state causes it to be set up and + * raised again on the destination system after migration. + * That's fine, as the guest won't even have had a chance + * to run and handle the interrupt. Asserting an already + * pending event channel is idempotent. + */ + if (vcpu->arch.xen.timer_expires) + hrtimer_start_expires(&vcpu->arch.xen.timer, + HRTIMER_MODE_ABS_HARD); + r = 0; break; @@ -1374,12 +1423,8 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd, return true; } + /* A delta <= 0 results in an immediate callback, which is what we want */ delta = oneshot.timeout_abs_ns - get_kvmclock_ns(vcpu->kvm); - if ((oneshot.flags & VCPU_SSHOTTMR_future) && delta < 0) { - *r = -ETIME; - return true; - } - kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, delta); *r = 0; return true; diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index 23318c338db0..68f7fa3e1322 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S @@ -21,10 +21,10 @@ * converted to pure assembler */ +#include <linux/export.h> #include <linux/linkage.h> #include <asm/errno.h> #include <asm/asm.h> -#include <asm/export.h> #include <asm/nospec-branch.h> /* diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index f74a3e704a1c..2760a15fbc00 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-only */ +#include <linux/export.h> #include <linux/linkage.h> #include <asm/asm.h> -#include <asm/export.h> /* * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S index 49805257b125..873e4ef23e49 100644 --- a/arch/x86/lib/cmpxchg8b_emu.S +++ b/arch/x86/lib/cmpxchg8b_emu.S @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-only */ +#include <linux/export.h> #include <linux/linkage.h> -#include <asm/export.h> #include <asm/percpu.h> #include <asm/processor-flags.h> diff --git a/arch/x86/lib/copy_mc.c b/arch/x86/lib/copy_mc.c index 80efd45a7761..6e8b7e600def 100644 --- a/arch/x86/lib/copy_mc.c +++ b/arch/x86/lib/copy_mc.c @@ -70,23 +70,23 @@ unsigned long __must_check copy_mc_to_kernel(void *dst, const void *src, unsigne } EXPORT_SYMBOL_GPL(copy_mc_to_kernel); -unsigned long __must_check copy_mc_to_user(void *dst, const void *src, unsigned len) +unsigned long __must_check copy_mc_to_user(void __user *dst, const void *src, unsigned len) { unsigned long ret; if (copy_mc_fragile_enabled) { __uaccess_begin(); - ret = copy_mc_fragile(dst, src, len); + ret = copy_mc_fragile((__force void *)dst, src, len); __uaccess_end(); return ret; } if (static_cpu_has(X86_FEATURE_ERMS)) { __uaccess_begin(); - ret = copy_mc_enhanced_fast_string(dst, src, len); + ret = copy_mc_enhanced_fast_string((__force void *)dst, src, len); __uaccess_end(); return ret; } - return copy_user_generic(dst, src, len); + return copy_user_generic((__force void *)dst, src, len); } diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 30ea644bf446..d6ae793d08fa 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -1,10 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */ +#include <linux/export.h> #include <linux/linkage.h> #include <asm/cpufeatures.h> #include <asm/alternative.h> -#include <asm/export.h> /* * Some CPUs run faster using the string copy instructions (sane microcode). diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 0a81aafed7f8..fc9fb5d06174 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -6,11 +6,11 @@ * Functions to copy from and to user space. */ +#include <linux/export.h> #include <linux/linkage.h> #include <asm/cpufeatures.h> #include <asm/alternative.h> #include <asm/asm.h> -#include <asm/export.h> /* * rep_movs_alternative - memory copy with exception handling. diff --git a/arch/x86/lib/copy_user_uncached_64.S b/arch/x86/lib/copy_user_uncached_64.S index 5c5f38d32672..2918e36eece2 100644 --- a/arch/x86/lib/copy_user_uncached_64.S +++ b/arch/x86/lib/copy_user_uncached_64.S @@ -3,9 +3,9 @@ * Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org> */ +#include <linux/export.h> #include <linux/linkage.h> #include <asm/asm.h> -#include <asm/export.h> /* * copy_user_nocache - Uncached memory copy with exception handling diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c index 145f9a0bde29..f4df4d241526 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -14,8 +14,6 @@ * @src: source address (user space) * @dst: destination address * @len: number of bytes to be copied. - * @isum: initial sum that is added into the result (32bit unfolded) - * @errp: set to -EFAULT for an bad source address. * * Returns an 32bit unfolded checksum of the buffer. * src and dst are best aligned to 64bits. @@ -38,8 +36,6 @@ csum_and_copy_from_user(const void __user *src, void *dst, int len) * @src: source address * @dst: destination address (user space) * @len: number of bytes to be copied. - * @isum: initial sum that is added into the result (32bit unfolded) - * @errp: set to -EFAULT for an bad destination address. * * Returns an 32bit unfolded checksum of the buffer. * src and dst are best aligned to 64bits. @@ -62,7 +58,6 @@ csum_and_copy_to_user(const void *src, void __user *dst, int len) * @src: source address * @dst: destination address * @len: number of bytes to be copied. - * @sum: initial sum that is added into the result (32bit unfolded) * * Returns an 32bit unfolded checksum of the buffer. */ diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index 9c63713477bb..20ef350a60fb 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -26,6 +26,7 @@ * as they get called from within inline assembly. */ +#include <linux/export.h> #include <linux/linkage.h> #include <asm/page_types.h> #include <asm/errno.h> @@ -33,7 +34,6 @@ #include <asm/thread_info.h> #include <asm/asm.h> #include <asm/smap.h> -#include <asm/export.h> #define ASM_BARRIER_NOSPEC ALTERNATIVE "", "lfence", X86_FEATURE_LFENCE_RDTSC diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S index 12c16c6aa44a..774bdf3e6f0a 100644 --- a/arch/x86/lib/hweight.S +++ b/arch/x86/lib/hweight.S @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/export.h> #include <linux/linkage.h> -#include <asm/export.h> #include <asm/asm.h> @@ -36,8 +36,12 @@ SYM_FUNC_START(__sw_hweight32) SYM_FUNC_END(__sw_hweight32) EXPORT_SYMBOL(__sw_hweight32) -SYM_FUNC_START(__sw_hweight64) +/* + * No 32-bit variant, because it's implemented as an inline wrapper + * on top of __arch_hweight32(): + */ #ifdef CONFIG_X86_64 +SYM_FUNC_START(__sw_hweight64) pushq %rdi pushq %rdx @@ -66,18 +70,6 @@ SYM_FUNC_START(__sw_hweight64) popq %rdx popq %rdi RET -#else /* CONFIG_X86_32 */ - /* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */ - pushl %ecx - - call __sw_hweight32 - movl %eax, %ecx # stash away result - movl %edx, %eax # second part of input - call __sw_hweight32 - addl %ecx, %eax # result - - popl %ecx - RET -#endif SYM_FUNC_END(__sw_hweight64) EXPORT_SYMBOL(__sw_hweight64) +#endif diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 76697df8dfd5..0ae2e1712e2e 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -1,12 +1,12 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright 2002 Andi Kleen */ +#include <linux/export.h> #include <linux/linkage.h> #include <linux/cfi_types.h> #include <asm/errno.h> #include <asm/cpufeatures.h> #include <asm/alternative.h> -#include <asm/export.h> .section .noinstr.text, "ax" diff --git a/arch/x86/lib/memmove_32.S b/arch/x86/lib/memmove_32.S index 0588b2c0fc95..35010ba3dd6f 100644 --- a/arch/x86/lib/memmove_32.S +++ b/arch/x86/lib/memmove_32.S @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/export.h> #include <linux/linkage.h> -#include <asm/export.h> SYM_FUNC_START(memmove) /* diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index ccdf3a597045..1b60ae81ecd8 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -6,10 +6,10 @@ * This assembly file is re-written from memmove_64.c file. * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com> */ +#include <linux/export.h> #include <linux/linkage.h> #include <asm/cpufeatures.h> #include <asm/alternative.h> -#include <asm/export.h> #undef memmove diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 3d818b849ec6..0199d56cb479 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -1,10 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright 2002 Andi Kleen, SuSE Labs */ +#include <linux/export.h> #include <linux/linkage.h> #include <asm/cpufeatures.h> #include <asm/alternative.h> -#include <asm/export.h> .section .noinstr.text, "ax" diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index 235bbda6fc82..2877f5934177 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -11,13 +11,12 @@ * return an error value in addition to the "real" * return value. */ +#include <linux/export.h> #include <linux/linkage.h> #include <asm/thread_info.h> #include <asm/errno.h> #include <asm/asm.h> #include <asm/smap.h> -#include <asm/export.h> - /* * __put_user_X diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index cd86aeb5fdd3..7b2589877d06 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -1,12 +1,12 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/export.h> #include <linux/stringify.h> #include <linux/linkage.h> #include <asm/dwarf2.h> #include <asm/cpufeatures.h> #include <asm/alternative.h> #include <asm/asm-offsets.h> -#include <asm/export.h> #include <asm/nospec-branch.h> #include <asm/unwind_hints.h> #include <asm/percpu.h> @@ -126,11 +126,19 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array) #include <asm/GEN-for-each-reg.h> #undef GEN #endif + +#ifdef CONFIG_RETHUNK + /* - * This function name is magical and is used by -mfunction-return=thunk-extern - * for the compiler to generate JMPs to it. + * Be careful here: that label cannot really be removed because in + * some configurations and toolchains, the JMP __x86_return_thunk the + * compiler issues is either a short one or the compiler doesn't use + * relocations for same-section JMPs and that breaks the returns + * detection logic in apply_returns() and in objtool. */ -#ifdef CONFIG_RETHUNK + .section .text..__x86.return_thunk + +#ifdef CONFIG_CPU_SRSO /* * srso_alias_untrain_ret() and srso_alias_safe_ret() are placed at @@ -147,29 +155,18 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array) * * As a result, srso_alias_safe_ret() becomes a safe return. */ -#ifdef CONFIG_CPU_SRSO - .section .text..__x86.rethunk_untrain - -SYM_START(srso_alias_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE) + .pushsection .text..__x86.rethunk_untrain +SYM_CODE_START_NOALIGN(srso_alias_untrain_ret) UNWIND_HINT_FUNC ANNOTATE_NOENDBR ASM_NOP2 lfence jmp srso_alias_return_thunk SYM_FUNC_END(srso_alias_untrain_ret) -__EXPORT_THUNK(srso_alias_untrain_ret) - - .section .text..__x86.rethunk_safe -#else -/* dummy definition for alternatives */ -SYM_START(srso_alias_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE) - ANNOTATE_UNRET_SAFE - ret - int3 -SYM_FUNC_END(srso_alias_untrain_ret) -#endif + .popsection -SYM_START(srso_alias_safe_ret, SYM_L_GLOBAL, SYM_A_NONE) + .pushsection .text..__x86.rethunk_safe +SYM_CODE_START_NOALIGN(srso_alias_safe_ret) lea 8(%_ASM_SP), %_ASM_SP UNWIND_HINT_FUNC ANNOTATE_UNRET_SAFE @@ -177,14 +174,63 @@ SYM_START(srso_alias_safe_ret, SYM_L_GLOBAL, SYM_A_NONE) int3 SYM_FUNC_END(srso_alias_safe_ret) - .section .text..__x86.return_thunk - -SYM_CODE_START(srso_alias_return_thunk) +SYM_CODE_START_NOALIGN(srso_alias_return_thunk) UNWIND_HINT_FUNC ANNOTATE_NOENDBR call srso_alias_safe_ret ud2 SYM_CODE_END(srso_alias_return_thunk) + .popsection + +/* + * SRSO untraining sequence for Zen1/2, similar to retbleed_untrain_ret() + * above. On kernel entry, srso_untrain_ret() is executed which is a + * + * movabs $0xccccc30824648d48,%rax + * + * and when the return thunk executes the inner label srso_safe_ret() + * later, it is a stack manipulation and a RET which is mispredicted and + * thus a "safe" one to use. + */ + .align 64 + .skip 64 - (srso_safe_ret - srso_untrain_ret), 0xcc +SYM_CODE_START_LOCAL_NOALIGN(srso_untrain_ret) + ANNOTATE_NOENDBR + .byte 0x48, 0xb8 + +/* + * This forces the function return instruction to speculate into a trap + * (UD2 in srso_return_thunk() below). This RET will then mispredict + * and execution will continue at the return site read from the top of + * the stack. + */ +SYM_INNER_LABEL(srso_safe_ret, SYM_L_GLOBAL) + lea 8(%_ASM_SP), %_ASM_SP + ret + int3 + int3 + /* end of movabs */ + lfence + call srso_safe_ret + ud2 +SYM_CODE_END(srso_safe_ret) +SYM_FUNC_END(srso_untrain_ret) + +SYM_CODE_START(srso_return_thunk) + UNWIND_HINT_FUNC + ANNOTATE_NOENDBR + call srso_safe_ret + ud2 +SYM_CODE_END(srso_return_thunk) + +#define JMP_SRSO_UNTRAIN_RET "jmp srso_untrain_ret" +#define JMP_SRSO_ALIAS_UNTRAIN_RET "jmp srso_alias_untrain_ret" +#else /* !CONFIG_CPU_SRSO */ +#define JMP_SRSO_UNTRAIN_RET "ud2" +#define JMP_SRSO_ALIAS_UNTRAIN_RET "ud2" +#endif /* CONFIG_CPU_SRSO */ + +#ifdef CONFIG_CPU_UNRET_ENTRY /* * Some generic notes on the untraining sequences: @@ -216,7 +262,7 @@ SYM_CODE_END(srso_alias_return_thunk) */ .align 64 .skip 64 - (retbleed_return_thunk - retbleed_untrain_ret), 0xcc -SYM_START(retbleed_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE) +SYM_CODE_START_LOCAL_NOALIGN(retbleed_untrain_ret) ANNOTATE_NOENDBR /* * As executed from retbleed_untrain_ret, this is: @@ -264,72 +310,27 @@ SYM_CODE_END(retbleed_return_thunk) jmp retbleed_return_thunk int3 SYM_FUNC_END(retbleed_untrain_ret) -__EXPORT_THUNK(retbleed_untrain_ret) -/* - * SRSO untraining sequence for Zen1/2, similar to retbleed_untrain_ret() - * above. On kernel entry, srso_untrain_ret() is executed which is a - * - * movabs $0xccccc30824648d48,%rax - * - * and when the return thunk executes the inner label srso_safe_ret() - * later, it is a stack manipulation and a RET which is mispredicted and - * thus a "safe" one to use. - */ - .align 64 - .skip 64 - (srso_safe_ret - srso_untrain_ret), 0xcc -SYM_START(srso_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE) - ANNOTATE_NOENDBR - .byte 0x48, 0xb8 +#define JMP_RETBLEED_UNTRAIN_RET "jmp retbleed_untrain_ret" +#else /* !CONFIG_CPU_UNRET_ENTRY */ +#define JMP_RETBLEED_UNTRAIN_RET "ud2" +#endif /* CONFIG_CPU_UNRET_ENTRY */ -/* - * This forces the function return instruction to speculate into a trap - * (UD2 in srso_return_thunk() below). This RET will then mispredict - * and execution will continue at the return site read from the top of - * the stack. - */ -SYM_INNER_LABEL(srso_safe_ret, SYM_L_GLOBAL) - lea 8(%_ASM_SP), %_ASM_SP - ret - int3 - int3 - /* end of movabs */ - lfence - call srso_safe_ret - ud2 -SYM_CODE_END(srso_safe_ret) -SYM_FUNC_END(srso_untrain_ret) -__EXPORT_THUNK(srso_untrain_ret) - -SYM_CODE_START(srso_return_thunk) - UNWIND_HINT_FUNC - ANNOTATE_NOENDBR - call srso_safe_ret - ud2 -SYM_CODE_END(srso_return_thunk) +#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO) SYM_FUNC_START(entry_untrain_ret) - ALTERNATIVE_2 "jmp retbleed_untrain_ret", \ - "jmp srso_untrain_ret", X86_FEATURE_SRSO, \ - "jmp srso_alias_untrain_ret", X86_FEATURE_SRSO_ALIAS + ALTERNATIVE_2 JMP_RETBLEED_UNTRAIN_RET, \ + JMP_SRSO_UNTRAIN_RET, X86_FEATURE_SRSO, \ + JMP_SRSO_ALIAS_UNTRAIN_RET, X86_FEATURE_SRSO_ALIAS SYM_FUNC_END(entry_untrain_ret) __EXPORT_THUNK(entry_untrain_ret) -SYM_CODE_START(__x86_return_thunk) - UNWIND_HINT_FUNC - ANNOTATE_NOENDBR - ANNOTATE_UNRET_SAFE - ret - int3 -SYM_CODE_END(__x86_return_thunk) -EXPORT_SYMBOL(__x86_return_thunk) - -#endif /* CONFIG_RETHUNK */ +#endif /* CONFIG_CPU_UNRET_ENTRY || CONFIG_CPU_SRSO */ #ifdef CONFIG_CALL_DEPTH_TRACKING .align 64 -SYM_FUNC_START(__x86_return_skl) +SYM_FUNC_START(call_depth_return_thunk) ANNOTATE_NOENDBR /* * Keep the hotpath in a 16byte I-fetch for the non-debug @@ -356,6 +357,33 @@ SYM_FUNC_START(__x86_return_skl) ANNOTATE_UNRET_SAFE ret int3 -SYM_FUNC_END(__x86_return_skl) +SYM_FUNC_END(call_depth_return_thunk) #endif /* CONFIG_CALL_DEPTH_TRACKING */ + +/* + * This function name is magical and is used by -mfunction-return=thunk-extern + * for the compiler to generate JMPs to it. + * + * This code is only used during kernel boot or module init. All + * 'JMP __x86_return_thunk' sites are changed to something else by + * apply_returns(). + * + * This should be converted eventually to call a warning function which + * should scream loudly when the default return thunk is called after + * alternatives have been applied. + * + * That warning function cannot BUG() because the bug splat cannot be + * displayed in all possible configurations, leading to users not really + * knowing why the machine froze. + */ +SYM_CODE_START(__x86_return_thunk) + UNWIND_HINT_FUNC + ANNOTATE_NOENDBR + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_CODE_END(__x86_return_thunk) +EXPORT_SYMBOL(__x86_return_thunk) + +#endif /* CONFIG_RETHUNK */ diff --git a/arch/x86/mm/maccess.c b/arch/x86/mm/maccess.c index 5a53c2cc169c..6993f026adec 100644 --- a/arch/x86/mm/maccess.c +++ b/arch/x86/mm/maccess.c @@ -9,12 +9,21 @@ bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) unsigned long vaddr = (unsigned long)unsafe_src; /* - * Range covering the highest possible canonical userspace address - * as well as non-canonical address range. For the canonical range - * we also need to include the userspace guard page. + * Do not allow userspace addresses. This disallows + * normal userspace and the userspace guard page: */ - return vaddr >= TASK_SIZE_MAX + PAGE_SIZE && - __is_canonical_address(vaddr, boot_cpu_data.x86_virt_bits); + if (vaddr < TASK_SIZE_MAX + PAGE_SIZE) + return false; + + /* + * Allow everything during early boot before 'x86_virt_bits' + * is initialized. Needed for instruction decoding in early + * exception handlers. + */ + if (!boot_cpu_data.x86_virt_bits) + return true; + + return __is_canonical_address(vaddr, boot_cpu_data.x86_virt_bits); } #else bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 9f27e14e185f..c290c55b632b 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -12,6 +12,7 @@ #include <linux/swiotlb.h> #include <linux/cc_platform.h> #include <linux/mem_encrypt.h> +#include <linux/virtio_anchor.h> /* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */ bool force_dma_unencrypted(struct device *dev) @@ -86,3 +87,36 @@ void __init mem_encrypt_init(void) print_mem_encrypt_feature_info(); } + +void __init mem_encrypt_setup_arch(void) +{ + phys_addr_t total_mem = memblock_phys_mem_size(); + unsigned long size; + + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) + return; + + /* + * For SEV and TDX, all DMA has to occur via shared/unencrypted pages. + * Kernel uses SWIOTLB to make this happen without changing device + * drivers. However, depending on the workload being run, the + * default 64MB of SWIOTLB may not be enough and SWIOTLB may + * run out of buffers for DMA, resulting in I/O errors and/or + * performance degradation especially with high I/O workloads. + * + * Adjust the default size of SWIOTLB using a percentage of guest + * memory for SWIOTLB buffers. Also, as the SWIOTLB bounce buffer + * memory is allocated from low memory, ensure that the adjusted size + * is within the limits of low available memory. + * + * The percentage of guest memory used here for SWIOTLB buffers + * is more of an approximation of the static adjustment which + * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6% + */ + size = total_mem * 6 / 100; + size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G); + swiotlb_adjust_size(size); + + /* Set restricted memory access for virtio. */ + virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); +} diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index 6faea41e99b6..a68f2dda0948 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -19,8 +19,6 @@ #include <linux/kernel.h> #include <linux/bitops.h> #include <linux/dma-mapping.h> -#include <linux/virtio_config.h> -#include <linux/virtio_anchor.h> #include <linux/cc_platform.h> #include <asm/tlbflush.h> @@ -215,40 +213,6 @@ void __init sme_map_bootdata(char *real_mode_data) __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true); } -void __init sev_setup_arch(void) -{ - phys_addr_t total_mem = memblock_phys_mem_size(); - unsigned long size; - - if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) - return; - - /* - * For SEV, all DMA has to occur via shared/unencrypted pages. - * SEV uses SWIOTLB to make this happen without changing device - * drivers. However, depending on the workload being run, the - * default 64MB of SWIOTLB may not be enough and SWIOTLB may - * run out of buffers for DMA, resulting in I/O errors and/or - * performance degradation especially with high I/O workloads. - * - * Adjust the default size of SWIOTLB for SEV guests using - * a percentage of guest memory for SWIOTLB buffers. - * Also, as the SWIOTLB bounce buffer memory is allocated - * from low memory, ensure that the adjusted size is within - * the limits of low available memory. - * - * The percentage of guest memory used here for SWIOTLB buffers - * is more of an approximation of the static adjustment which - * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6% - */ - size = total_mem * 6 / 100; - size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G); - swiotlb_adjust_size(size); - - /* Set restricted memory access for virtio. */ - virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); -} - static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot) { unsigned long pfn = 0; diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 2aadb2019b4f..b29ceb19e46e 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -3,6 +3,7 @@ #include <linux/acpi.h> #include <linux/kernel.h> #include <linux/mm.h> +#include <linux/of.h> #include <linux/string.h> #include <linux/init.h> #include <linux/memblock.h> @@ -11,6 +12,7 @@ #include <linux/nodemask.h> #include <linux/sched.h> #include <linux/topology.h> +#include <linux/sort.h> #include <asm/e820/api.h> #include <asm/proto.h> @@ -56,7 +58,7 @@ s16 __apicid_to_node[MAX_LOCAL_APIC] = { int numa_cpu_node(int cpu) { - int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); + u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu); if (apicid != BAD_APICID) return __apicid_to_node[apicid]; @@ -601,13 +603,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) if (start >= end) continue; - /* - * Don't confuse VM with a node that doesn't have the - * minimum amount of memory: - */ - if (end && (end - start) < NODE_MIN_SIZE) - continue; - alloc_node_data(nid); } @@ -733,6 +728,8 @@ void __init x86_numa_init(void) if (!numa_init(amd_numa_init)) return; #endif + if (acpi_disabled && !numa_init(of_numa_init)) + return; } numa_init(dummy_numa_init); @@ -786,7 +783,7 @@ void __init init_gi_nodes(void) void __init init_cpu_to_node(void) { int cpu; - u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); + u32 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); BUG_ON(cpu_to_apicid == NULL); @@ -961,4 +958,83 @@ int memory_add_physaddr_to_nid(u64 start) return nid; } EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); + +static int __init cmp_memblk(const void *a, const void *b) +{ + const struct numa_memblk *ma = *(const struct numa_memblk **)a; + const struct numa_memblk *mb = *(const struct numa_memblk **)b; + + return ma->start - mb->start; +} + +static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata; + +/** + * numa_fill_memblks - Fill gaps in numa_meminfo memblks + * @start: address to begin fill + * @end: address to end fill + * + * Find and extend numa_meminfo memblks to cover the @start-@end + * physical address range, such that the first memblk includes + * @start, the last memblk includes @end, and any gaps in between + * are filled. + * + * RETURNS: + * 0 : Success + * NUMA_NO_MEMBLK : No memblk exists in @start-@end range + */ + +int __init numa_fill_memblks(u64 start, u64 end) +{ + struct numa_memblk **blk = &numa_memblk_list[0]; + struct numa_meminfo *mi = &numa_meminfo; + int count = 0; + u64 prev_end; + + /* + * Create a list of pointers to numa_meminfo memblks that + * overlap start, end. Exclude (start == bi->end) since + * end addresses in both a CFMWS range and a memblk range + * are exclusive. + * + * This list of pointers is used to make in-place changes + * that fill out the numa_meminfo memblks. + */ + for (int i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *bi = &mi->blk[i]; + + if (start < bi->end && end >= bi->start) { + blk[count] = &mi->blk[i]; + count++; + } + } + if (!count) + return NUMA_NO_MEMBLK; + + /* Sort the list of pointers in memblk->start order */ + sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL); + + /* Make sure the first/last memblks include start/end */ + blk[0]->start = min(blk[0]->start, start); + blk[count - 1]->end = max(blk[count - 1]->end, end); + + /* + * Fill any gaps by tracking the previous memblks + * end address and backfilling to it if needed. + */ + prev_end = blk[0]->end; + for (int i = 1; i < count; i++) { + struct numa_memblk *curr = blk[i]; + + if (prev_end >= curr->start) { + if (prev_end < curr->end) + prev_end = curr->end; + } else { + curr->start = prev_end; + prev_end = curr->end; + } + } + return 0; +} + #endif diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 9deadf517f14..0cbc1b8e8e3d 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -76,6 +76,9 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) #if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { + struct ptdesc *ptdesc = virt_to_ptdesc(pud); + + pagetable_pud_dtor(ptdesc); paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); paravirt_tlb_remove_table(tlb, virt_to_page(pud)); } diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 78414c6d1b5e..5dd733944629 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -69,6 +69,7 @@ static void __init pti_print_if_secure(const char *reason) pr_info("%s\n", reason); } +/* Assume mode is auto unless overridden via cmdline below. */ static enum pti_mode { PTI_AUTO = 0, PTI_FORCE_OFF, @@ -77,50 +78,49 @@ static enum pti_mode { void __init pti_check_boottime_disable(void) { - char arg[5]; - int ret; - - /* Assume mode is auto unless overridden. */ - pti_mode = PTI_AUTO; - if (hypervisor_is_type(X86_HYPER_XEN_PV)) { pti_mode = PTI_FORCE_OFF; pti_print_if_insecure("disabled on XEN PV."); return; } - ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg)); - if (ret > 0) { - if (ret == 3 && !strncmp(arg, "off", 3)) { - pti_mode = PTI_FORCE_OFF; - pti_print_if_insecure("disabled on command line."); - return; - } - if (ret == 2 && !strncmp(arg, "on", 2)) { - pti_mode = PTI_FORCE_ON; - pti_print_if_secure("force enabled on command line."); - goto enable; - } - if (ret == 4 && !strncmp(arg, "auto", 4)) { - pti_mode = PTI_AUTO; - goto autosel; - } - } - - if (cmdline_find_option_bool(boot_command_line, "nopti") || - cpu_mitigations_off()) { + if (cpu_mitigations_off()) pti_mode = PTI_FORCE_OFF; + if (pti_mode == PTI_FORCE_OFF) { pti_print_if_insecure("disabled on command line."); return; } -autosel: - if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + if (pti_mode == PTI_FORCE_ON) + pti_print_if_secure("force enabled on command line."); + + if (pti_mode == PTI_AUTO && !boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) return; -enable: + setup_force_cpu_cap(X86_FEATURE_PTI); } +static int __init pti_parse_cmdline(char *arg) +{ + if (!strcmp(arg, "off")) + pti_mode = PTI_FORCE_OFF; + else if (!strcmp(arg, "on")) + pti_mode = PTI_FORCE_ON; + else if (!strcmp(arg, "auto")) + pti_mode = PTI_AUTO; + else + return -EINVAL; + return 0; +} +early_param("pti", pti_parse_cmdline); + +static int __init pti_parse_cmdline_nopti(char *arg) +{ + pti_mode = PTI_FORCE_OFF; + return 0; +} +early_param("nopti", pti_parse_cmdline_nopti); + pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) { /* diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index a5930042139d..8c10d9abc239 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -16,6 +16,9 @@ #include <asm/set_memory.h> #include <asm/nospec-branch.h> #include <asm/text-patching.h> +#include <asm/unwind.h> + +static bool all_callee_regs_used[4] = {true, true, true, true}; static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) { @@ -255,6 +258,14 @@ struct jit_context { /* Number of bytes that will be skipped on tailcall */ #define X86_TAIL_CALL_OFFSET (11 + ENDBR_INSN_SIZE) +static void push_r12(u8 **pprog) +{ + u8 *prog = *pprog; + + EMIT2(0x41, 0x54); /* push r12 */ + *pprog = prog; +} + static void push_callee_regs(u8 **pprog, bool *callee_regs_used) { u8 *prog = *pprog; @@ -270,6 +281,14 @@ static void push_callee_regs(u8 **pprog, bool *callee_regs_used) *pprog = prog; } +static void pop_r12(u8 **pprog) +{ + u8 *prog = *pprog; + + EMIT2(0x41, 0x5C); /* pop r12 */ + *pprog = prog; +} + static void pop_callee_regs(u8 **pprog, bool *callee_regs_used) { u8 *prog = *pprog; @@ -291,7 +310,8 @@ static void pop_callee_regs(u8 **pprog, bool *callee_regs_used) * while jumping to another program */ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, - bool tail_call_reachable, bool is_subprog) + bool tail_call_reachable, bool is_subprog, + bool is_exception_cb) { u8 *prog = *pprog; @@ -303,12 +323,30 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, prog += X86_PATCH_SIZE; if (!ebpf_from_cbpf) { if (tail_call_reachable && !is_subprog) + /* When it's the entry of the whole tailcall context, + * zeroing rax means initialising tail_call_cnt. + */ EMIT2(0x31, 0xC0); /* xor eax, eax */ else + /* Keep the same instruction layout. */ EMIT2(0x66, 0x90); /* nop2 */ } - EMIT1(0x55); /* push rbp */ - EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ + /* Exception callback receives FP as third parameter */ + if (is_exception_cb) { + EMIT3(0x48, 0x89, 0xF4); /* mov rsp, rsi */ + EMIT3(0x48, 0x89, 0xD5); /* mov rbp, rdx */ + /* The main frame must have exception_boundary as true, so we + * first restore those callee-saved regs from stack, before + * reusing the stack frame. + */ + pop_callee_regs(&prog, all_callee_regs_used); + pop_r12(&prog); + /* Reset the stack frame. */ + EMIT3(0x48, 0x89, 0xEC); /* mov rsp, rbp */ + } else { + EMIT1(0x55); /* push rbp */ + EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ + } /* X86_TAIL_CALL_OFFSET is here */ EMIT_ENDBR(); @@ -467,7 +505,8 @@ static void emit_return(u8 **pprog, u8 *ip) * goto *(prog->bpf_func + prologue_size); * out: */ -static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, +static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog, + u8 **pprog, bool *callee_regs_used, u32 stack_depth, u8 *ip, struct jit_context *ctx) { @@ -517,7 +556,12 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, offset = ctx->tail_call_indirect_label - (prog + 2 - start); EMIT2(X86_JE, offset); /* je out */ - pop_callee_regs(&prog, callee_regs_used); + if (bpf_prog->aux->exception_boundary) { + pop_callee_regs(&prog, all_callee_regs_used); + pop_r12(&prog); + } else { + pop_callee_regs(&prog, callee_regs_used); + } EMIT1(0x58); /* pop rax */ if (stack_depth) @@ -541,7 +585,8 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, *pprog = prog; } -static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke, +static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog, + struct bpf_jit_poke_descriptor *poke, u8 **pprog, u8 *ip, bool *callee_regs_used, u32 stack_depth, struct jit_context *ctx) @@ -570,7 +615,13 @@ static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke, emit_jump(&prog, (u8 *)poke->tailcall_target + X86_PATCH_SIZE, poke->tailcall_bypass); - pop_callee_regs(&prog, callee_regs_used); + if (bpf_prog->aux->exception_boundary) { + pop_callee_regs(&prog, all_callee_regs_used); + pop_r12(&prog); + } else { + pop_callee_regs(&prog, callee_regs_used); + } + EMIT1(0x58); /* pop rax */ if (stack_depth) EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8)); @@ -1018,6 +1069,10 @@ static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op) #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp))) +/* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */ +#define RESTORE_TAIL_CALL_CNT(stack) \ + EMIT3_off32(0x48, 0x8B, 0x85, -round_up(stack, 8) - 8) + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image, int oldproglen, struct jit_context *ctx, bool jmp_padding) { @@ -1041,8 +1096,20 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image emit_prologue(&prog, bpf_prog->aux->stack_depth, bpf_prog_was_classic(bpf_prog), tail_call_reachable, - bpf_prog->aux->func_idx != 0); - push_callee_regs(&prog, callee_regs_used); + bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb); + /* Exception callback will clobber callee regs for its own use, and + * restore the original callee regs from main prog's stack frame. + */ + if (bpf_prog->aux->exception_boundary) { + /* We also need to save r12, which is not mapped to any BPF + * register, as we throw after entry into the kernel, which may + * overwrite r12. + */ + push_r12(&prog); + push_callee_regs(&prog, all_callee_regs_used); + } else { + push_callee_regs(&prog, callee_regs_used); + } ilen = prog - temp; if (rw_image) @@ -1623,9 +1690,7 @@ st: if (is_imm8(insn->off)) func = (u8 *) __bpf_call_base + imm32; if (tail_call_reachable) { - /* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */ - EMIT3_off32(0x48, 0x8B, 0x85, - -round_up(bpf_prog->aux->stack_depth, 8) - 8); + RESTORE_TAIL_CALL_CNT(bpf_prog->aux->stack_depth); if (!imm32) return -EINVAL; offs = 7 + x86_call_depth_emit_accounting(&prog, func); @@ -1641,13 +1706,15 @@ st: if (is_imm8(insn->off)) case BPF_JMP | BPF_TAIL_CALL: if (imm32) - emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1], + emit_bpf_tail_call_direct(bpf_prog, + &bpf_prog->aux->poke_tab[imm32 - 1], &prog, image + addrs[i - 1], callee_regs_used, bpf_prog->aux->stack_depth, ctx); else - emit_bpf_tail_call_indirect(&prog, + emit_bpf_tail_call_indirect(bpf_prog, + &prog, callee_regs_used, bpf_prog->aux->stack_depth, image + addrs[i - 1], @@ -1900,7 +1967,12 @@ emit_jmp: seen_exit = true; /* Update cleanup_addr */ ctx->cleanup_addr = proglen; - pop_callee_regs(&prog, callee_regs_used); + if (bpf_prog->aux->exception_boundary) { + pop_callee_regs(&prog, all_callee_regs_used); + pop_r12(&prog); + } else { + pop_callee_regs(&prog, callee_regs_used); + } EMIT1(0xC9); /* leave */ emit_return(&prog, image + addrs[i - 1] + (prog - temp)); break; @@ -2400,6 +2472,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i * [ ... ] * [ stack_arg2 ] * RBP - arg_stack_off [ stack_arg1 ] + * RSP [ tail_call_cnt ] BPF_TRAMP_F_TAIL_CALL_CTX */ /* room for return value of orig_call or fentry prog */ @@ -2464,6 +2537,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i else /* sub rsp, stack_size */ EMIT4(0x48, 0x83, 0xEC, stack_size); + if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) + EMIT1(0x50); /* push rax */ /* mov QWORD PTR [rbp - rbx_off], rbx */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_6, -rbx_off); @@ -2516,9 +2591,15 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i restore_regs(m, &prog, regs_off); save_args(m, &prog, arg_stack_off, true); + if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) + /* Before calling the original function, restore the + * tail_call_cnt from stack to rax. + */ + RESTORE_TAIL_CALL_CNT(stack_size); + if (flags & BPF_TRAMP_F_ORIG_STACK) { - emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8); - EMIT2(0xff, 0xd0); /* call *rax */ + emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, 8); + EMIT2(0xff, 0xd3); /* call *rbx */ } else { /* call original function */ if (emit_rsb_call(&prog, orig_call, prog)) { @@ -2569,7 +2650,12 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i ret = -EINVAL; goto cleanup; } - } + } else if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) + /* Before running the original function, restore the + * tail_call_cnt from stack to rax. + */ + RESTORE_TAIL_CALL_CNT(stack_size); + /* restore return value of orig_call or fentry prog back into RAX */ if (save_ret) emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); @@ -2913,3 +2999,29 @@ void bpf_jit_free(struct bpf_prog *prog) bpf_prog_unlock_free(prog); } + +bool bpf_jit_supports_exceptions(void) +{ + /* We unwind through both kernel frames (starting from within bpf_throw + * call) and BPF frames. Therefore we require ORC unwinder to be enabled + * to walk kernel frames and reach BPF frames in the stack trace. + */ + return IS_ENABLED(CONFIG_UNWINDER_ORC); +} + +void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie) +{ +#if defined(CONFIG_UNWINDER_ORC) + struct unwind_state state; + unsigned long addr; + + for (unwind_start(&state, current, NULL, NULL); !unwind_done(&state); + unwind_next_frame(&state)) { + addr = unwind_get_return_address(&state); + if (!addr || !consume_fn(cookie, (u64)addr, (u64)state.sp, (u64)state.bp)) + break; + } + return; +#endif + WARN(1, "verification of programs using bpf_throw should have failed\n"); +} diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index e3ec02e6ac9f..f347c20247d3 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -3,9 +3,11 @@ * Exceptions for specific devices. Usually work-arounds for fatal design flaws. */ +#include <linux/bitfield.h> #include <linux/delay.h> #include <linux/dmi.h> #include <linux/pci.h> +#include <linux/suspend.h> #include <linux/vgaarb.h> #include <asm/amd_nb.h> #include <asm/hpet.h> @@ -904,3 +906,60 @@ static void chromeos_fixup_apl_pci_l1ss_capability(struct pci_dev *dev) } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x5ad6, chromeos_save_apl_pci_l1ss_capability); DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_INTEL, 0x5ad6, chromeos_fixup_apl_pci_l1ss_capability); + +#ifdef CONFIG_SUSPEND +/* + * Root Ports on some AMD SoCs advertise PME_Support for D3hot and D3cold, but + * if the SoC is put into a hardware sleep state by the amd-pmc driver, the + * Root Ports don't generate wakeup interrupts for USB devices. + * + * When suspending, remove D3hot and D3cold from the PME_Support advertised + * by the Root Port so we don't use those states if we're expecting wakeup + * interrupts. Restore the advertised PME_Support when resuming. + */ +static void amd_rp_pme_suspend(struct pci_dev *dev) +{ + struct pci_dev *rp; + + /* + * PM_SUSPEND_ON means we're doing runtime suspend, which means + * amd-pmc will not be involved so PMEs during D3 work as advertised. + * + * The PMEs *do* work if amd-pmc doesn't put the SoC in the hardware + * sleep state, but we assume amd-pmc is always present. + */ + if (pm_suspend_target_state == PM_SUSPEND_ON) + return; + + rp = pcie_find_root_port(dev); + if (!rp->pm_cap) + return; + + rp->pme_support &= ~((PCI_PM_CAP_PME_D3hot|PCI_PM_CAP_PME_D3cold) >> + PCI_PM_CAP_PME_SHIFT); + dev_info_once(&rp->dev, "quirk: disabling D3cold for suspend\n"); +} + +static void amd_rp_pme_resume(struct pci_dev *dev) +{ + struct pci_dev *rp; + u16 pmc; + + rp = pcie_find_root_port(dev); + if (!rp->pm_cap) + return; + + pci_read_config_word(rp, rp->pm_cap + PCI_PM_PMC, &pmc); + rp->pme_support = FIELD_GET(PCI_PM_CAP_PME_MASK, pmc); +} +/* Rembrandt (yellow_carp) */ +DECLARE_PCI_FIXUP_SUSPEND(PCI_VENDOR_ID_AMD, 0x162e, amd_rp_pme_suspend); +DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x162e, amd_rp_pme_resume); +DECLARE_PCI_FIXUP_SUSPEND(PCI_VENDOR_ID_AMD, 0x162f, amd_rp_pme_suspend); +DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x162f, amd_rp_pme_resume); +/* Phoenix (pink_sardine) */ +DECLARE_PCI_FIXUP_SUSPEND(PCI_VENDOR_ID_AMD, 0x1668, amd_rp_pme_suspend); +DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x1668, amd_rp_pme_resume); +DECLARE_PCI_FIXUP_SUSPEND(PCI_VENDOR_ID_AMD, 0x1669, amd_rp_pme_suspend); +DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x1669, amd_rp_pme_resume); +#endif /* CONFIG_SUSPEND */ diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index 45d0c17ce77c..e03207de2880 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -17,6 +17,7 @@ #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/slab.h> +#include <linux/string.h> #include <linux/clocksource.h> #include <asm/apic.h> @@ -178,49 +179,56 @@ module_param_named(debug, uv_nmi_debug, int, 0644); } while (0) /* Valid NMI Actions */ -#define ACTION_LEN 16 -static struct nmi_action { - char *action; - char *desc; -} valid_acts[] = { - { "kdump", "do kernel crash dump" }, - { "dump", "dump process stack for each cpu" }, - { "ips", "dump Inst Ptr info for each cpu" }, - { "kdb", "enter KDB (needs kgdboc= assignment)" }, - { "kgdb", "enter KGDB (needs gdb target remote)" }, - { "health", "check if CPUs respond to NMI" }, +enum action_t { + nmi_act_kdump, + nmi_act_dump, + nmi_act_ips, + nmi_act_kdb, + nmi_act_kgdb, + nmi_act_health, + nmi_act_max }; -typedef char action_t[ACTION_LEN]; -static action_t uv_nmi_action = { "dump" }; + +static const char * const actions[nmi_act_max] = { + [nmi_act_kdump] = "kdump", + [nmi_act_dump] = "dump", + [nmi_act_ips] = "ips", + [nmi_act_kdb] = "kdb", + [nmi_act_kgdb] = "kgdb", + [nmi_act_health] = "health", +}; + +static const char * const actions_desc[nmi_act_max] = { + [nmi_act_kdump] = "do kernel crash dump", + [nmi_act_dump] = "dump process stack for each cpu", + [nmi_act_ips] = "dump Inst Ptr info for each cpu", + [nmi_act_kdb] = "enter KDB (needs kgdboc= assignment)", + [nmi_act_kgdb] = "enter KGDB (needs gdb target remote)", + [nmi_act_health] = "check if CPUs respond to NMI", +}; + +static enum action_t uv_nmi_action = nmi_act_dump; static int param_get_action(char *buffer, const struct kernel_param *kp) { - return sprintf(buffer, "%s\n", uv_nmi_action); + return sprintf(buffer, "%s\n", actions[uv_nmi_action]); } static int param_set_action(const char *val, const struct kernel_param *kp) { - int i; - int n = ARRAY_SIZE(valid_acts); - char arg[ACTION_LEN]; - - /* (remove possible '\n') */ - strscpy(arg, val, strnchrnul(val, sizeof(arg)-1, '\n') - val + 1); - - for (i = 0; i < n; i++) - if (!strcmp(arg, valid_acts[i].action)) - break; + int i, n = ARRAY_SIZE(actions); - if (i < n) { - strscpy(uv_nmi_action, arg, sizeof(uv_nmi_action)); - pr_info("UV: New NMI action:%s\n", uv_nmi_action); + i = sysfs_match_string(actions, val); + if (i >= 0) { + uv_nmi_action = i; + pr_info("UV: New NMI action:%s\n", actions[i]); return 0; } - pr_err("UV: Invalid NMI action:%s, valid actions are:\n", arg); + pr_err("UV: Invalid NMI action. Valid actions are:\n"); for (i = 0; i < n; i++) - pr_err("UV: %-8s - %s\n", - valid_acts[i].action, valid_acts[i].desc); + pr_err("UV: %-8s - %s\n", actions[i], actions_desc[i]); + return -EINVAL; } @@ -228,15 +236,10 @@ static const struct kernel_param_ops param_ops_action = { .get = param_get_action, .set = param_set_action, }; -#define param_check_action(name, p) __param_check(name, p, action_t) +#define param_check_action(name, p) __param_check(name, p, enum action_t) module_param_named(action, uv_nmi_action, action, 0644); -static inline bool uv_nmi_action_is(const char *action) -{ - return (strncmp(uv_nmi_action, action, strlen(action)) == 0); -} - /* Setup which NMI support is present in system */ static void uv_nmi_setup_mmrs(void) { @@ -727,10 +730,10 @@ static void uv_nmi_dump_state_cpu(int cpu, struct pt_regs *regs) if (cpu == 0) uv_nmi_dump_cpu_ip_hdr(); - if (current->pid != 0 || !uv_nmi_action_is("ips")) + if (current->pid != 0 || uv_nmi_action != nmi_act_ips) uv_nmi_dump_cpu_ip(cpu, regs); - if (uv_nmi_action_is("dump")) { + if (uv_nmi_action == nmi_act_dump) { pr_info("UV:%sNMI process trace for CPU %d\n", dots, cpu); show_regs(regs); } @@ -798,7 +801,7 @@ static void uv_nmi_dump_state(int cpu, struct pt_regs *regs, int master) int saved_console_loglevel = console_loglevel; pr_alert("UV: tracing %s for %d CPUs from CPU %d\n", - uv_nmi_action_is("ips") ? "IPs" : "processes", + uv_nmi_action == nmi_act_ips ? "IPs" : "processes", atomic_read(&uv_nmi_cpus_in_nmi), cpu); console_loglevel = uv_nmi_loglevel; @@ -874,7 +877,7 @@ static inline int uv_nmi_kdb_reason(void) static inline int uv_nmi_kdb_reason(void) { /* Ensure user is expecting to attach gdb remote */ - if (uv_nmi_action_is("kgdb")) + if (uv_nmi_action == nmi_act_kgdb) return 0; pr_err("UV: NMI error: KDB is not enabled in this kernel\n"); @@ -950,28 +953,35 @@ static int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) master = (atomic_read(&uv_nmi_cpu) == cpu); /* If NMI action is "kdump", then attempt to do it */ - if (uv_nmi_action_is("kdump")) { + if (uv_nmi_action == nmi_act_kdump) { uv_nmi_kdump(cpu, master, regs); /* Unexpected return, revert action to "dump" */ if (master) - strscpy(uv_nmi_action, "dump", sizeof(uv_nmi_action)); + uv_nmi_action = nmi_act_dump; } /* Pause as all CPU's enter the NMI handler */ uv_nmi_wait(master); /* Process actions other than "kdump": */ - if (uv_nmi_action_is("health")) { + switch (uv_nmi_action) { + case nmi_act_health: uv_nmi_action_health(cpu, regs, master); - } else if (uv_nmi_action_is("ips") || uv_nmi_action_is("dump")) { + break; + case nmi_act_ips: + case nmi_act_dump: uv_nmi_dump_state(cpu, regs, master); - } else if (uv_nmi_action_is("kdb") || uv_nmi_action_is("kgdb")) { + break; + case nmi_act_kdb: + case nmi_act_kgdb: uv_call_kgdb_kdb(cpu, regs, master); - } else { + break; + default: if (master) - pr_alert("UV: unknown NMI action: %s\n", uv_nmi_action); + pr_alert("UV: unknown NMI action: %d\n", uv_nmi_action); uv_nmi_sync_exit(master); + break; } /* Clear per_cpu "in_nmi" flag */ diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c index 54663f3e00cb..ff5afc8a5a41 100644 --- a/arch/x86/platform/uv/uv_time.c +++ b/arch/x86/platform/uv/uv_time.c @@ -53,7 +53,7 @@ struct uv_rtc_timer_head { struct { int lcpu; /* systemwide logical cpu number */ u64 expires; /* next timer expiration for this cpu */ - } cpu[]; + } cpu[] __counted_by(ncpus); }; /* diff --git a/arch/x86/video/fbdev.c b/arch/x86/video/fbdev.c index 49a0452402e9..1dd6528cc947 100644 --- a/arch/x86/video/fbdev.c +++ b/arch/x86/video/fbdev.c @@ -13,16 +13,17 @@ #include <linux/vgaarb.h> #include <asm/fb.h> -void fb_pgprotect(struct file *file, struct vm_area_struct *vma, unsigned long off) +pgprot_t pgprot_framebuffer(pgprot_t prot, + unsigned long vm_start, unsigned long vm_end, + unsigned long offset) { - unsigned long prot; - - prot = pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK; + pgprot_val(prot) &= ~_PAGE_CACHE_MASK; if (boot_cpu_data.x86 > 3) - pgprot_val(vma->vm_page_prot) = - prot | cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS); + pgprot_val(prot) |= cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS); + + return prot; } -EXPORT_SYMBOL(fb_pgprotect); +EXPORT_SYMBOL(pgprot_framebuffer); int fb_is_primary_device(struct fb_info *info) { diff --git a/arch/x86/virt/Makefile b/arch/x86/virt/Makefile new file mode 100644 index 000000000000..1e36502cd738 --- /dev/null +++ b/arch/x86/virt/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-y += vmx/ diff --git a/arch/x86/virt/vmx/Makefile b/arch/x86/virt/vmx/Makefile new file mode 100644 index 000000000000..feebda21d793 --- /dev/null +++ b/arch/x86/virt/vmx/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_INTEL_TDX_HOST) += tdx/ diff --git a/arch/x86/virt/vmx/tdx/Makefile b/arch/x86/virt/vmx/tdx/Makefile new file mode 100644 index 000000000000..46ef8f73aebb --- /dev/null +++ b/arch/x86/virt/vmx/tdx/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-y += seamcall.o diff --git a/arch/x86/virt/vmx/tdx/seamcall.S b/arch/x86/virt/vmx/tdx/seamcall.S new file mode 100644 index 000000000000..5b1f2286aea9 --- /dev/null +++ b/arch/x86/virt/vmx/tdx/seamcall.S @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/linkage.h> +#include <asm/frame.h> + +#include "tdxcall.S" + +/* + * __seamcall() - Host-side interface functions to SEAM software + * (the P-SEAMLDR or the TDX module). + * + * __seamcall() function ABI: + * + * @fn (RDI) - SEAMCALL Leaf number, moved to RAX + * @args (RSI) - struct tdx_module_args for input + * + * Only RCX/RDX/R8-R11 are used as input registers. + * + * Return (via RAX) TDX_SEAMCALL_VMFAILINVALID if the SEAMCALL itself + * fails, or the completion status of the SEAMCALL leaf function. + */ +SYM_FUNC_START(__seamcall) + TDX_MODULE_CALL host=1 +SYM_FUNC_END(__seamcall) + +/* + * __seamcall_ret() - Host-side interface functions to SEAM software + * (the P-SEAMLDR or the TDX module), with saving output registers to + * the 'struct tdx_module_args' used as input. + * + * __seamcall_ret() function ABI: + * + * @fn (RDI) - SEAMCALL Leaf number, moved to RAX + * @args (RSI) - struct tdx_module_args for input and output + * + * Only RCX/RDX/R8-R11 are used as input/output registers. + * + * Return (via RAX) TDX_SEAMCALL_VMFAILINVALID if the SEAMCALL itself + * fails, or the completion status of the SEAMCALL leaf function. + */ +SYM_FUNC_START(__seamcall_ret) + TDX_MODULE_CALL host=1 ret=1 +SYM_FUNC_END(__seamcall_ret) + +/* + * __seamcall_saved_ret() - Host-side interface functions to SEAM software + * (the P-SEAMLDR or the TDX module), with saving output registers to the + * 'struct tdx_module_args' used as input. + * + * __seamcall_saved_ret() function ABI: + * + * @fn (RDI) - SEAMCALL Leaf number, moved to RAX + * @args (RSI) - struct tdx_module_args for input and output + * + * All registers in @args are used as input/output registers. + * + * Return (via RAX) TDX_SEAMCALL_VMFAILINVALID if the SEAMCALL itself + * fails, or the completion status of the SEAMCALL leaf function. + */ +SYM_FUNC_START(__seamcall_saved_ret) + TDX_MODULE_CALL host=1 ret=1 saved=1 +SYM_FUNC_END(__seamcall_saved_ret) diff --git a/arch/x86/virt/vmx/tdx/tdxcall.S b/arch/x86/virt/vmx/tdx/tdxcall.S index 49a54356ae99..016a2a1ec1d6 100644 --- a/arch/x86/virt/vmx/tdx/tdxcall.S +++ b/arch/x86/virt/vmx/tdx/tdxcall.S @@ -1,5 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <asm/asm-offsets.h> +#include <asm/frame.h> +#include <asm/asm.h> #include <asm/tdx.h> /* @@ -16,35 +18,75 @@ * TDX module and hypercalls to the VMM. * SEAMCALL - used by TDX hosts to make requests to the * TDX module. + * + *------------------------------------------------------------------------- + * TDCALL/SEAMCALL ABI: + *------------------------------------------------------------------------- + * Input Registers: + * + * RAX - TDCALL/SEAMCALL Leaf number. + * RCX,RDX,RDI,RSI,RBX,R8-R15 - TDCALL/SEAMCALL Leaf specific input registers. + * + * Output Registers: + * + * RAX - TDCALL/SEAMCALL instruction error code. + * RCX,RDX,RDI,RSI,RBX,R8-R15 - TDCALL/SEAMCALL Leaf specific output registers. + * + *------------------------------------------------------------------------- + * + * So while the common core (RAX,RCX,RDX,R8-R11) fits nicely in the + * callee-clobbered registers and even leaves RDI,RSI free to act as a + * base pointer, some leafs (e.g., VP.ENTER) make a giant mess of things. + * + * For simplicity, assume that anything that needs the callee-saved regs + * also tramples on RDI,RSI. This isn't strictly true, see for example + * TDH.EXPORT.MEM. */ -.macro TDX_MODULE_CALL host:req - /* - * R12 will be used as temporary storage for struct tdx_module_output - * pointer. Since R12-R15 registers are not used by TDCALL/SEAMCALL - * services supported by this function, it can be reused. - */ +.macro TDX_MODULE_CALL host:req ret=0 saved=0 + FRAME_BEGIN - /* Callee saved, so preserve it */ - push %r12 + /* Move Leaf ID to RAX */ + mov %rdi, %rax + + /* Move other input regs from 'struct tdx_module_args' */ + movq TDX_MODULE_rcx(%rsi), %rcx + movq TDX_MODULE_rdx(%rsi), %rdx + movq TDX_MODULE_r8(%rsi), %r8 + movq TDX_MODULE_r9(%rsi), %r9 + movq TDX_MODULE_r10(%rsi), %r10 + movq TDX_MODULE_r11(%rsi), %r11 +.if \saved /* - * Push output pointer to stack. - * After the operation, it will be fetched into R12 register. + * Move additional input regs from the structure. For simplicity + * assume that anything needs the callee-saved regs also tramples + * on RDI/RSI (see VP.ENTER). */ - push %r9 + /* Save those callee-saved GPRs as mandated by the x86_64 ABI */ + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 - /* Mangle function call ABI into TDCALL/SEAMCALL ABI: */ - /* Move Leaf ID to RAX */ - mov %rdi, %rax - /* Move input 4 to R9 */ - mov %r8, %r9 - /* Move input 3 to R8 */ - mov %rcx, %r8 - /* Move input 1 to RCX */ - mov %rsi, %rcx - /* Leave input param 2 in RDX */ - - .if \host + movq TDX_MODULE_r12(%rsi), %r12 + movq TDX_MODULE_r13(%rsi), %r13 + movq TDX_MODULE_r14(%rsi), %r14 + movq TDX_MODULE_r15(%rsi), %r15 + movq TDX_MODULE_rbx(%rsi), %rbx + +.if \ret + /* Save the structure pointer as RSI is about to be clobbered */ + pushq %rsi +.endif + + movq TDX_MODULE_rdi(%rsi), %rdi + /* RSI needs to be done at last */ + movq TDX_MODULE_rsi(%rsi), %rsi +.endif /* \saved */ + +.if \host +.Lseamcall\@: seamcall /* * SEAMCALL instruction is essentially a VMExit from VMX root @@ -57,40 +99,122 @@ * This value will never be used as actual SEAMCALL error code as * it is from the Reserved status code class. */ - jnc .Lno_vmfailinvalid - mov $TDX_SEAMCALL_VMFAILINVALID, %rax -.Lno_vmfailinvalid: - - .else + jc .Lseamcall_vmfailinvalid\@ +.else tdcall - .endif +.endif +.if \ret +.if \saved /* - * Fetch output pointer from stack to R12 (It is used - * as temporary storage) + * Restore the structure from stack to save the output registers + * + * In case of VP.ENTER returns due to TDVMCALL, all registers are + * valid thus no register can be used as spare to restore the + * structure from the stack (see "TDH.VP.ENTER Output Operands + * Definition on TDCALL(TDG.VP.VMCALL) Following a TD Entry"). + * For this case, need to make one register as spare by saving it + * to the stack and then manually load the structure pointer to + * the spare register. + * + * Note for other TDCALLs/SEAMCALLs there are spare registers + * thus no need for such hack but just use this for all. */ - pop %r12 + pushq %rax /* save the TDCALL/SEAMCALL return code */ + movq 8(%rsp), %rax /* restore the structure pointer */ + movq %rsi, TDX_MODULE_rsi(%rax) /* save RSI */ + popq %rax /* restore the return code */ + popq %rsi /* pop the structure pointer */ + + /* Copy additional output regs to the structure */ + movq %r12, TDX_MODULE_r12(%rsi) + movq %r13, TDX_MODULE_r13(%rsi) + movq %r14, TDX_MODULE_r14(%rsi) + movq %r15, TDX_MODULE_r15(%rsi) + movq %rbx, TDX_MODULE_rbx(%rsi) + movq %rdi, TDX_MODULE_rdi(%rsi) +.endif /* \saved */ + /* Copy output registers to the structure */ + movq %rcx, TDX_MODULE_rcx(%rsi) + movq %rdx, TDX_MODULE_rdx(%rsi) + movq %r8, TDX_MODULE_r8(%rsi) + movq %r9, TDX_MODULE_r9(%rsi) + movq %r10, TDX_MODULE_r10(%rsi) + movq %r11, TDX_MODULE_r11(%rsi) +.endif /* \ret */ + +.if \saved && \ret /* - * Since this macro can be invoked with NULL as an output pointer, - * check if caller provided an output struct before storing output - * registers. + * Clear registers shared by guest for VP.VMCALL/VP.ENTER to prevent + * speculative use of guest's/VMM's values, including those are + * restored from the stack. + * + * See arch/x86/kvm/vmx/vmenter.S: * - * Update output registers, even if the call failed (RAX != 0). - * Other registers may contain details of the failure. + * In theory, a L1 cache miss when restoring register from stack + * could lead to speculative execution with guest's values. + * + * Note: RBP/RSP are not used as shared register. RSI has been + * restored already. + * + * XOR is cheap, thus unconditionally do for all leafs. */ - test %r12, %r12 - jz .Lno_output_struct - - /* Copy result registers to output struct: */ - movq %rcx, TDX_MODULE_rcx(%r12) - movq %rdx, TDX_MODULE_rdx(%r12) - movq %r8, TDX_MODULE_r8(%r12) - movq %r9, TDX_MODULE_r9(%r12) - movq %r10, TDX_MODULE_r10(%r12) - movq %r11, TDX_MODULE_r11(%r12) - -.Lno_output_struct: - /* Restore the state of R12 register */ - pop %r12 + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + xorl %r13d, %r13d + xorl %r14d, %r14d + xorl %r15d, %r15d + xorl %ebx, %ebx + xorl %edi, %edi +.endif /* \ret && \host */ + +.if \host +.Lout\@: +.endif + +.if \saved + /* Restore callee-saved GPRs as mandated by the x86_64 ABI */ + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx +.endif /* \saved */ + + FRAME_END + RET + +.if \host +.Lseamcall_vmfailinvalid\@: + mov $TDX_SEAMCALL_VMFAILINVALID, %rax + jmp .Lseamcall_fail\@ + +.Lseamcall_trap\@: + /* + * SEAMCALL caused #GP or #UD. By reaching here RAX contains + * the trap number. Convert the trap number to the TDX error + * code by setting TDX_SW_ERROR to the high 32-bits of RAX. + * + * Note cannot OR TDX_SW_ERROR directly to RAX as OR instruction + * only accepts 32-bit immediate at most. + */ + movq $TDX_SW_ERROR, %rdi + orq %rdi, %rax + +.Lseamcall_fail\@: +.if \ret && \saved + /* pop the unused structure pointer back to RSI */ + popq %rsi +.endif + jmp .Lout\@ + + _ASM_EXTABLE_FAULT(.Lseamcall\@, .Lseamcall_trap\@) +.endif /* \host */ + .endm diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 7ad91225fdf4..9dd5490b3318 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -33,13 +33,13 @@ static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) return 0xfd; } -static u32 xen_set_apic_id(unsigned int x) +static u32 xen_set_apic_id(u32 x) { WARN_ON(1); return x; } -static unsigned int xen_get_apic_id(unsigned long x) +static u32 xen_get_apic_id(u32 x) { return ((x)>>24) & 0xFFu; } @@ -110,15 +110,15 @@ static int xen_madt_oem_check(char *oem_id, char *oem_table_id) return xen_pv_domain(); } -static int xen_phys_pkg_id(int initial_apic_id, int index_msb) +static u32 xen_phys_pkg_id(u32 initial_apic_id, int index_msb) { return initial_apic_id >> index_msb; } -static int xen_cpu_present_to_apicid(int cpu) +static u32 xen_cpu_present_to_apicid(int cpu) { if (cpu_present(cpu)) - return cpu_data(cpu).apicid; + return cpu_data(cpu).topo.apicid; else return BAD_APICID; } |