From 17270717e80de33a884ad328fea5f407d87f6d6a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 21:43:31 -0500 Subject: x86/head: Remove confusing comment This comment is actively wrong and confusing. It refers to the registers' stack offsets after the pt_regs has been constructed on the stack, but this code is *before* that. At this point the stack just has the standard iret frame, for which no comment should be needed. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Jiri Slaby Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a3c267b770fc56c9b86df9c11c552848248aace2.1505764066.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 513cbb012ecc..3b04e4c99389 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -270,10 +270,6 @@ bad_address: __INIT ENTRY(early_idt_handler_array) - # 104(%rsp) %rflags - # 96(%rsp) %cs - # 88(%rsp) %rip - # 80(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 -- cgit v1.2.3 From a8b88e84d124bc92c4808e72b8b8c0e0bb538630 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 21:43:32 -0500 Subject: x86/head: Remove unused 'bad_address' code It's no longer possible for this code to be executed, so remove it. 
Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Jiri Slaby Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/32a46fe92d2083700599b36872b26e7dfd7b7965.1505764066.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 3b04e4c99389..afb0a1e22d41 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -265,9 +265,6 @@ ENDPROC(start_cpu0) .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS __FINITDATA -bad_address: - jmp bad_address - __INIT ENTRY(early_idt_handler_array) i = 0 -- cgit v1.2.3 From 015a2ea5478680fc5216d56b7ff306f2a74efaf9 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 21:43:33 -0500 Subject: x86/head: Fix head ELF function annotations These functions aren't callable C-type functions, so don't annotate them as such. 
Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Jiri Slaby Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/36eb182738c28514f8bf95e403d89b6413a88883.1505764066.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index afb0a1e22d41..edacd579d504 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -234,7 +234,7 @@ ENTRY(secondary_startup_64) pushq %rax # target address in negative space lretq .Lafter_lret: -ENDPROC(secondary_startup_64) +END(secondary_startup_64) #include "verify_cpu.S" @@ -277,7 +277,7 @@ ENTRY(early_idt_handler_array) i = i + 1 .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr -ENDPROC(early_idt_handler_array) +END(early_idt_handler_array) early_idt_handler_common: /* @@ -320,7 +320,7 @@ early_idt_handler_common: 20: decl early_recursion_flag(%rip) jmp restore_regs_and_iret -ENDPROC(early_idt_handler_common) +END(early_idt_handler_common) __INITDATA -- cgit v1.2.3 From e93db75a0054b23a874a12c63376753544f3fe9e Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 21:43:34 -0500 Subject: x86/boot: Annotate verify_cpu() as a callable function verify_cpu() is a callable function. Annotate it as such. 
Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Jiri Slaby Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/293024b8a080832075312f38c07ccc970fc70292.1505764066.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/verify_cpu.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index 014ea59aa153..3d3c2f71f617 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -33,7 +33,7 @@ #include #include -verify_cpu: +ENTRY(verify_cpu) pushf # Save caller passed flags push $0 # Kill any dangerous flags popf @@ -139,3 +139,4 @@ verify_cpu: popf # Restore caller passed flags xorl %eax, %eax ret +ENDPROC(verify_cpu) -- cgit v1.2.3 From 2704fbb672d0d9a19414907fda7949283dcef6a1 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 21:43:37 -0500 Subject: x86/head: Add unwind hint annotations Jiri Slaby reported an ORC issue when unwinding from an idle task. The stack was: ffffffff811083c2 do_idle+0x142/0x1e0 ffffffff8110861d cpu_startup_entry+0x5d/0x60 ffffffff82715f58 start_kernel+0x3ff/0x407 ffffffff827153e8 x86_64_start_kernel+0x14e/0x15d ffffffff810001bf secondary_startup_64+0x9f/0xa0 The ORC unwinder errored out at secondary_startup_64 because the head code isn't annotated yet so there wasn't a corresponding ORC entry. Fix that and any other head-related unwinding issues by adding unwind hints to the head code. 
Reported-by: Jiri Slaby Tested-by: Jiri Slaby Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/78ef000a2f68f545d6eef44ee912edceaad82ccf.1505764066.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 1 - arch/x86/kernel/head_64.S | 14 ++++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index fd0a7895b63f..d8e2b700d1db 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -26,7 +26,6 @@ KASAN_SANITIZE_dumpstack.o := n KASAN_SANITIZE_dumpstack_$(BITS).o := n KASAN_SANITIZE_stacktrace.o := n -OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y OBJECT_FILES_NON_STANDARD_test_nx.o := y diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index edacd579d504..42e32c2e51bb 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -49,6 +49,7 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map) .code64 .globl startup_64 startup_64: + UNWIND_HINT_EMPTY /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, * and someone has loaded an identity mapped page table @@ -88,6 +89,7 @@ startup_64: addq $(early_top_pgt - __START_KERNEL_map), %rax jmp 1f ENTRY(secondary_startup_64) + UNWIND_HINT_EMPTY /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, * and someone has loaded a mapped page table. 
@@ -132,6 +134,7 @@ ENTRY(secondary_startup_64) movq $1f, %rax jmp *%rax 1: + UNWIND_HINT_EMPTY /* Check if nx is implemented */ movl $0x80000001, %eax @@ -246,6 +249,7 @@ END(secondary_startup_64) */ ENTRY(start_cpu0) movq initial_stack(%rip), %rsp + UNWIND_HINT_EMPTY jmp .Ljump_to_C_code ENDPROC(start_cpu0) #endif @@ -270,13 +274,18 @@ ENTRY(early_idt_handler_array) i = 0 .rept NUM_EXCEPTION_VECTORS .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 - pushq $0 # Dummy error code, to make stack frame uniform + UNWIND_HINT_IRET_REGS + pushq $0 # Dummy error code, to make stack frame uniform + .else + UNWIND_HINT_IRET_REGS offset=8 .endif pushq $i # 72(%rsp) Vector number jmp early_idt_handler_common + UNWIND_HINT_IRET_REGS i = i + 1 .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr + UNWIND_HINT_IRET_REGS offset=16 END(early_idt_handler_array) early_idt_handler_common: @@ -305,6 +314,7 @@ early_idt_handler_common: pushq %r13 /* pt_regs->r13 */ pushq %r14 /* pt_regs->r14 */ pushq %r15 /* pt_regs->r15 */ + UNWIND_HINT_REGS cmpq $14,%rsi /* Page fault? */ jnz 10f @@ -427,7 +437,7 @@ ENTRY(phys_base) EXPORT_SYMBOL(phys_base) #include "../../x86/xen/xen-head.S" - + __PAGE_ALIGNED_BSS NEXT_PAGE(empty_zero_page) .skip PAGE_SIZE -- cgit v1.2.3 From 11af847446ed0d131cf24d16a7ef3d5ea7a49554 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 13 Oct 2017 15:02:00 -0500 Subject: x86/unwind: Rename unwinder config options to 'CONFIG_UNWINDER_*' Rename the unwinder config options from: CONFIG_ORC_UNWINDER CONFIG_FRAME_POINTER_UNWINDER CONFIG_GUESS_UNWINDER to: CONFIG_UNWINDER_ORC CONFIG_UNWINDER_FRAME_POINTER CONFIG_UNWINDER_GUESS ... in order to give them a more logical config namespace. 
Suggested-by: Ingo Molnar Signed-off-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/73972fc7e2762e91912c6b9584582703d6f1b8cc.1507924831.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- Documentation/x86/orc-unwinder.txt | 2 +- Makefile | 4 ++-- arch/x86/Kconfig | 2 +- arch/x86/Kconfig.debug | 10 +++++----- arch/x86/configs/tiny.config | 4 ++-- arch/x86/configs/x86_64_defconfig | 2 +- arch/x86/include/asm/module.h | 2 +- arch/x86/include/asm/unwind.h | 8 ++++---- arch/x86/kernel/Makefile | 6 +++--- include/asm-generic/vmlinux.lds.h | 2 +- lib/Kconfig.debug | 2 +- scripts/Makefile.build | 2 +- 12 files changed, 23 insertions(+), 23 deletions(-) (limited to 'arch/x86/kernel') diff --git a/Documentation/x86/orc-unwinder.txt b/Documentation/x86/orc-unwinder.txt index af0c9a4c65a6..cd4b29be29af 100644 --- a/Documentation/x86/orc-unwinder.txt +++ b/Documentation/x86/orc-unwinder.txt @@ -4,7 +4,7 @@ ORC unwinder Overview -------- -The kernel CONFIG_ORC_UNWINDER option enables the ORC unwinder, which is +The kernel CONFIG_UNWINDER_ORC option enables the ORC unwinder, which is similar in concept to a DWARF unwinder. The difference is that the format of the ORC data is much simpler than DWARF, which in turn allows the ORC unwinder to be much simpler and faster. 
diff --git a/Makefile b/Makefile index bc5c79e8e3cf..c0f723f81c06 100644 --- a/Makefile +++ b/Makefile @@ -933,8 +933,8 @@ ifdef CONFIG_STACK_VALIDATION ifeq ($(has_libelf),1) objtool_target := tools/objtool FORCE else - ifdef CONFIG_ORC_UNWINDER - $(error "Cannot generate ORC metadata for CONFIG_ORC_UNWINDER=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel") + ifdef CONFIG_UNWINDER_ORC + $(error "Cannot generate ORC metadata for CONFIG_UNWINDER_ORC=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel") else $(warning "Cannot use CONFIG_STACK_VALIDATION=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel") endif diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 971feac13506..6b94ca0aa585 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -170,7 +170,7 @@ config X86 select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION + select HAVE_RELIABLE_STACKTRACE if X86_64 && UNWINDER_FRAME_POINTER && STACK_VALIDATION select HAVE_STACK_VALIDATION if X86_64 select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNSTABLE_SCHED_CLOCK diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 71a48a30fc84..f274dbb87c26 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -358,13 +358,13 @@ config PUNIT_ATOM_DEBUG choice prompt "Choose kernel unwinder" - default FRAME_POINTER_UNWINDER + default UNWINDER_FRAME_POINTER ---help--- This determines which method will be used for unwinding kernel stack traces for panics, oopses, bugs, warnings, perf, /proc//stack, livepatch, lockdep, and more. -config FRAME_POINTER_UNWINDER +config UNWINDER_FRAME_POINTER bool "Frame pointer unwinder" select FRAME_POINTER ---help--- @@ -379,7 +379,7 @@ config FRAME_POINTER_UNWINDER consistency model, as this is currently the only way to get a reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). 
-config ORC_UNWINDER +config UNWINDER_ORC bool "ORC unwinder" depends on X86_64 select STACK_VALIDATION @@ -395,7 +395,7 @@ config ORC_UNWINDER Enabling this option will increase the kernel's runtime memory usage by roughly 2-4MB, depending on your kernel config. -config GUESS_UNWINDER +config UNWINDER_GUESS bool "Guess unwinder" depends on EXPERT ---help--- @@ -410,7 +410,7 @@ config GUESS_UNWINDER endchoice config FRAME_POINTER - depends on !ORC_UNWINDER && !GUESS_UNWINDER + depends on !UNWINDER_ORC && !UNWINDER_GUESS bool endmenu diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config index 550cd5012b73..66c9e2aab16c 100644 --- a/arch/x86/configs/tiny.config +++ b/arch/x86/configs/tiny.config @@ -1,5 +1,5 @@ CONFIG_NOHIGHMEM=y # CONFIG_HIGHMEM4G is not set # CONFIG_HIGHMEM64G is not set -CONFIG_GUESS_UNWINDER=y -# CONFIG_FRAME_POINTER_UNWINDER is not set +CONFIG_UNWINDER_GUESS=y +# CONFIG_UNWINDER_FRAME_POINTER is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index eb65c248708d..e32fc1f274d8 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -299,7 +299,7 @@ CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_DEBUG_RODATA_TEST is not set CONFIG_DEBUG_BOOT_PARAMS=y CONFIG_OPTIMIZE_INLINING=y -CONFIG_ORC_UNWINDER=y +CONFIG_UNWINDER_ORC=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index 9eb7c718aaf8..9f05a1002aa9 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -5,7 +5,7 @@ #include struct mod_arch_specific { -#ifdef CONFIG_ORC_UNWINDER +#ifdef CONFIG_UNWINDER_ORC unsigned int num_orcs; int *orc_unwind_ip; struct orc_entry *orc_unwind; diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index e9f793e2df7a..35d67dc7b69f 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -12,11 +12,11 @@ struct 
unwind_state { struct task_struct *task; int graph_idx; bool error; -#if defined(CONFIG_ORC_UNWINDER) +#if defined(CONFIG_UNWINDER_ORC) bool signal, full_regs; unsigned long sp, bp, ip; struct pt_regs *regs; -#elif defined(CONFIG_FRAME_POINTER_UNWINDER) +#elif defined(CONFIG_UNWINDER_FRAME_POINTER) bool got_irq; unsigned long *bp, *orig_sp, ip; struct pt_regs *regs; @@ -50,7 +50,7 @@ void unwind_start(struct unwind_state *state, struct task_struct *task, __unwind_start(state, task, regs, first_frame); } -#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER_UNWINDER) +#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) { if (unwind_done(state)) @@ -65,7 +65,7 @@ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) } #endif -#ifdef CONFIG_ORC_UNWINDER +#ifdef CONFIG_UNWINDER_ORC void unwind_init(void); void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, void *orc, size_t orc_size); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index fd0a7895b63f..6209ab6deb50 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -127,9 +127,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o -obj-$(CONFIG_ORC_UNWINDER) += unwind_orc.o -obj-$(CONFIG_FRAME_POINTER_UNWINDER) += unwind_frame.o -obj-$(CONFIG_GUESS_UNWINDER) += unwind_guess.o +obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o +obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o +obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o ### # 64 bit specific files diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 8acfc1e099e1..63e56f6c1877 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -687,7 +687,7 @@ #define BUG_TABLE #endif -#ifdef CONFIG_ORC_UNWINDER +#ifdef CONFIG_UNWINDER_ORC 
#define ORC_UNWIND_TABLE \ . = ALIGN(4); \ .orc_unwind_ip : AT(ADDR(.orc_unwind_ip) - LOAD_OFFSET) { \ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 2689b7c50c52..7566eff22236 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -376,7 +376,7 @@ config STACK_VALIDATION that runtime stack traces are more reliable. This is also a prerequisite for generation of ORC unwind data, which - is needed for CONFIG_ORC_UNWINDER. + is needed for CONFIG_UNWINDER_ORC. For more information, see tools/objtool/Documentation/stack-validation.txt. diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 061d0c3a420a..f965f477832e 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -258,7 +258,7 @@ ifneq ($(SKIP_STACK_VALIDATION),1) __objtool_obj := $(objtree)/tools/objtool/objtool -objtool_args = $(if $(CONFIG_ORC_UNWINDER),orc generate,check) +objtool_args = $(if $(CONFIG_UNWINDER_ORC),orc generate,check) ifndef CONFIG_FRAME_POINTER objtool_args += --no-fp -- cgit v1.2.3 From 0b00de857a648dafe7020878c7a27cf776f5edf4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 13 Oct 2017 14:56:42 -0700 Subject: x86/cpuid: Add generic table for CPUID dependencies Some CPUID features depend on other features. Currently it's possible to clear dependent features, but not clear the base features, which can cause various interesting problems. This patch implements a generic table to describe dependencies between CPUID features, to be used by all code that clears CPUID. Some subsystems (like XSAVE) had their own implementation of this, but it's better to do it all in a single place for everyone. Then clear_cpu_cap and setup_clear_cpu_cap always look up this table and clear all dependencies too. This is intended to be a practical table: only for features that make sense to clear. If someone for example clears FPU, or other features that are essentially part of the required base feature set, not much is going to work. Handling that is right now out of scope. 
We're only handling features which can be usefully cleared. Signed-off-by: Andi Kleen Reviewed-by: Thomas Gleixner Cc: Jonathan McDowell Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20171013215645.23166-3-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 9 ++- arch/x86/include/asm/cpufeatures.h | 5 ++ arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/cpuid-deps.c | 113 +++++++++++++++++++++++++++++++++++++ 4 files changed, 123 insertions(+), 5 deletions(-) create mode 100644 arch/x86/kernel/cpu/cpuid-deps.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index d59c15c3defd..225fd8374fae 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -125,11 +125,10 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) #define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) -#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability)) -#define setup_clear_cpu_cap(bit) do { \ - clear_cpu_cap(&boot_cpu_data, bit); \ - set_bit(bit, (unsigned long *)cpu_caps_cleared); \ -} while (0) + +extern void setup_clear_cpu_cap(unsigned int bit); +extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); + #define setup_force_cpu_cap(bit) do { \ set_cpu_cap(&boot_cpu_data, bit); \ set_bit(bit, (unsigned long *)cpu_caps_set); \ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 2519c6c801c9..401a70992060 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -21,6 +21,11 @@ * this feature bit is not displayed in /proc/cpuinfo at all. 
*/ +/* + * When adding new features here that depend on other features, + * please update the table in kernel/cpu/cpuid-deps.c + */ + /* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ #define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ #define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index e17942c131c8..de260fae1017 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -22,6 +22,7 @@ obj-y += rdrand.o obj-y += match.o obj-y += bugs.o obj-$(CONFIG_CPU_FREQ) += aperfmperf.o +obj-y += cpuid-deps.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c new file mode 100644 index 000000000000..e48eb7313120 --- /dev/null +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -0,0 +1,113 @@ +/* Declare dependencies between CPUIDs */ +#include +#include +#include +#include + +struct cpuid_dep { + unsigned int feature; + unsigned int depends; +}; + +/* + * Table of CPUID features that depend on others. + * + * This only includes dependencies that can be usefully disabled, not + * features part of the base set (like FPU). + * + * Note this all is not __init / __initdata because it can be + * called from cpu hotplug. It shouldn't do anything in this case, + * but it's difficult to tell that to the init reference checker. 
+ */ +const static struct cpuid_dep cpuid_deps[] = { + { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE }, + { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE }, + { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE }, + { X86_FEATURE_AVX, X86_FEATURE_XSAVE }, + { X86_FEATURE_PKU, X86_FEATURE_XSAVE }, + { X86_FEATURE_MPX, X86_FEATURE_XSAVE }, + { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE }, + { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR }, + { X86_FEATURE_XMM, X86_FEATURE_FXSR }, + { X86_FEATURE_XMM2, X86_FEATURE_XMM }, + { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, + { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 }, + { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, }, + { X86_FEATURE_F16C, X86_FEATURE_XMM2, }, + { X86_FEATURE_AES, X86_FEATURE_XMM2 }, + { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 }, + { X86_FEATURE_FMA, X86_FEATURE_AVX }, + { X86_FEATURE_AVX2, X86_FEATURE_AVX, }, + { X86_FEATURE_AVX512F, X86_FEATURE_AVX, }, + { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F }, + {} +}; + +static inline void __clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit) +{ + clear_bit32(bit, c->x86_capability); +} + +static inline void __setup_clear_cpu_cap(unsigned int bit) +{ + clear_cpu_cap(&boot_cpu_data, bit); + set_bit32(bit, cpu_caps_cleared); +} + +static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature) +{ + if (!c) + __setup_clear_cpu_cap(feature); + else + __clear_cpu_cap(c, 
feature); +} + +static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) +{ + bool changed; + DECLARE_BITMAP(disable, NCAPINTS * sizeof(u32) * 8); + const struct cpuid_dep *d; + + clear_feature(c, feature); + + /* Collect all features to disable, handling dependencies */ + memset(disable, 0, sizeof(disable)); + __set_bit(feature, disable); + + /* Loop until we get a stable state. */ + do { + changed = false; + for (d = cpuid_deps; d->feature; d++) { + if (!test_bit(d->depends, disable)) + continue; + if (__test_and_set_bit(d->feature, disable)) + continue; + + changed = true; + clear_feature(c, d->feature); + } + } while (changed); +} + +void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) +{ + do_clear_cpu_cap(c, feature); +} + +void setup_clear_cpu_cap(unsigned int feature) +{ + do_clear_cpu_cap(NULL, feature); +} -- cgit v1.2.3 From 0c2a3913d6f50503f7c59d83a6219e39508cc898 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 13 Oct 2017 14:56:43 -0700 Subject: x86/fpu: Parse clearcpuid= as early XSAVE argument With a follow-on patch we want to make clearcpuid affect the XSAVE configuration. But xsave is currently initialized before arguments are parsed. Move the clearcpuid= parsing into the special early xsave argument parsing code. Since clearcpuid= contains a = we need to keep the old __setup around as a dummy, otherwise it would end up as an environment variable in init's environment. 
Signed-off-by: Andi Kleen Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20171013215645.23166-4-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 16 +++++++--------- arch/x86/kernel/fpu/init.c | 11 +++++++++++ 2 files changed, 18 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c9176bae7fd8..03bb004bb15e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1301,18 +1301,16 @@ void print_cpu_info(struct cpuinfo_x86 *c) pr_cont(")\n"); } -static __init int setup_disablecpuid(char *arg) +/* + * clearcpuid= was already parsed in fpu__init_parse_early_param. + * But we need to keep a dummy __setup around otherwise it would + * show up as an environment variable for init. + */ +static __init int setup_clearcpuid(char *arg) { - int bit; - - if (get_option(&arg, &bit) && bit >= 0 && bit < NCAPINTS * 32) - setup_clear_cpu_cap(bit); - else - return 0; - return 1; } -__setup("clearcpuid=", setup_disablecpuid); +__setup("clearcpuid=", setup_clearcpuid); #ifdef CONFIG_X86_64 DEFINE_PER_CPU_FIRST(union irq_stack_union, diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 7affb7e3d9a5..6abd83572b01 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -249,6 +249,10 @@ static void __init fpu__init_system_ctx_switch(void) */ static void __init fpu__init_parse_early_param(void) { + char arg[32]; + char *argptr = arg; + int bit; + if (cmdline_find_option_bool(boot_command_line, "no387")) setup_clear_cpu_cap(X86_FEATURE_FPU); @@ -266,6 +270,13 @@ static void __init fpu__init_parse_early_param(void) if (cmdline_find_option_bool(boot_command_line, "noxsaves")) setup_clear_cpu_cap(X86_FEATURE_XSAVES); + + if (cmdline_find_option(boot_command_line, "clearcpuid", arg, + sizeof(arg)) && + get_option(&argptr, &bit) && + bit >= 0 && + bit < NCAPINTS 
* 32) + setup_clear_cpu_cap(bit); } /* -- cgit v1.2.3 From ccb18db2ab9d923df07e7495123fe5fb02329713 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 13 Oct 2017 14:56:44 -0700 Subject: x86/fpu: Make XSAVE check the base CPUID features before enabling Before enabling XSAVE, not only check the XSAVE specific CPUID bits, but also the base CPUID features of the respective XSAVE feature. This allows to disable individual XSAVE states using the existing clearcpuid= option, which can be useful for performance testing and debugging, and also in general avoids inconsistencies. Signed-off-by: Andi Kleen Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20171013215645.23166-5-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index f1d5476c9022..fb581292975b 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -15,6 +15,7 @@ #include #include +#include /* * Although we spell it out in here, the Processor Trace @@ -36,6 +37,19 @@ static const char *xfeature_names[] = "unknown xstate feature" , }; +static short xsave_cpuid_features[] __initdata = { + X86_FEATURE_FPU, + X86_FEATURE_XMM, + X86_FEATURE_AVX, + X86_FEATURE_MPX, + X86_FEATURE_MPX, + X86_FEATURE_AVX512F, + X86_FEATURE_AVX512F, + X86_FEATURE_AVX512F, + X86_FEATURE_INTEL_PT, + X86_FEATURE_PKU, +}; + /* * Mask of xstate features supported by the CPU and the kernel: */ @@ -726,6 +740,7 @@ void __init fpu__init_system_xstate(void) unsigned int eax, ebx, ecx, edx; static int on_boot_cpu __initdata = 1; int err; + int i; WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; @@ -759,6 +774,14 @@ void __init fpu__init_system_xstate(void) goto out_disable; } + /* + * Clear XSAVE features that are disabled in the normal CPUID. 
+ */ + for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { + if (!boot_cpu_has(xsave_cpuid_features[i])) + xfeatures_mask &= ~BIT(i); + } + xfeatures_mask &= fpu__get_supported_xfeatures_mask(); /* Enable xstate instructions to be able to continue with initialization: */ -- cgit v1.2.3 From 73e3a7d2a7c3be29a5a22b85026f6cfa5664267f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 13 Oct 2017 14:56:45 -0700 Subject: x86/fpu: Remove the explicit clearing of XSAVE dependent features Clearing a CPU feature with setup_clear_cpu_cap() clears all features which depend on it. Expressing feature dependencies in one place is easier to maintain than keeping functions like fpu__xstate_clear_all_cpu_caps() up to date. The features which depend on XSAVE have their dependency expressed in the dependency table, so its sufficient to clear X86_FEATURE_XSAVE. Remove the explicit clearing of XSAVE dependent features. Signed-off-by: Andi Kleen Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20171013215645.23166-6-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index fb581292975b..87a57b7642d3 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -73,26 +73,6 @@ unsigned int fpu_user_xstate_size; void fpu__xstate_clear_all_cpu_caps(void) { setup_clear_cpu_cap(X86_FEATURE_XSAVE); - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - setup_clear_cpu_cap(X86_FEATURE_XSAVEC); - setup_clear_cpu_cap(X86_FEATURE_XSAVES); - setup_clear_cpu_cap(X86_FEATURE_AVX); - setup_clear_cpu_cap(X86_FEATURE_AVX2); - setup_clear_cpu_cap(X86_FEATURE_AVX512F); - setup_clear_cpu_cap(X86_FEATURE_AVX512IFMA); - setup_clear_cpu_cap(X86_FEATURE_AVX512PF); - setup_clear_cpu_cap(X86_FEATURE_AVX512ER); - setup_clear_cpu_cap(X86_FEATURE_AVX512CD); - 
setup_clear_cpu_cap(X86_FEATURE_AVX512DQ); - setup_clear_cpu_cap(X86_FEATURE_AVX512BW); - setup_clear_cpu_cap(X86_FEATURE_AVX512VL); - setup_clear_cpu_cap(X86_FEATURE_MPX); - setup_clear_cpu_cap(X86_FEATURE_XGETBV1); - setup_clear_cpu_cap(X86_FEATURE_AVX512VBMI); - setup_clear_cpu_cap(X86_FEATURE_PKU); - setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW); - setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS); - setup_clear_cpu_cap(X86_FEATURE_AVX512_VPOPCNTDQ); } /* -- cgit v1.2.3 From 57b8b1a1856adaa849d02d547411a553a531022b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Oct 2017 19:39:35 +0200 Subject: x86/cpuid: Prevent out of bound access in do_clear_cpu_cap() do_clear_cpu_cap() allocates a bitmap to keep track of disabled feature dependencies. That bitmap is sized NCAPINTS * BITS_PER_INIT. The possible 'features' which can be handed in are larger than this, because after the capabilities the bug 'feature' bits occupy another 32bit. Not really obvious... So clearing any of the misfeature bits, as 32bit does for the F00F bug, accesses that bitmap out of bounds thereby corrupting the stack. Size the bitmap proper and add a sanity check to catch accidental out of bound access. 
Fixes: 0b00de857a64 ("x86/cpuid: Add generic table for CPUID dependencies") Reported-by: kernel test robot Signed-off-by: Thomas Gleixner Cc: Andi Kleen Cc: Borislav Petkov Link: https://lkml.kernel.org/r/20171018022023.GA12058@yexl-desktop --- arch/x86/kernel/cpu/cpuid-deps.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index e48eb7313120..c1d49842a411 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -75,11 +75,17 @@ static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature) __clear_cpu_cap(c, feature); } +/* Take the capabilities and the BUG bits into account */ +#define MAX_FEATURE_BITS ((NCAPINTS + NBUGINTS) * sizeof(u32) * 8) + static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) { - bool changed; - DECLARE_BITMAP(disable, NCAPINTS * sizeof(u32) * 8); + DECLARE_BITMAP(disable, MAX_FEATURE_BITS); const struct cpuid_dep *d; + bool changed; + + if (WARN_ON(feature >= MAX_FEATURE_BITS)) + return; clear_feature(c, feature); -- cgit v1.2.3 From da20ab35180780e4a6eadc804544f1fa967f3567 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 18 Oct 2017 10:21:07 -0700 Subject: x86/entry: Use SYSCALL_DEFINE() macros for sys_modify_ldt() We do not have tracepoints for sys_modify_ldt() because we define it directly instead of using the normal SYSCALL_DEFINEx() macros. However, there is a reason sys_modify_ldt() does not use the macros: it has an 'int' return type instead of 'unsigned long'. This is a bug, but it's a bug cemented in the ABI. What does this mean? If we return -EINVAL from a function that returns 'int', we have 0x00000000ffffffea in %rax. But, if we return -EINVAL from a function returning 'unsigned long', we end up with 0xffffffffffffffea in %rax, which is wrong. 
To work around this and maintain the 'int' behavior while using the SYSCALL_DEFINEx() macros, we add a cast to 'unsigned int' in both implementations of sys_modify_ldt(). Signed-off-by: Dave Hansen Reviewed-by: Andy Lutomirski Reviewed-by: Brian Gerst Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171018172107.1A79C532@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/syscalls.h | 2 +- arch/x86/kernel/ldt.c | 16 +++++++++++++--- arch/x86/um/ldt.c | 7 +++++-- 3 files changed, 19 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 91dfcafe27a6..bad25bb80679 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -21,7 +21,7 @@ asmlinkage long sys_ioperm(unsigned long, unsigned long, int); asmlinkage long sys_iopl(unsigned int); /* kernel/ldt.c */ -asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); +asmlinkage long sys_modify_ldt(int, void __user *, unsigned long); /* kernel/signal.c */ asmlinkage long sys_rt_sigreturn(void); diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index f0e64db18ac8..0402d44deb4d 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -294,8 +295,8 @@ out: return error; } -asmlinkage int sys_modify_ldt(int func, void __user *ptr, - unsigned long bytecount) +SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr , + unsigned long , bytecount) { int ret = -ENOSYS; @@ -313,5 +314,14 @@ asmlinkage int sys_modify_ldt(int func, void __user *ptr, ret = write_ldt(ptr, bytecount, 0); break; } - return ret; + /* + * The SYSCALL_DEFINE() macros give us an 'unsigned long' + * return type, but the ABI for sys_modify_ldt() expects + * 'int'. This cast gives us an int-sized value in %rax + * for the return code.
The 'unsigned' is necessary so + * the compiler does not try to sign-extend the negative + * return codes into the high half of the register when + * taking the value from int->long. + */ + return (unsigned int)ret; } diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c index 836a1eb5df43..3ee234b6234d 100644 --- a/arch/x86/um/ldt.c +++ b/arch/x86/um/ldt.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -369,7 +370,9 @@ void free_ldt(struct mm_context *mm) mm->arch.ldt.entry_count = 0; } -int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) +SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr , + unsigned long , bytecount) { - return do_modify_ldt_skas(func, ptr, bytecount); + /* See non-um modify_ldt() for why we do this cast */ + return (unsigned int)do_modify_ldt_skas(func, ptr, bytecount); } -- cgit v1.2.3 From 82c62fa0c49aa305104013cee4468772799bb391 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 20 Oct 2017 11:21:35 -0500 Subject: x86/asm: Don't use the confusing '.ifeq' directive I find the '.ifeq <condition>' directive to be confusing. Reading it quickly seems to suggest its opposite meaning, or that it's missing an argument. Improve readability by replacing all of its x86 uses with '.if <condition> == 0'.
Signed-off-by: Josh Poimboeuf Cc: Andrei Vagin Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/757da028e802c7e98d23fbab8d234b1063e161cf.1508516398.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 2 +- arch/x86/kernel/head_32.S | 2 +- arch/x86/kernel/head_64.S | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f6cdb7a1455e..846e84a1d1f7 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -817,7 +817,7 @@ ENTRY(\sym) ASM_CLAC - .ifeq \has_error_code + .if \has_error_code == 0 pushq $-1 /* ORIG_RAX: no syscall to restart */ .endif diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 9ed3074d0d27..6e50f87765e5 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -401,7 +401,7 @@ ENTRY(early_idt_handler_array) # 24(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 + .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 pushl $0 # Dummy error code, to make stack frame uniform .endif pushl $i # 20(%esp) Vector number diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 42e32c2e51bb..311db1a73c11 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -273,7 +273,7 @@ ENDPROC(start_cpu0) ENTRY(early_idt_handler_array) i = 0 .rept NUM_EXCEPTION_VECTORS - .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 + .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 UNWIND_HINT_IRET_REGS pushq $0 # Dummy error code, to make stack frame uniform .else -- cgit v1.2.3 From c128dbfa0f879f8ce7b79054037889b0b2240728 Mon Sep 17 00:00:00 2001 From: Gayatri Kammela Date: Mon, 30 Oct 2017 18:20:29 -0700 Subject: x86/cpufeatures: Enable new SSE/AVX/AVX512 CPU features Add a few new SSE/AVX/AVX512 instruction groups/features for enumeration in /proc/cpuinfo: AVX512_VBMI2, GFNI, 
VAES, VPCLMULQDQ, AVX512_VNNI, AVX512_BITALG. CPUID.(EAX=7,ECX=0):ECX[bit 6] AVX512_VBMI2 CPUID.(EAX=7,ECX=0):ECX[bit 8] GFNI CPUID.(EAX=7,ECX=0):ECX[bit 9] VAES CPUID.(EAX=7,ECX=0):ECX[bit 10] VPCLMULQDQ CPUID.(EAX=7,ECX=0):ECX[bit 11] AVX512_VNNI CPUID.(EAX=7,ECX=0):ECX[bit 12] AVX512_BITALG Detailed information of CPUID bits for these features can be found in the Intel Architecture Instruction Set Extensions and Future Features Programming Interface document (refer to Table 1-1. and Table 1-2.). A copy of this document is available at https://bugzilla.kernel.org/show_bug.cgi?id=197239 Signed-off-by: Gayatri Kammela Acked-by: Thomas Gleixner Cc: Andi Kleen Cc: Fenghua Yu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ravi Shankar Cc: Ricardo Neri Cc: Yang Zhong Cc: bp@alien8.de Link: http://lkml.kernel.org/r/1509412829-23380-1-git-send-email-gayatri.kammela@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 6 ++++++ arch/x86/kernel/cpu/cpuid-deps.c | 6 ++++++ 2 files changed, 12 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 401a70992060..b0556f882aa8 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -299,6 +299,12 @@ #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ +#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ +#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */ +#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */ +#define X86_FEATURE_VPCLMULQDQ (16*32+ 10) /* Carry-Less Multiplication Double Quadword */ +#define X86_FEATURE_AVX512_VNNI (16*32+ 11) /* Vector Neural Network Instructions */ +#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for 
VPOPCNT[B,W] and VPSHUF-BITQMB */ #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ #define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ #define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index c1d49842a411..c21f22d836ad 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -50,6 +50,12 @@ const static struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL }, + { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL }, + { X86_FEATURE_VAES, X86_FEATURE_AVX512VL }, + { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL }, { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F }, -- cgit v1.2.3 From b0ce5b8c95c83a7b98c679b117e3d6ae6f97154b Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 27 Oct 2017 13:25:29 -0700 Subject: x86/boot: Relocate definition of the initial state of CR0 Both head_32.S and head_64.S utilize the same value to initialize the control register CR0. Also, other parts of the kernel might want to access this initial definition (e.g., emulation code for User-Mode Instruction Prevention uses this state to provide a sane dummy value for CR0 when emulating the smsw instruction). Thus, relocate this definition to a header file from which it can be conveniently accessed. Suggested-by: Borislav Petkov Signed-off-by: Ricardo Neri Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Andy Lutomirski Cc: "Michael S. 
Tsirkin" Cc: Peter Zijlstra Cc: Dave Hansen Cc: ricardo.neri@intel.com Cc: linux-mm@kvack.org Cc: Paul Gortmaker Cc: Huang Rui Cc: Shuah Khan Cc: linux-arch@vger.kernel.org Cc: Jonathan Corbet Cc: Jiri Slaby Cc: "Ravi V. Shankar" Cc: Denys Vlasenko Cc: Chris Metcalf Cc: Brian Gerst Cc: Josh Poimboeuf Cc: Chen Yucong Cc: Vlastimil Babka Cc: Dave Hansen Cc: Andy Lutomirski Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Andrew Morton Cc: Linus Torvalds Link: https://lkml.kernel.org/r/1509135945-13762-3-git-send-email-ricardo.neri-calderon@linux.intel.com --- arch/x86/include/uapi/asm/processor-flags.h | 3 +++ arch/x86/kernel/head_32.S | 3 --- arch/x86/kernel/head_64.S | 3 --- 3 files changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index 185f3d10c194..39946d0a1d41 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -151,5 +151,8 @@ #define CX86_ARR_BASE 0xc4 #define CX86_RCR_BASE 0xdc +#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ + X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ + X86_CR0_PG) #endif /* _UAPI_ASM_X86_PROCESSOR_FLAGS_H */ diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 9ed3074d0d27..c3cfc655f551 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -211,9 +211,6 @@ ENTRY(startup_32_smp) #endif .Ldefault_entry: -#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ - X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ - X86_CR0_PG) movl $(CR0_STATE & ~X86_CR0_PG),%eax movl %eax,%cr0 diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 513cbb012ecc..5e1bfdd86b5b 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -149,9 +149,6 @@ ENTRY(secondary_startup_64) 1: wrmsr /* Make changes effective */ /* Setup cr0 */ -#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ - X86_CR0_NE | 
X86_CR0_WP | X86_CR0_AM | \ - X86_CR0_PG) movl $CR0_STATE, %eax /* Make changes effective */ movq %rax, %cr0 -- cgit v1.2.3 From 26c4ef9c49d8a0341f6d97ce2cfdd55d1236ed29 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:58:59 -0700 Subject: x86/entry/64: Split the IRET-to-user and IRET-to-kernel paths These code paths will diverge soon. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/dccf8c7b3750199b4b30383c812d4e2931811509.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 34 +++++++++++++++++++++++++--------- arch/x86/entry/entry_64_compat.S | 2 +- arch/x86/kernel/head_64.S | 2 +- 3 files changed, 27 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index e8ef83df46e6..3eeb1694210c 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -321,7 +321,7 @@ syscall_return_via_sysret: opportunistic_sysret_failed: SWAPGS - jmp restore_regs_and_iret + jmp restore_regs_and_return_to_usermode END(entry_SYSCALL_64) ENTRY(stub_ptregs_64) @@ -423,7 +423,7 @@ ENTRY(ret_from_fork) call syscall_return_slowpath /* returns with IRQs disabled */ TRACE_IRQS_ON /* user mode is traced as IRQS on */ SWAPGS - jmp restore_regs_and_iret + jmp restore_regs_and_return_to_usermode 1: /* kernel thread */ @@ -612,7 +612,20 @@ GLOBAL(retint_user) call prepare_exit_to_usermode TRACE_IRQS_IRETQ SWAPGS - jmp restore_regs_and_iret + +GLOBAL(restore_regs_and_return_to_usermode) +#ifdef CONFIG_DEBUG_ENTRY + /* Assert that pt_regs indicates user mode. 
*/ + testl $3, CS(%rsp) + jnz 1f + ud2 +1: +#endif + RESTORE_EXTRA_REGS + RESTORE_C_REGS + REMOVE_PT_GPREGS_FROM_STACK 8 + INTERRUPT_RETURN + /* Returning to kernel space */ retint_kernel: @@ -632,11 +645,14 @@ retint_kernel: */ TRACE_IRQS_IRETQ -/* - * At this label, code paths which return to kernel and to user, - * which come from interrupts/exception and from syscalls, merge. - */ -GLOBAL(restore_regs_and_iret) +GLOBAL(restore_regs_and_return_to_kernel) +#ifdef CONFIG_DEBUG_ENTRY + /* Assert that pt_regs indicates kernel mode. */ + testl $3, CS(%rsp) + jz 1f + ud2 +1: +#endif RESTORE_EXTRA_REGS RESTORE_C_REGS REMOVE_PT_GPREGS_FROM_STACK 8 @@ -1327,7 +1343,7 @@ ENTRY(nmi) * work, because we don't want to enable interrupts. */ SWAPGS - jmp restore_regs_and_iret + jmp restore_regs_and_return_to_usermode .Lnmi_from_kernel: /* diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index e26c25ca7756..9ca014a99968 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -337,7 +337,7 @@ ENTRY(entry_INT80_compat) /* Go back to user mode. */ TRACE_IRQS_ON SWAPGS - jmp restore_regs_and_iret + jmp restore_regs_and_return_to_usermode END(entry_INT80_compat) ENTRY(stub32_clone) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 189bf42dfa2b..08f067faa264 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -326,7 +326,7 @@ early_idt_handler_common: 20: decl early_recursion_flag(%rip) - jmp restore_regs_and_iret + jmp restore_regs_and_return_to_kernel END(early_idt_handler_common) __INITDATA -- cgit v1.2.3 From bd7dc5a6afac719d8ce4092391eef2c7e83c2a75 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:09 -0700 Subject: x86/entry/32: Pull the MSR_IA32_SYSENTER_CS update code out of native_load_sp0() This causes the MSR_IA32_SYSENTER_CS write to move out of the paravirt callback. 
This shouldn't affect Xen PV: Xen already ignores MSR_IA32_SYSENTER_ESP writes. In any event, Xen doesn't support vm86() in a useful way. Note to any potential backporters: This patch won't break lguest, as lguest didn't have any SYSENTER support at all. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/75cf09fe03ae778532d0ca6c65aa58e66bc2f90c.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 7 ------- arch/x86/include/asm/switch_to.h | 12 ++++++++++++ arch/x86/kernel/process_32.c | 4 +++- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/vm86_32.c | 6 +++++- 5 files changed, 21 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index b390ff76e58f..0167e3e35a57 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -520,13 +520,6 @@ static inline void native_load_sp0(struct tss_struct *tss, struct thread_struct *thread) { tss->x86_tss.sp0 = thread->sp0; -#ifdef CONFIG_X86_32 - /* Only happens when SEP is enabled, no need to test "SEP"arately: */ - if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { - tss->x86_tss.ss1 = thread->sysenter_cs; - wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); - } -#endif } static inline void native_swapgs(void) diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index fcc5cd387fd1..7ae8caffbada 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -72,4 +72,16 @@ do { \ ((last) = __switch_to_asm((prev), (next))); \ } while (0) +#ifdef CONFIG_X86_32 +static inline void refresh_sysenter_cs(struct thread_struct *thread) +{ + /* Only happens when SEP is enabled, no need to test "SEP"arately: */ + if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs)) + return; 
+ + this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs); + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); +} +#endif + #endif /* _ASM_X86_SWITCH_TO_H */ diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 11966251cd42..0936ed3da6b6 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -284,9 +284,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Reload esp0 and cpu_current_top_of_stack. This changes - * current_thread_info(). + * current_thread_info(). Refresh the SYSENTER configuration in + * case prev or next is vm86. */ load_sp0(tss, next); + refresh_sysenter_cs(next); this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 302e7b2572d1..a6ff6d1a0110 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -464,7 +464,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ this_cpu_write(current_task, next_p); - /* Reload esp0 and ss1. This changes current_thread_info(). */ + /* Reload sp0. 
*/ load_sp0(tss, next); /* diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 7924a5356c8a..5bc1c3ab6287 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -54,6 +54,7 @@ #include #include #include +#include /* * Known problems: @@ -149,6 +150,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) tsk->thread.sp0 = vm86->saved_sp0; tsk->thread.sysenter_cs = __KERNEL_CS; load_sp0(tss, &tsk->thread); + refresh_sysenter_cs(&tsk->thread); vm86->saved_sp0 = 0; put_cpu(); @@ -368,8 +370,10 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) /* make room for real-mode segments */ tsk->thread.sp0 += 16; - if (static_cpu_has(X86_FEATURE_SEP)) + if (static_cpu_has(X86_FEATURE_SEP)) { tsk->thread.sysenter_cs = 0; + refresh_sysenter_cs(&tsk->thread); + } load_sp0(tss, &tsk->thread); put_cpu(); -- cgit v1.2.3 From da51da189a24bb9b7e2d5a123be096e51a4695a5 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:10 -0700 Subject: x86/entry/64: Pass SP0 directly to load_sp0() load_sp0() had an odd signature: void load_sp0(struct tss_struct *tss, struct thread_struct *thread); Simplify it to: void load_sp0(unsigned long sp0); Also simplify a few get_cpu()/put_cpu() sequences to preempt_disable()/preempt_enable(). 
Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2655d8b42ed940aa384fe18ee1129bbbcf730a08.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/paravirt.h | 5 ++--- arch/x86/include/asm/paravirt_types.h | 2 +- arch/x86/include/asm/processor.h | 9 ++++----- arch/x86/kernel/cpu/common.c | 4 ++-- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/vm86_32.c | 14 ++++++-------- arch/x86/xen/enlighten_pv.c | 7 +++---- 8 files changed, 20 insertions(+), 25 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 12deec722cf0..43d4f90edebc 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -15,10 +15,9 @@ #include #include -static inline void load_sp0(struct tss_struct *tss, - struct thread_struct *thread) +static inline void load_sp0(unsigned long sp0) { - PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread); + PVOP_VCALL1(pv_cpu_ops.load_sp0, sp0); } /* The paravirtualized CPUID instruction. 
*/ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 280d94c36dad..a916788ac478 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -133,7 +133,7 @@ struct pv_cpu_ops { void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries); void (*free_ldt)(struct desc_struct *ldt, unsigned entries); - void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t); + void (*load_sp0)(unsigned long sp0); void (*set_iopl_mask)(unsigned mask); diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 0167e3e35a57..064b84722166 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -517,9 +517,9 @@ static inline void native_set_iopl_mask(unsigned mask) } static inline void -native_load_sp0(struct tss_struct *tss, struct thread_struct *thread) +native_load_sp0(unsigned long sp0) { - tss->x86_tss.sp0 = thread->sp0; + this_cpu_write(cpu_tss.x86_tss.sp0, sp0); } static inline void native_swapgs(void) @@ -544,10 +544,9 @@ static inline unsigned long current_top_of_stack(void) #else #define __cpuid native_cpuid -static inline void load_sp0(struct tss_struct *tss, - struct thread_struct *thread) +static inline void load_sp0(unsigned long sp0) { - native_load_sp0(tss, thread); + native_load_sp0(sp0); } #define set_iopl_mask native_set_iopl_mask diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 03bb004bb15e..4e7fb9c3bfa5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1570,7 +1570,7 @@ void cpu_init(void) initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, me); - load_sp0(t, &current->thread); + load_sp0(current->thread.sp0); set_tss_desc(cpu, t); load_TR_desc(); load_mm_ldt(&init_mm); @@ -1625,7 +1625,7 @@ void cpu_init(void) initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, curr); - load_sp0(t, thread); + load_sp0(thread->sp0); set_tss_desc(cpu, t);
load_TR_desc(); load_mm_ldt(&init_mm); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 0936ed3da6b6..40b85870e429 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -287,7 +287,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * current_thread_info(). Refresh the SYSENTER configuration in * case prev or next is vm86. */ - load_sp0(tss, next); + load_sp0(next->sp0); refresh_sysenter_cs(next); this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index a6ff6d1a0110..2124304fb77a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -465,7 +465,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(current_task, next_p); /* Reload sp0. */ - load_sp0(tss, next); + load_sp0(next->sp0); /* * Now maybe reload the debug registers and handle I/O bitmaps diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 5bc1c3ab6287..0f1d92cd20ad 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -94,7 +94,6 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) { - struct tss_struct *tss; struct task_struct *tsk = current; struct vm86plus_struct __user *user; struct vm86 *vm86 = current->thread.vm86; @@ -146,13 +145,13 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) do_exit(SIGSEGV); } - tss = &per_cpu(cpu_tss, get_cpu()); + preempt_disable(); tsk->thread.sp0 = vm86->saved_sp0; tsk->thread.sysenter_cs = __KERNEL_CS; - load_sp0(tss, &tsk->thread); + load_sp0(tsk->thread.sp0); refresh_sysenter_cs(&tsk->thread); vm86->saved_sp0 = 0; - put_cpu(); + preempt_enable(); memcpy(&regs->pt, &vm86->regs32, sizeof(struct pt_regs)); @@ -238,7 +237,6 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg) static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) { -
struct tss_struct *tss; struct task_struct *tsk = current; struct vm86 *vm86 = tsk->thread.vm86; struct kernel_vm86_regs vm86regs; @@ -366,8 +364,8 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) vm86->saved_sp0 = tsk->thread.sp0; lazy_save_gs(vm86->regs32.gs); - tss = &per_cpu(cpu_tss, get_cpu()); /* make room for real-mode segments */ + preempt_disable(); tsk->thread.sp0 += 16; if (static_cpu_has(X86_FEATURE_SEP)) { @@ -375,8 +373,8 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) refresh_sysenter_cs(&tsk->thread); } - load_sp0(tss, &tsk->thread); - put_cpu(); + load_sp0(tsk->thread.sp0); + preempt_enable(); if (vm86->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk->mm); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 8da4eff19c2a..e7b213047724 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -810,15 +810,14 @@ static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, } } -static void xen_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) +static void xen_load_sp0(unsigned long sp0) { struct multicall_space mcs; mcs = xen_mc_entry(0); - MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); + MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0); xen_mc_issue(PARAVIRT_LAZY_CPU); - tss->x86_tss.sp0 = thread->sp0; + this_cpu_write(cpu_tss.x86_tss.sp0, sp0); } void xen_set_iopl_mask(unsigned mask) -- cgit v1.2.3 From 20bb83443ea79087b5e5f8dab4e9d80bb9bf7acb Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:13 -0700 Subject: x86/entry/64: Stop initializing TSS.sp0 at boot In my quest to get rid of thread_struct::sp0, I want to clean up or remove all of its readers. Two of them are in cpu_init() (32-bit and 64-bit), and they aren't needed. This is because we never enter userspace at all on the threads that CPUs are initialized in. Poison the initial TSS.sp0 and stop initializing it on CPU init. 
The comment text mostly comes from Dave Hansen. Thanks! Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/ee4a00540ad28c6cff475fbcc7769a4460acc861.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 13 ++++++++++--- arch/x86/kernel/process.c | 8 +++++++- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4e7fb9c3bfa5..cdf79ab628c2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1570,9 +1570,13 @@ void cpu_init(void) initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, me); - load_sp0(current->thread.sp0); + /* + * Initialize the TSS. Don't bother initializing sp0, as the initial + * task never enters user mode. + */ set_tss_desc(cpu, t); load_TR_desc(); + load_mm_ldt(&init_mm); clear_all_debug_regs(); @@ -1594,7 +1598,6 @@ void cpu_init(void) int cpu = smp_processor_id(); struct task_struct *curr = current; struct tss_struct *t = &per_cpu(cpu_tss, cpu); - struct thread_struct *thread = &curr->thread; wait_for_master_cpu(cpu); @@ -1625,9 +1628,13 @@ void cpu_init(void) initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, curr); - load_sp0(thread->sp0); + /* + * Initialize the TSS. Don't bother initializing sp0, as the initial + * task never enters user mode. 
+ */ set_tss_desc(cpu, t); load_TR_desc(); + load_mm_ldt(&init_mm); t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index bd6b85fac666..ff8a9acbcf8b 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -48,7 +48,13 @@ */ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { .x86_tss = { - .sp0 = TOP_OF_INIT_STACK, + /* + * .sp0 is only used when entering ring 0 from a lower + * privilege level. Since the init task never runs anything + * but ring 0 code, there is no need for a valid value here. + * Poison it. + */ + .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, #ifdef CONFIG_X86_32 .ss0 = __KERNEL_DS, .ss1 = __KERNEL_CS, -- cgit v1.2.3 From 46f5a10a721ce8dce8cc8fe55279b49e1c6b3288 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:14 -0700 Subject: x86/entry/64: Remove all remaining direct thread_struct::sp0 reads The only remaining readers are in context switch code or vm86(), and they all just want to update TSS.sp0 to match the current task. Replace them all with a new helper update_sp0().
Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2d231687f4ff288c9d9e98d7861b7df374246ac3.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/switch_to.h | 6 ++++++ arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/vm86_32.c | 4 ++-- 4 files changed, 10 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 7ae8caffbada..54e64d909725 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -84,4 +84,10 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread) } #endif +/* This is used when switching tasks or entering/exiting vm86 mode. */ +static inline void update_sp0(struct task_struct *task) +{ + load_sp0(task->thread.sp0); +} + #endif /* _ASM_X86_SWITCH_TO_H */ diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 40b85870e429..45bf0c5f93e1 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -287,7 +287,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * current_thread_info(). Refresh the SYSENTER configuration in * case prev or next is vm86. */ - load_sp0(next->sp0); + update_sp0(next_p); refresh_sysenter_cs(next); this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 2124304fb77a..45e380958392 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -465,7 +465,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(current_task, next_p); /* Reload sp0. 
*/ - load_sp0(next->sp0); + update_sp0(next_p); /* * Now maybe reload the debug registers and handle I/O bitmaps diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 0f1d92cd20ad..a7b44c75c642 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -148,7 +148,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) preempt_disable(); tsk->thread.sp0 = vm86->saved_sp0; tsk->thread.sysenter_cs = __KERNEL_CS; - load_sp0(tsk->thread.sp0); + update_sp0(tsk); refresh_sysenter_cs(&tsk->thread); vm86->saved_sp0 = 0; preempt_enable(); @@ -373,7 +373,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) refresh_sysenter_cs(&tsk->thread); } - load_sp0(tsk->thread.sp0); + update_sp0(tsk); preempt_enable(); if (vm86->flags & VM86_SCREEN_BITMAP) -- cgit v1.2.3 From cd493a6deb8b78eca280d05f7fa73fd69403ae29 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:15 -0700 Subject: x86/entry/32: Fix cpu_current_top_of_stack initialization at boot cpu_current_top_of_stack's initialization forgot about TOP_OF_KERNEL_STACK_PADDING. This bug didn't matter because the idle threads never enter user mode. 
Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e5e370a7e6e4fddd1c4e4cf619765d96bb874b21.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ad59edd84de7..06c18fe1c09e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -961,8 +961,7 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle) #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ irq_ctx_init(cpu); - per_cpu(cpu_current_top_of_stack, cpu) = - (unsigned long)task_stack_page(idle) + THREAD_SIZE; + per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle); #else initial_gs = per_cpu_offset(cpu); #endif -- cgit v1.2.3 From d375cf1530595e33961a8844192cddab913650e3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:16 -0700 Subject: x86/entry/64: Remove thread_struct::sp0 On x86_64, we can easily calculate sp0 when needed instead of storing it in thread_struct. On x86_32, a similar cleanup would be possible, but it would require cleaning up the vm86 code first, and that can wait for a later cleanup series. 
Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/719cd9c66c548c4350d98a90f050aee8b17f8919.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/compat.h | 1 + arch/x86/include/asm/processor.h | 28 +++++++++------------------- arch/x86/include/asm/switch_to.h | 6 ++++++ arch/x86/kernel/process_64.c | 1 - 4 files changed, 16 insertions(+), 20 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 5343c19814b3..948b6d8ec46f 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -6,6 +6,7 @@ */ #include #include +#include #include #include #include diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index ad59cec14239..ae2ae6d80674 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -430,7 +430,9 @@ typedef struct { struct thread_struct { /* Cached TLS descriptors: */ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; +#ifdef CONFIG_X86_32 unsigned long sp0; +#endif unsigned long sp; #ifdef CONFIG_X86_32 unsigned long sysenter_cs; @@ -797,6 +799,13 @@ static inline void spin_lock_prefetch(const void *x) #define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1)) +#define task_pt_regs(task) \ +({ \ + unsigned long __ptr = (unsigned long)task_stack_page(task); \ + __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \ + ((struct pt_regs *)__ptr) - 1; \ +}) + #ifdef CONFIG_X86_32 /* * User space process size: 3GB (default). @@ -816,23 +825,6 @@ static inline void spin_lock_prefetch(const void *x) .addr_limit = KERNEL_DS, \ } -/* - * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack. 
- * This is necessary to guarantee that the entire "struct pt_regs" - * is accessible even if the CPU haven't stored the SS/ESP registers - * on the stack (interrupt gate does not save these registers - * when switching to the same priv ring). - * Therefore beware: accessing the ss/esp fields of the - * "struct pt_regs" is possible, but they may contain the - * completely wrong values. - */ -#define task_pt_regs(task) \ -({ \ - unsigned long __ptr = (unsigned long)task_stack_page(task); \ - __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \ - ((struct pt_regs *)__ptr) - 1; \ -}) - #define KSTK_ESP(task) (task_pt_regs(task)->sp) #else @@ -866,11 +858,9 @@ static inline void spin_lock_prefetch(const void *x) #define STACK_TOP_MAX TASK_SIZE_MAX #define INIT_THREAD { \ - .sp0 = TOP_OF_INIT_STACK, \ .addr_limit = KERNEL_DS, \ } -#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) extern unsigned long KSTK_ESP(struct task_struct *task); #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 54e64d909725..010cd6e4eafc 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_SWITCH_TO_H #define _ASM_X86_SWITCH_TO_H +#include + struct task_struct; /* one of the stranger aspects of C forward declarations */ struct task_struct *__switch_to_asm(struct task_struct *prev, @@ -87,7 +89,11 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread) /* This is used when switching tasks or entering/exiting vm86 mode. 
*/ static inline void update_sp0(struct task_struct *task) { +#ifdef CONFIG_X86_32 load_sp0(task->thread.sp0); +#else + load_sp0(task_top_of_stack(task)); +#endif } #endif /* _ASM_X86_SWITCH_TO_H */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 45e380958392..eeeb34f85c25 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -274,7 +274,6 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, struct inactive_task_frame *frame; struct task_struct *me = current; - p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; childregs = task_pt_regs(p); fork_frame = container_of(childregs, struct fork_frame, regs); frame = &fork_frame->frame; -- cgit v1.2.3 From 3383642c2f9d4f5b4fa37436db4a109a1a10018c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:17 -0700 Subject: x86/traps: Use a new on_thread_stack() helper to clean up an assertion Let's keep the stack-related logic together rather than open-coding a comparison in an assertion in the traps code. 
Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/856b15bee1f55017b8f79d3758b0d51c48a08cf8.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 6 ++++++ arch/x86/kernel/traps.c | 3 +-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index ae2ae6d80674..f10dae14f951 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -541,6 +541,12 @@ static inline unsigned long current_top_of_stack(void) #endif } +static inline bool on_thread_stack(void) +{ + return (unsigned long)(current_top_of_stack() - + current_stack_pointer) < THREAD_SIZE; +} + #ifdef CONFIG_PARAVIRT #include #else diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 67db4f43309e..42a9c4458f5d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -141,8 +141,7 @@ void ist_begin_non_atomic(struct pt_regs *regs) * will catch asm bugs and any attempt to use ist_preempt_enable * from double_fault. */ - BUG_ON((unsigned long)(current_top_of_stack() - - current_stack_pointer) >= THREAD_SIZE); + BUG_ON(!on_thread_stack()); preempt_enable_no_resched(); } -- cgit v1.2.3