From b82a8dbd3d2f4563156f7150c6f2ecab6e960b30 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 4 Dec 2023 11:31:38 +0300 Subject: x86/coco: Disable 32-bit emulation by default on TDX and SEV MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The INT 0x80 instruction is used for 32-bit x86 Linux syscalls. The kernel expects to receive a software interrupt as a result of the INT 0x80 instruction. However, an external interrupt on the same vector triggers the same handler. The kernel interprets an external interrupt on vector 0x80 as a 32-bit system call that came from userspace. A VMM can inject external interrupts on any arbitrary vector at any time. This remains true even for TDX and SEV guests where the VMM is untrusted. Put together, this allows an untrusted VMM to trigger int80 syscall handling at any given point. The content of the guest register file at that moment defines what syscall is triggered and its arguments. It opens the guest OS to manipulation from the VMM side. Disable 32-bit emulation by default for TDX and SEV. User can override it with the ia32_emulation=y command line option. [ dhansen: reword the changelog ] Reported-by: Supraja Sridhara Reported-by: Benedict Schlüter Reported-by: Mark Kuhne Reported-by: Andrin Bertschi Reported-by: Shweta Shinde Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov (AMD) Cc: # v6.0+: 1da5c9b x86: Introduce ia32_enabled() Cc: # v6.0+ --- arch/x86/coco/tdx/tdx.c | 10 ++++++++++ arch/x86/include/asm/ia32.h | 7 +++++++ arch/x86/mm/mem_encrypt_amd.c | 11 +++++++++++ 3 files changed, 28 insertions(+) (limited to 'arch') diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 1b5d17a9f70d..545e54b829b2 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -891,5 +892,14 @@ void __init tdx_early_init(void) */ x86_cpuinit.parallel_bringup = false; + /* + * The VMM is capable of injecting interrupt 0x80 and triggering the + * compatibility syscall path. + * + * By default, the 32-bit emulation is disabled in order to ensure + * the safety of the VM. + */ + ia32_disable(); + pr_info("Guest detected\n"); } diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index 5a2ae24b1204..9805629479d9 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -75,6 +75,11 @@ static inline bool ia32_enabled(void) return __ia32_enabled; } +static inline void ia32_disable(void) +{ + __ia32_enabled = false; +} + #else /* !CONFIG_IA32_EMULATION */ static inline bool ia32_enabled(void) @@ -82,6 +87,8 @@ static inline bool ia32_enabled(void) return IS_ENABLED(CONFIG_X86_32); } +static inline void ia32_disable(void) {} + #endif #endif /* _ASM_X86_IA32_H */ diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index a68f2dda0948..70b91de2e053 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "mm_internal.h" @@ -481,6 +482,16 @@ void __init sme_early_init(void) */ if (sev_status & MSR_AMD64_SEV_ES_ENABLED) x86_cpuinit.parallel_bringup = false; + + /* + * The VMM is capable of injecting interrupt 0x80 and triggering the + * compatibility syscall path. + * + * By default, the 32-bit emulation is disabled in order to ensure + * the safety of the VM. + */ + if (sev_status & MSR_AMD64_SEV_ENABLED) + ia32_disable(); } void __init mem_encrypt_free_decrypted_mem(void) -- cgit v1.2.3 From be5341eb0d43b1e754799498bd2e8756cc167a41 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Dec 2023 11:31:39 +0300 Subject: x86/entry: Convert INT 0x80 emulation to IDTENTRY There is no real reason to have a separate ASM entry point implementation for the legacy INT 0x80 syscall emulation on 64-bit. IDTENTRY provides all the functionality needed with the only difference that it does not: - save the syscall number (AX) into pt_regs::orig_ax - set pt_regs::ax to -ENOSYS Both can be done safely in the C code of an IDTENTRY before invoking any of the syscall related functions which depend on this convention. Aside of ASM code reduction this prepares for detecting and handling a local APIC injected vector 0x80. [ kirill.shutemov: More verbose comments ] Suggested-by: Linus Torvalds Signed-off-by: Thomas Gleixner Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Cc: # v6.0+ --- arch/x86/entry/common.c | 58 +++++++++++++++++++++++++++++- arch/x86/entry/entry_64_compat.S | 77 ---------------------------------------- arch/x86/include/asm/idtentry.h | 4 +++ arch/x86/include/asm/proto.h | 4 --- arch/x86/kernel/idt.c | 2 +- arch/x86/xen/enlighten_pv.c | 2 +- arch/x86/xen/xen-asm.S | 2 +- 7 files changed, 64 insertions(+), 85 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index d813160b14d8..8ca4d57d68eb 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -167,7 +167,62 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr) } } -/* Handles int $0x80 */ +#ifdef CONFIG_IA32_EMULATION +/** + * int80_emulation - 32-bit legacy syscall entry + * + * This entry point can be used by 32-bit and 64-bit programs to perform + * 32-bit system calls. Instances of INT $0x80 can be found inline in + * various programs and libraries. It is also used by the vDSO's + * __kernel_vsyscall fallback for hardware that doesn't support a faster + * entry method. Restarted 32-bit system calls also fall back to INT + * $0x80 regardless of what instruction was originally used to do the + * system call. + * + * This is considered a slow path. It is not used by most libc + * implementations on modern hardware except during process startup. + * + * The arguments for the INT $0x80 based syscall are on stack in the + * pt_regs structure: + * eax: system call number + * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6 + */ +DEFINE_IDTENTRY_RAW(int80_emulation) +{ + int nr; + + /* Establish kernel context. */ + enter_from_user_mode(regs); + + instrumentation_begin(); + add_random_kstack_offset(); + + /* + * The low level idtentry code pushed -1 into regs::orig_ax + * and regs::ax contains the syscall number. + * + * User tracing code (ptrace or signal handlers) might assume + * that the regs::orig_ax contains a 32-bit number on invoking + * a 32-bit syscall. + * + * Establish the syscall convention by saving the 32bit truncated + * syscall number in regs::orig_ax and by invalidating regs::ax. + */ + regs->orig_ax = regs->ax & GENMASK(31, 0); + regs->ax = -ENOSYS; + + nr = syscall_32_enter(regs); + + local_irq_enable(); + nr = syscall_enter_from_user_mode_work(regs, nr); + do_syscall_32_irqs_on(regs, nr); + + instrumentation_end(); + syscall_exit_to_user_mode(regs); +} +#else /* CONFIG_IA32_EMULATION */ + +/* Handles int $0x80 on a 32bit kernel */ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) { int nr = syscall_32_enter(regs); @@ -186,6 +241,7 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) instrumentation_end(); syscall_exit_to_user_mode(regs); } +#endif /* !CONFIG_IA32_EMULATION */ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) { diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 27c05d08558a..de94e2e84ecc 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -275,80 +275,3 @@ SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR int3 SYM_CODE_END(entry_SYSCALL_compat) - -/* - * 32-bit legacy system call entry. - * - * 32-bit x86 Linux system calls traditionally used the INT $0x80 - * instruction. INT $0x80 lands here. - * - * This entry point can be used by 32-bit and 64-bit programs to perform - * 32-bit system calls. Instances of INT $0x80 can be found inline in - * various programs and libraries. It is also used by the vDSO's - * __kernel_vsyscall fallback for hardware that doesn't support a faster - * entry method. Restarted 32-bit system calls also fall back to INT - * $0x80 regardless of what instruction was originally used to do the - * system call. - * - * This is considered a slow path. It is not used by most libc - * implementations on modern hardware except during process startup. - * - * Arguments: - * eax system call number - * ebx arg1 - * ecx arg2 - * edx arg3 - * esi arg4 - * edi arg5 - * ebp arg6 - */ -SYM_CODE_START(entry_INT80_compat) - UNWIND_HINT_ENTRY - ENDBR - /* - * Interrupts are off on entry. - */ - ASM_CLAC /* Do this early to minimize exposure */ - ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV - - /* - * User tracing code (ptrace or signal handlers) might assume that - * the saved RAX contains a 32-bit number when we're invoking a 32-bit - * syscall. Just in case the high bits are nonzero, zero-extend - * the syscall number. (This could almost certainly be deleted - * with no ill effects.) - */ - movl %eax, %eax - - /* switch to thread stack expects orig_ax and rdi to be pushed */ - pushq %rax /* pt_regs->orig_ax */ - - /* Need to switch before accessing the thread stack. */ - SWITCH_TO_KERNEL_CR3 scratch_reg=%rax - - /* In the Xen PV case we already run on the thread stack. */ - ALTERNATIVE "", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV - - movq %rsp, %rax - movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp - - pushq 5*8(%rax) /* regs->ss */ - pushq 4*8(%rax) /* regs->rsp */ - pushq 3*8(%rax) /* regs->eflags */ - pushq 2*8(%rax) /* regs->cs */ - pushq 1*8(%rax) /* regs->ip */ - pushq 0*8(%rax) /* regs->orig_ax */ -.Lint80_keep_stack: - - PUSH_AND_CLEAR_REGS rax=$-ENOSYS - UNWIND_HINT_REGS - - cld - - IBRS_ENTER - UNTRAIN_RET - - movq %rsp, %rdi - call do_int80_syscall_32 - jmp swapgs_restore_regs_and_return_to_usermode -SYM_CODE_END(entry_INT80_compat) diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 05fd175cec7d..13639e57e1f8 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -569,6 +569,10 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_UD, exc_invalid_op); DECLARE_IDTENTRY_RAW(X86_TRAP_BP, exc_int3); DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_PF, exc_page_fault); +#if defined(CONFIG_IA32_EMULATION) +DECLARE_IDTENTRY_RAW(IA32_SYSCALL_VECTOR, int80_emulation); +#endif + #ifdef CONFIG_X86_MCE #ifdef CONFIG_X86_64 DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check); diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 4d84122bd643..484f4f0131a5 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -32,10 +32,6 @@ void entry_SYSCALL_compat(void); void entry_SYSCALL_compat_safe_stack(void); void entry_SYSRETL_compat_unsafe_stack(void); void entry_SYSRETL_compat_end(void); -void entry_INT80_compat(void); -#ifdef CONFIG_XEN_PV -void xen_entry_INT80_compat(void); -#endif #else /* !CONFIG_IA32_EMULATION */ #define entry_SYSCALL_compat NULL #define entry_SYSENTER_compat NULL diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 8857abc706e4..660b601f1d6c 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -121,7 +121,7 @@ static const __initconst struct idt_data def_idts[] = { static const struct idt_data ia32_idt[] __initconst = { #if defined(CONFIG_IA32_EMULATION) - SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat), + SYSG(IA32_SYSCALL_VECTOR, asm_int80_emulation), #elif defined(CONFIG_X86_32) SYSG(IA32_SYSCALL_VECTOR, entry_INT80_32), #endif diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index bbbfdd495ebd..aeb33e0a3f76 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -704,7 +704,7 @@ static struct trap_array_entry trap_array[] = { TRAP_ENTRY(exc_int3, false ), TRAP_ENTRY(exc_overflow, false ), #ifdef CONFIG_IA32_EMULATION - { entry_INT80_compat, xen_entry_INT80_compat, false }, + TRAP_ENTRY(int80_emulation, false ), #endif TRAP_ENTRY(exc_page_fault, false ), TRAP_ENTRY(exc_divide_error, false ), diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 9e5e68008785..1a9cd18dfbd3 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -156,7 +156,7 @@ xen_pv_trap asm_xenpv_exc_machine_check #endif /* CONFIG_X86_MCE */ xen_pv_trap asm_exc_simd_coprocessor_error #ifdef CONFIG_IA32_EMULATION -xen_pv_trap entry_INT80_compat +xen_pv_trap asm_int80_emulation #endif xen_pv_trap asm_exc_xen_unknown_trap xen_pv_trap asm_exc_xen_hypervisor_callback -- cgit v1.2.3 From 55617fb991df535f953589586468612351575704 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Dec 2023 11:31:40 +0300 Subject: x86/entry: Do not allow external 0x80 interrupts The INT 0x80 instruction is used for 32-bit x86 Linux syscalls. The kernel expects to receive a software interrupt as a result of the INT 0x80 instruction. However, an external interrupt on the same vector also triggers the same codepath. An external interrupt on vector 0x80 will currently be interpreted as a 32-bit system call, and assuming that it was a user context. Panic on external interrupts on the vector. To distinguish software interrupts from external ones, the kernel checks the APIC ISR bit relevant to the 0x80 vector. For software interrupts, this bit will be 0. Signed-off-by: Thomas Gleixner Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Cc: # v6.0+ --- arch/x86/entry/common.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 8ca4d57d68eb..6356060caaf3 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -26,6 +26,7 @@ #include #endif +#include #include #include #include @@ -168,6 +169,25 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr) } #ifdef CONFIG_IA32_EMULATION +static __always_inline bool int80_is_external(void) +{ + const unsigned int offs = (0x80 / 32) * 0x10; + const u32 bit = BIT(0x80 % 32); + + /* The local APIC on XENPV guests is fake */ + if (cpu_feature_enabled(X86_FEATURE_XENPV)) + return false; + + /* + * If vector 0x80 is set in the APIC ISR then this is an external + * interrupt. Either from broken hardware or injected by a VMM. + * + * Note: In guest mode this is only valid for secure guests where + * the secure module fully controls the vAPIC exposed to the guest. + */ + return apic_read(APIC_ISR + offs) & bit; +} + /** * int80_emulation - 32-bit legacy syscall entry * @@ -191,12 +211,27 @@ DEFINE_IDTENTRY_RAW(int80_emulation) { int nr; - /* Establish kernel context. */ + /* Kernel does not use INT $0x80! */ + if (unlikely(!user_mode(regs))) { + irqentry_enter(regs); + instrumentation_begin(); + panic("Unexpected external interrupt 0x80\n"); + } + + /* + * Establish kernel context for instrumentation, including for + * int80_is_external() below which calls into the APIC driver. + * Identical for soft and external interrupts. + */ enter_from_user_mode(regs); instrumentation_begin(); add_random_kstack_offset(); + /* Validate that this is a soft interrupt to the extent possible */ + if (unlikely(int80_is_external())) + panic("Unexpected external interrupt 0x80\n"); + /* * The low level idtentry code pushed -1 into regs::orig_ax * and regs::ax contains the syscall number. -- cgit v1.2.3 From f4116bfc44621882556bbf70f5284fbf429a5cf6 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 4 Dec 2023 11:31:41 +0300 Subject: x86/tdx: Allow 32-bit emulation by default 32-bit emulation was disabled on TDX to prevent a possible attack by a VMM injecting an interrupt on vector 0x80. Now that int80_emulation() has a check for external interrupts the limitation can be lifted. To distinguish software interrupts from external ones, int80_emulation() checks the APIC ISR bit relevant to the 0x80 vector. For software interrupts, this bit will be 0. On TDX, the VAPIC state (including ISR) is protected and cannot be manipulated by the VMM. The ISR bit is set by the microcode flow during the handling of posted interrupts. [ dhansen: more changelog tweaks ] Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov (AMD) Cc: # v6.0+ --- arch/x86/coco/tdx/tdx.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 545e54b829b2..cf1f13c82175 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -892,14 +892,5 @@ void __init tdx_early_init(void) */ x86_cpuinit.parallel_bringup = false; - /* - * The VMM is capable of injecting interrupt 0x80 and triggering the - * compatibility syscall path. - * - * By default, the 32-bit emulation is disabled in order to ensure - * the safety of the VM. - */ - ia32_disable(); - pr_info("Guest detected\n"); } -- cgit v1.2.3