diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-05-05 19:17:05 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-05-05 19:17:05 +0200 |
commit | d099637d074b9d8170b06365f575f6cf03d614f5 (patch) | |
tree | f34bb415e91cc9a43fcae7305b4aa1dd4f2ab21e /arch | |
parent | Merge tag 'irq-urgent-2024-05-05' of git://git.kernel.org/pub/scm/linux/kerne... (diff) | |
parent | x86/mm: Remove broken vsyscall emulation code from the page fault code (diff) | |
download | linux-d099637d074b9d8170b06365f575f6cf03d614f5.tar.xz linux-d099637d074b9d8170b06365f575f6cf03d614f5.zip |
Merge tag 'x86-urgent-2024-05-05' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull misc x86 fixes from Ingo Molnar:
- Remove the broken vsyscall emulation code from
the page fault code
- Fix kexec crash triggered by certain SEV RMP
table layouts
- Fix unchecked MSR access error when disabling
the x2APIC via iommu=off
* tag 'x86-urgent-2024-05-05' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/mm: Remove broken vsyscall emulation code from the page fault code
x86/apic: Don't access the APIC when disabling x2APIC
x86/sev: Add callback to apply RMP table fixups for kexec
x86/e820: Add a new e820 table update helper
Diffstat (limited to 'arch')
-rw-r--r-- | arch/x86/entry/vsyscall/vsyscall_64.c | 28 | ||||
-rw-r--r-- | arch/x86/include/asm/e820/api.h | 1 | ||||
-rw-r--r-- | arch/x86/include/asm/processor.h | 1 | ||||
-rw-r--r-- | arch/x86/include/asm/sev.h | 2 | ||||
-rw-r--r-- | arch/x86/kernel/apic/apic.c | 16 | ||||
-rw-r--r-- | arch/x86/kernel/e820.c | 7 | ||||
-rw-r--r-- | arch/x86/mm/fault.c | 33 | ||||
-rw-r--r-- | arch/x86/mm/mem_encrypt.c | 7 | ||||
-rw-r--r-- | arch/x86/virt/svm/sev.c | 36 |
9 files changed, 64 insertions, 67 deletions
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index a3c0df11d0e6..2fb7d53cf333 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -98,11 +98,6 @@ static int addr_to_vsyscall_nr(unsigned long addr) static bool write_ok_or_segv(unsigned long ptr, size_t size) { - /* - * XXX: if access_ok, get_user, and put_user handled - * sig_on_uaccess_err, this could go away. - */ - if (!access_ok((void __user *)ptr, size)) { struct thread_struct *thread = ¤t->thread; @@ -120,10 +115,8 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size) bool emulate_vsyscall(unsigned long error_code, struct pt_regs *regs, unsigned long address) { - struct task_struct *tsk; unsigned long caller; int vsyscall_nr, syscall_nr, tmp; - int prev_sig_on_uaccess_err; long ret; unsigned long orig_dx; @@ -172,8 +165,6 @@ bool emulate_vsyscall(unsigned long error_code, goto sigsegv; } - tsk = current; - /* * Check for access_ok violations and find the syscall nr. * @@ -234,12 +225,8 @@ bool emulate_vsyscall(unsigned long error_code, goto do_ret; /* skip requested */ /* - * With a real vsyscall, page faults cause SIGSEGV. We want to - * preserve that behavior to make writing exploits harder. + * With a real vsyscall, page faults cause SIGSEGV. */ - prev_sig_on_uaccess_err = current->thread.sig_on_uaccess_err; - current->thread.sig_on_uaccess_err = 1; - ret = -EFAULT; switch (vsyscall_nr) { case 0: @@ -262,23 +249,12 @@ bool emulate_vsyscall(unsigned long error_code, break; } - current->thread.sig_on_uaccess_err = prev_sig_on_uaccess_err; - check_fault: if (ret == -EFAULT) { /* Bad news -- userspace fed a bad pointer to a vsyscall. */ warn_bad_vsyscall(KERN_INFO, regs, "vsyscall fault (exploit attempt?)"); - - /* - * If we failed to generate a signal for any reason, - * generate one here. (This should be impossible.) - */ - if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) && - !sigismember(&tsk->pending.signal, SIGSEGV))) - goto sigsegv; - - return true; /* Don't emulate the ret. */ + goto sigsegv; } regs->ax = ret; diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h index e8f58ddd06d9..2e74a7f0e935 100644 --- a/arch/x86/include/asm/e820/api.h +++ b/arch/x86/include/asm/e820/api.h @@ -17,6 +17,7 @@ extern bool e820__mapped_all(u64 start, u64 end, enum e820_type type); extern void e820__range_add (u64 start, u64 size, enum e820_type type); extern u64 e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type); extern u64 e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type); +extern u64 e820__range_update_table(struct e820_table *t, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type); extern void e820__print_table(char *who); extern int e820__update_table(struct e820_table *table); diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 811548f131f4..78e51b0d6433 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -472,7 +472,6 @@ struct thread_struct { unsigned long iopl_emul; unsigned int iopl_warn:1; - unsigned int sig_on_uaccess_err:1; /* * Protection Keys Register for Userspace. Loaded immediately on diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 7f57382afee4..93ed60080cfe 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -269,6 +269,7 @@ int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immut int rmp_make_shared(u64 pfn, enum pg_level level); void snp_leak_pages(u64 pfn, unsigned int npages); void kdump_sev_callback(void); +void snp_fixup_e820_tables(void); #else static inline bool snp_probe_rmptable_info(void) { return false; } static inline int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level) { return -ENODEV; } @@ -282,6 +283,7 @@ static inline int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 as static inline int rmp_make_shared(u64 pfn, enum pg_level level) { return -ENODEV; } static inline void snp_leak_pages(u64 pfn, unsigned int npages) {} static inline void kdump_sev_callback(void) { } +static inline void snp_fixup_e820_tables(void) {} #endif #endif diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index c342c4aa9c68..803dcfb0e346 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1771,7 +1771,7 @@ void x2apic_setup(void) __x2apic_enable(); } -static __init void apic_set_fixmap(void); +static __init void apic_set_fixmap(bool read_apic); static __init void x2apic_disable(void) { @@ -1793,7 +1793,12 @@ static __init void x2apic_disable(void) } __x2apic_disable(); - apic_set_fixmap(); + /* + * Don't reread the APIC ID as it was already done from + * check_x2apic() and the APIC driver still is a x2APIC variant, + * which fails to do the read after x2APIC was disabled. + */ + apic_set_fixmap(false); } static __init void x2apic_enable(void) @@ -2057,13 +2062,14 @@ void __init init_apic_mappings(void) } } -static __init void apic_set_fixmap(void) +static __init void apic_set_fixmap(bool read_apic) { set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); apic_mmio_base = APIC_BASE; apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", apic_mmio_base, mp_lapic_addr); - apic_read_boot_cpu_id(false); + if (read_apic) + apic_read_boot_cpu_id(false); } void __init register_lapic_address(unsigned long address) @@ -2073,7 +2079,7 @@ void __init register_lapic_address(unsigned long address) mp_lapic_addr = address; if (!x2apic_mode) - apic_set_fixmap(); + apic_set_fixmap(true); } /* diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 6f1b379e3b38..68b09f718f10 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -532,9 +532,10 @@ u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum return __e820__range_update(e820_table, start, size, old_type, new_type); } -static u64 __init e820__range_update_kexec(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) +u64 __init e820__range_update_table(struct e820_table *t, u64 start, u64 size, + enum e820_type old_type, enum e820_type new_type) { - return __e820__range_update(e820_table_kexec, start, size, old_type, new_type); + return __e820__range_update(t, start, size, old_type, new_type); } /* Remove a range of memory from the E820 table: */ @@ -806,7 +807,7 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align) addr = memblock_phys_alloc(size, align); if (addr) { - e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); + e820__range_update_table(e820_table_kexec, addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); pr_info("update e820_table_kexec for e820__memblock_alloc_reserved()\n"); e820__update_table_kexec(); } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 622d12ec7f08..bba4e020dd64 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -723,39 +723,8 @@ kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code, WARN_ON_ONCE(user_mode(regs)); /* Are we prepared to handle this kernel fault? */ - if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) { - /* - * Any interrupt that takes a fault gets the fixup. This makes - * the below recursive fault logic only apply to a faults from - * task context. - */ - if (in_interrupt()) - return; - - /* - * Per the above we're !in_interrupt(), aka. task context. - * - * In this case we need to make sure we're not recursively - * faulting through the emulate_vsyscall() logic. - */ - if (current->thread.sig_on_uaccess_err && signal) { - sanitize_error_code(address, &error_code); - - set_signal_archinfo(address, error_code); - - if (si_code == SEGV_PKUERR) { - force_sig_pkuerr((void __user *)address, pkey); - } else { - /* XXX: hwpoison faults will set the wrong code. */ - force_sig_fault(signal, si_code, (void __user *)address); - } - } - - /* - * Barring that, we can do the fixup and be happy. - */ + if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) return; - } /* * AMD erratum #91 manifests as a spurious page fault on a PREFETCH diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 6f3b3e028718..0a120d85d7bb 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -102,6 +102,13 @@ void __init mem_encrypt_setup_arch(void) phys_addr_t total_mem = memblock_phys_mem_size(); unsigned long size; + /* + * Do RMP table fixups after the e820 tables have been setup by + * e820__memory_setup(). + */ + if (cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + snp_fixup_e820_tables(); + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) return; diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c index ab0e8448bb6e..0ae10535c699 100644 --- a/arch/x86/virt/svm/sev.c +++ b/arch/x86/virt/svm/sev.c @@ -163,6 +163,42 @@ bool snp_probe_rmptable_info(void) return true; } +static void __init __snp_fixup_e820_tables(u64 pa) +{ + if (IS_ALIGNED(pa, PMD_SIZE)) + return; + + /* + * Handle cases where the RMP table placement by the BIOS is not + * 2M aligned and the kexec kernel could try to allocate + * from within that chunk which then causes a fatal RMP fault. + * + * The e820_table needs to be updated as it is converted to + * kernel memory resources and used by KEXEC_FILE_LOAD syscall + * to load kexec segments. + * + * The e820_table_firmware needs to be updated as it is exposed + * to sysfs and used by the KEXEC_LOAD syscall to load kexec + * segments. + * + * The e820_table_kexec needs to be updated as it passed to + * the kexec-ed kernel. + */ + pa = ALIGN_DOWN(pa, PMD_SIZE); + if (e820__mapped_any(pa, pa + PMD_SIZE, E820_TYPE_RAM)) { + pr_info("Reserving start/end of RMP table on a 2MB boundary [0x%016llx]\n", pa); + e820__range_update(pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); + e820__range_update_table(e820_table_kexec, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); + e820__range_update_table(e820_table_firmware, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); + } +} + +void __init snp_fixup_e820_tables(void) +{ + __snp_fixup_e820_tables(probed_rmp_base); + __snp_fixup_e820_tables(probed_rmp_base + probed_rmp_size); +} + /* * Do the necessary preparations which are verified by the firmware as * described in the SNP_INIT_EX firmware command description in the SNP |