diff options
author | Ingo Molnar <mingo@kernel.org> | 2024-03-12 09:49:52 +0100 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2024-03-12 09:55:57 +0100 |
commit | 2e2bc42c8381d2c0e9604b59e49264821da29368 (patch) | |
tree | c158510b5e7942b3a0d6eb6807cbeacf96035798 /arch/x86/mm | |
parent | x86/sev: Move early startup code into .head.text section (diff) | |
parent | Merge tag 'x86_tdx_for_6.9' of git://git.kernel.org/pub/scm/linux/kernel/git/... (diff) | |
download | linux-2e2bc42c8381d2c0e9604b59e49264821da29368.tar.xz linux-2e2bc42c8381d2c0e9604b59e49264821da29368.zip |
Merge branch 'linus' into x86/boot, to resolve conflict
There's a new conflict with Linus's upstream tree, because
in the following merge conflict resolution in <asm/coco.h>:
38b334fc767e Merge tag 'x86_sev_for_v6.9_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Linus has resolved the conflicting placement of 'cc_mask' better
than the original commit:
1c811d403afd x86/sev: Fix position dependent variable references in startup code
... which was also done by an internal merge resolution:
2e5fc4786b7a Merge branch 'x86/sev' into x86/boot, to resolve conflicts and to pick up dependent tree
But Linus is right in 38b334fc767e, the 'cc_mask' declaration is sufficient
within the #ifdef CONFIG_ARCH_HAS_CC_PLATFORM block.
So instead of forcing Linus to do the same resolution again, merge in Linus's
tree and follow his conflict resolution.
Conflicts:
arch/x86/include/asm/coco.h
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/x86/mm')
-rw-r--r-- | arch/x86/mm/Makefile | 3 | ||||
-rw-r--r-- | arch/x86/mm/amdtopology.c | 34 | ||||
-rw-r--r-- | arch/x86/mm/debug_pagetables.c | 4 | ||||
-rw-r--r-- | arch/x86/mm/dump_pagetables.c | 4 | ||||
-rw-r--r-- | arch/x86/mm/extable.c | 78 | ||||
-rw-r--r-- | arch/x86/mm/fault.c | 32 | ||||
-rw-r--r-- | arch/x86/mm/mem_encrypt.c | 5 | ||||
-rw-r--r-- | arch/x86/mm/pat/memtype.c | 9 | ||||
-rw-r--r-- | arch/x86/mm/pat/set_memory.c | 43 | ||||
-rw-r--r-- | arch/x86/mm/pgtable.c | 4 | ||||
-rw-r--r-- | arch/x86/mm/tlb.c | 10 |
11 files changed, 156 insertions, 70 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index c80febc44cd2..428048e73bd2 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -16,6 +16,7 @@ KASAN_SANITIZE_pgprot.o := n KCSAN_SANITIZE := n # Avoid recursion by not calling KMSAN hooks for CEA code. KMSAN_SANITIZE_cpu_entry_area.o := n +KMSAN_SANITIZE_mem_encrypt_identity.o := n ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_mem_encrypt.o = -pg @@ -60,7 +61,7 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o -obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o +obj-$(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION) += pti.o obj-$(CONFIG_X86_MEM_ENCRYPT) += mem_encrypt.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_amd.o diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c index b3ca7d23e4b0..9332b36a1091 100644 --- a/arch/x86/mm/amdtopology.c +++ b/arch/x86/mm/amdtopology.c @@ -54,13 +54,11 @@ static __init int find_northbridge(void) int __init amd_numa_init(void) { - u64 start = PFN_PHYS(0); + unsigned int numnodes, cores, apicid; + u64 prevbase, start = PFN_PHYS(0); u64 end = PFN_PHYS(max_pfn); - unsigned numnodes; - u64 prevbase; - int i, j, nb; u32 nodeid, reg; - unsigned int bits, cores, apicid_base; + int i, j, nb; if (!early_pci_allowed()) return -EINVAL; @@ -158,26 +156,18 @@ int __init amd_numa_init(void) return -ENOENT; /* - * We seem to have valid NUMA configuration. Map apicids to nodes - * using the coreid bits from early_identify_cpu. + * We seem to have valid NUMA configuration. Map apicids to nodes + * using the size of the core domain in the APIC space. */ - bits = boot_cpu_data.x86_coreid_bits; - cores = 1 << bits; - apicid_base = 0; + cores = topology_get_domain_size(TOPO_CORE_DOMAIN); - /* - * get boot-time SMP configuration: - */ - early_get_smp_config(); + apicid = boot_cpu_physical_apicid; + if (apicid > 0) + pr_info("BSP APIC ID: %02x\n", apicid); - if (boot_cpu_physical_apicid > 0) { - pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); - apicid_base = boot_cpu_physical_apicid; + for_each_node_mask(i, numa_nodes_parsed) { + for (j = 0; j < cores; j++, apicid++) + set_apicid_to_node(apicid, i); } - - for_each_node_mask(i, numa_nodes_parsed) - for (j = apicid_base; j < cores + apicid_base; j++) - set_apicid_to_node((i << bits) + j, i); - return 0; } diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c index b43301cb2a80..ae5c213a1cb0 100644 --- a/arch/x86/mm/debug_pagetables.c +++ b/arch/x86/mm/debug_pagetables.c @@ -22,7 +22,7 @@ static int ptdump_curknl_show(struct seq_file *m, void *v) DEFINE_SHOW_ATTRIBUTE(ptdump_curknl); -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION static int ptdump_curusr_show(struct seq_file *m, void *v) { if (current->mm->pgd) @@ -54,7 +54,7 @@ static int __init pt_dump_debug_init(void) debugfs_create_file("current_kernel", 0400, dir, NULL, &ptdump_curknl_fops); -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION debugfs_create_file("current_user", 0400, dir, NULL, &ptdump_curusr_fops); #endif diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index e1b599ecbbc2..b7b88c1d91ec 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -408,7 +408,7 @@ void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm, bool user) { pgd_t *pgd = mm->pgd; -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION if (user && boot_cpu_has(X86_FEATURE_PTI)) pgd = kernel_to_user_pgdp(pgd); #endif @@ -418,7 +418,7 @@ EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs); void ptdump_walk_user_pgd_level_checkwx(void) { -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION pgd_t *pgd = INIT_PGD; if (!(__supported_pte_mask & _PAGE_NX) || diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 271dcb2deabc..b522933bfa56 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -6,6 +6,7 @@ #include <xen/xen.h> #include <asm/fpu/api.h> +#include <asm/fred.h> #include <asm/sev.h> #include <asm/traps.h> #include <asm/kdebug.h> @@ -223,6 +224,79 @@ static bool ex_handler_ucopy_len(const struct exception_table_entry *fixup, return ex_handler_uaccess(fixup, regs, trapnr, fault_address); } +#ifdef CONFIG_X86_FRED +static bool ex_handler_eretu(const struct exception_table_entry *fixup, + struct pt_regs *regs, unsigned long error_code) +{ + struct pt_regs *uregs = (struct pt_regs *)(regs->sp - offsetof(struct pt_regs, orig_ax)); + unsigned short ss = uregs->ss; + unsigned short cs = uregs->cs; + + /* + * Move the NMI bit from the invalid stack frame, which caused ERETU + * to fault, to the fault handler's stack frame, thus to unblock NMI + * with the fault handler's ERETS instruction ASAP if NMI is blocked. + */ + regs->fred_ss.nmi = uregs->fred_ss.nmi; + + /* + * Sync event information to uregs, i.e., the ERETU return frame, but + * is it safe to write to the ERETU return frame which is just above + * current event stack frame? + * + * The RSP used by FRED to push a stack frame is not the value in %rsp, + * it is calculated from %rsp with the following 2 steps: + * 1) RSP = %rsp - (IA32_FRED_CONFIG & 0x1c0) // Reserve N*64 bytes + * 2) RSP = RSP & ~0x3f // Align to a 64-byte cache line + * when an event delivery doesn't trigger a stack level change. + * + * Here is an example with N*64 (N=1) bytes reserved: + * + * 64-byte cache line ==> ______________ + * |___Reserved___| + * |__Event_data__| + * |_____SS_______| + * |_____RSP______| + * |_____FLAGS____| + * |_____CS_______| + * |_____IP_______| + * 64-byte cache line ==> |__Error_code__| <== ERETU return frame + * |______________| + * |______________| + * |______________| + * |______________| + * |______________| + * |______________| + * |______________| + * 64-byte cache line ==> |______________| <== RSP after step 1) and 2) + * |___Reserved___| + * |__Event_data__| + * |_____SS_______| + * |_____RSP______| + * |_____FLAGS____| + * |_____CS_______| + * |_____IP_______| + * 64-byte cache line ==> |__Error_code__| <== ERETS return frame + * + * Thus a new FRED stack frame will always be pushed below a previous + * FRED stack frame ((N*64) bytes may be reserved between), and it is + * safe to write to a previous FRED stack frame as they never overlap. + */ + fred_info(uregs)->edata = fred_event_data(regs); + uregs->ssx = regs->ssx; + uregs->fred_ss.ss = ss; + /* The NMI bit was moved away above */ + uregs->fred_ss.nmi = 0; + uregs->csx = regs->csx; + uregs->fred_cs.sl = 0; + uregs->fred_cs.wfe = 0; + uregs->cs = cs; + uregs->orig_ax = error_code; + + return ex_handler_default(fixup, regs); +} +#endif + int ex_get_fixup_type(unsigned long ip) { const struct exception_table_entry *e = search_exception_tables(ip); @@ -300,6 +374,10 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, return ex_handler_ucopy_len(e, regs, trapnr, fault_addr, reg, imm); case EX_TYPE_ZEROPAD: return ex_handler_zeropad(e, regs, fault_addr); +#ifdef CONFIG_X86_FRED + case EX_TYPE_ERETU: + return ex_handler_eretu(e, regs, error_code); +#endif } BUG(); } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8224567abdb6..402e08f6b7ec 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -34,6 +34,7 @@ #include <asm/kvm_para.h> /* kvm_handle_async_pf */ #include <asm/vdso.h> /* fixup_vdso_exception() */ #include <asm/irq_stack.h> +#include <asm/fred.h> #include <asm/sev.h> /* snp_dump_hva_rmpentry() */ #define CREATE_TRACE_POINTS @@ -1298,21 +1299,14 @@ void do_user_addr_fault(struct pt_regs *regs, return; } - /* - * It's safe to allow irq's after cr2 has been saved and the - * vmalloc fault has been handled. - * - * User-mode registers count as a user access even for any - * potential system fault or CPU buglet: - */ - if (user_mode(regs)) { - local_irq_enable(); - flags |= FAULT_FLAG_USER; - } else { - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); + /* Legacy check - remove this after verifying that it doesn't trigger */ + if (WARN_ON_ONCE(!(regs->flags & X86_EFLAGS_IF))) { + bad_area_nosemaphore(regs, error_code, address); + return; } + local_irq_enable(); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* @@ -1328,6 +1322,14 @@ void do_user_addr_fault(struct pt_regs *regs, if (error_code & X86_PF_INSTR) flags |= FAULT_FLAG_INSTRUCTION; + /* + * We set FAULT_FLAG_USER based on the register state, not + * based on X86_PF_USER. User space accesses that cause + * system page faults are still user accesses. + */ + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + #ifdef CONFIG_X86_64 /* * Faults in the vsyscall page might need emulation. The @@ -1514,8 +1516,10 @@ handle_page_fault(struct pt_regs *regs, unsigned long error_code, DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) { - unsigned long address = read_cr2(); irqentry_state_t state; + unsigned long address; + + address = cpu_feature_enabled(X86_FEATURE_FRED) ? fred_event_data(regs) : read_cr2(); prefetchw(¤t->mm->mmap_lock); diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index d035bce3a2b0..6f3b3e028718 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -14,6 +14,8 @@ #include <linux/mem_encrypt.h> #include <linux/virtio_anchor.h> +#include <asm/sev.h> + /* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */ bool force_dma_unencrypted(struct device *dev) { @@ -74,6 +76,9 @@ static void print_mem_encrypt_feature_info(void) pr_cont(" SEV-SNP"); pr_cont("\n"); + + sev_show_status(); + break; default: pr_cont("Unknown\n"); diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 0904d7e8e126..0d72183b5dd0 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -240,6 +240,8 @@ void pat_cpu_init(void) } wrmsrl(MSR_IA32_CR_PAT, pat_msr_val); + + __flush_tlb_all(); } /** @@ -296,13 +298,8 @@ void __init pat_bp_init(void) /* * Xen PV doesn't allow to set PAT MSR, but all cache modes are * supported. - * When running as TDX guest setting the PAT MSR won't work either - * due to the requirement to set CR0.CD when doing so. Rely on - * firmware to have set the PAT MSR correctly. */ - if (pat_disabled || - cpu_feature_enabled(X86_FEATURE_XENPV) || - cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { + if (pat_disabled || cpu_feature_enabled(X86_FEATURE_XENPV)) { init_cache_modes(pat_msr_val); return; } diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index e9b448d1b1b7..e5b2985a7c51 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -755,10 +755,14 @@ pmd_t *lookup_pmd_address(unsigned long address) * areas on 32-bit NUMA systems. The percpu areas can * end up in this kind of memory, for instance. * - * This could be optimized, but it is only intended to be - * used at initialization time, and keeping it - * unoptimized should increase the testing coverage for - * the more obscure platforms. + * Note that as long as the PTEs are well-formed with correct PFNs, this + * works without checking the PRESENT bit in the leaf PTE. This is unlike + * the similar vmalloc_to_page() and derivatives. Callers may depend on + * this behavior. + * + * This could be optimized, but it is only used in paths that are not perf + * sensitive, and keeping it unoptimized should increase the testing coverage + * for the more obscure platforms. */ phys_addr_t slow_virt_to_phys(void *__virt_addr) { @@ -2041,17 +2045,12 @@ int set_mce_nospec(unsigned long pfn) return rc; } -static int set_memory_p(unsigned long *addr, int numpages) -{ - return change_page_attr_set(addr, numpages, __pgprot(_PAGE_PRESENT), 0); -} - /* Restore full speculative operation to the pfn. */ int clear_mce_nospec(unsigned long pfn) { unsigned long addr = (unsigned long) pfn_to_kaddr(pfn); - return set_memory_p(&addr, 1); + return set_memory_p(addr, 1); } EXPORT_SYMBOL_GPL(clear_mce_nospec); #endif /* CONFIG_X86_64 */ @@ -2104,6 +2103,11 @@ int set_memory_np_noalias(unsigned long addr, int numpages) CPA_NO_CHECK_ALIAS, NULL); } +int set_memory_p(unsigned long addr, int numpages) +{ + return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); +} + int set_memory_4k(unsigned long addr, int numpages) { return change_page_attr_set_clr(&addr, numpages, __pgprot(0), @@ -2153,7 +2157,7 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc) /* Notify hypervisor that we are about to set/clr encryption attribute. */ if (!x86_platform.guest.enc_status_change_prepare(addr, numpages, enc)) - return -EIO; + goto vmm_fail; ret = __change_page_attr_set_clr(&cpa, 1); @@ -2166,13 +2170,20 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc) */ cpa_flush(&cpa, 0); + if (ret) + return ret; + /* Notify hypervisor that we have successfully set/clr encryption attribute. */ - if (!ret) { - if (!x86_platform.guest.enc_status_change_finish(addr, numpages, enc)) - ret = -EIO; - } + if (!x86_platform.guest.enc_status_change_finish(addr, numpages, enc)) + goto vmm_fail; - return ret; + return 0; + +vmm_fail: + WARN_ONCE(1, "CPA VMM failure to convert memory (addr=%p, numpages=%d) to %s.\n", + (void *)addr, numpages, enc ? "private" : "shared"); + + return -EIO; } static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 0cbc1b8e8e3d..cceb779d882d 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -293,7 +293,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) for (i = 0; i < PREALLOCATED_PMDS; i++) mop_up_one_pmd(mm, &pgdp[i]); -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION if (!boot_cpu_has(X86_FEATURE_PTI)) return; @@ -325,7 +325,7 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) } } -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION static void pgd_prepopulate_user_pmd(struct mm_struct *mm, pgd_t *k_pgd, pmd_t *pmds[]) { diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 5768d386efab..4af930947380 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -89,10 +89,10 @@ #define CR3_HW_ASID_BITS 12 /* - * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for + * When enabled, MITIGATION_PAGE_TABLE_ISOLATION consumes a single bit for * user/kernel switches */ -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION # define PTI_CONSUMED_PCID_BITS 1 #else # define PTI_CONSUMED_PCID_BITS 0 @@ -114,7 +114,7 @@ static inline u16 kern_pcid(u16 asid) { VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* * Make sure that the dynamic ASID space does not conflict with the * bit we are using to switch between user and kernel ASIDs. @@ -149,7 +149,7 @@ static inline u16 kern_pcid(u16 asid) static inline u16 user_pcid(u16 asid) { u16 ret = kern_pcid(asid); -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION ret |= 1 << X86_CR3_PTI_PCID_USER_BIT; #endif return ret; @@ -262,7 +262,7 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, static inline void invalidate_user_asid(u16 asid) { /* There is no user ASID if address space separation is off */ - if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + if (!IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) return; /* |