Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/extable.c        | 24
-rw-r--r--  arch/x86/mm/fault.c          | 80
-rw-r--r--  arch/x86/mm/init_64.c        | 20
-rw-r--r--  arch/x86/mm/ioremap.c        |  2
-rw-r--r--  arch/x86/mm/numa_emulation.c |  2
-rw-r--r--  arch/x86/mm/pat/set_memory.c |  2
-rw-r--r--  arch/x86/mm/tlb.c            | 37
7 files changed, 114 insertions(+), 53 deletions(-)
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 1d6cb07f4f86..5829457f7ca3 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -80,6 +80,18 @@ __visible bool ex_handler_uaccess(const struct exception_table_entry *fixup,
 }
 EXPORT_SYMBOL(ex_handler_uaccess);
 
+__visible bool ex_handler_copy(const struct exception_table_entry *fixup,
+			       struct pt_regs *regs, int trapnr,
+			       unsigned long error_code,
+			       unsigned long fault_addr)
+{
+	WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?");
+	regs->ip = ex_fixup_addr(fixup);
+	regs->ax = trapnr;
+	return true;
+}
+EXPORT_SYMBOL(ex_handler_copy);
+
 __visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup,
 				       struct pt_regs *regs, int trapnr,
 				       unsigned long error_code,
@@ -125,17 +137,21 @@ __visible bool ex_handler_clear_fs(const struct exception_table_entry *fixup,
 }
 EXPORT_SYMBOL(ex_handler_clear_fs);
 
-__visible bool ex_has_fault_handler(unsigned long ip)
+enum handler_type ex_get_fault_handler_type(unsigned long ip)
 {
 	const struct exception_table_entry *e;
 	ex_handler_t handler;
 
 	e = search_exception_tables(ip);
 	if (!e)
-		return false;
+		return EX_HANDLER_NONE;
 	handler = ex_fixup_handler(e);
-
-	return handler == ex_handler_fault;
+	if (handler == ex_handler_fault)
+		return EX_HANDLER_FAULT;
+	else if (handler == ex_handler_uaccess || handler == ex_handler_copy)
+		return EX_HANDLER_UACCESS;
+	else
+		return EX_HANDLER_OTHER;
 }
 
 int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
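[Editor's note on the extable.c hunk above: fault-handling code can now ask *which* kind of fixup covers a faulting instruction instead of only testing for ex_handler_fault. A minimal caller sketch follows; it is illustrative only, fixup_is_uaccess() is a made-up name, and it assumes the enum and prototype from the patch are in scope (they live in the x86 trap headers in this era).]

#include <asm/traps.h>	/* enum handler_type, ex_get_fault_handler_type() (assumed location) */
#include <asm/ptrace.h>	/* struct pt_regs */

/* Illustrative only: is the faulting IP covered by a user-access fixup? */
static bool fixup_is_uaccess(struct pt_regs *regs)
{
	/* EX_HANDLER_UACCESS covers both ex_handler_uaccess and ex_handler_copy. */
	return ex_get_fault_handler_type(regs->ip) == EX_HANDLER_UACCESS;
}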
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 35f1498e9832..42606a04ae85 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -190,6 +190,53 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
 	return pmd_k;
 }
 
+/*
+ * Handle a fault on the vmalloc or module mapping area
+ *
+ * This is needed because there is a race condition between the time
+ * when the vmalloc mapping code updates the PMD to the point in time
+ * where it synchronizes this update with the other page-tables in the
+ * system.
+ *
+ * In this race window another thread/CPU can map an area on the same
+ * PMD, finds it already present and does not synchronize it with the
+ * rest of the system yet. As a result v[mz]alloc might return areas
+ * which are not mapped in every page-table in the system, causing an
+ * unhandled page-fault when they are accessed.
+ */
+static noinline int vmalloc_fault(unsigned long address)
+{
+	unsigned long pgd_paddr;
+	pmd_t *pmd_k;
+	pte_t *pte_k;
+
+	/* Make sure we are in vmalloc area: */
+	if (!(address >= VMALLOC_START && address < VMALLOC_END))
+		return -1;
+
+	/*
+	 * Synchronize this task's top level page-table
+	 * with the 'reference' page table.
+	 *
+	 * Do _not_ use "current" here. We might be inside
+	 * an interrupt in the middle of a task switch..
+	 */
+	pgd_paddr = read_cr3_pa();
+	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
+	if (!pmd_k)
+		return -1;
+
+	if (pmd_large(*pmd_k))
+		return 0;
+
+	pte_k = pte_offset_kernel(pmd_k, address);
+	if (!pte_present(*pte_k))
+		return -1;
+
+	return 0;
+}
+NOKPROBE_SYMBOL(vmalloc_fault);
+
 void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
 {
 	unsigned long addr;
@@ -1081,7 +1128,7 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
 	return 0;
 }
 
-static int fault_in_kernel_space(unsigned long address)
+bool fault_in_kernel_space(unsigned long address)
 {
 	/*
 	 * On 64-bit systems, the vsyscall page is at an address above
@@ -1110,6 +1157,37 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
 	 */
 	WARN_ON_ONCE(hw_error_code & X86_PF_PK);
 
+#ifdef CONFIG_X86_32
+	/*
+	 * We can fault-in kernel-space virtual memory on-demand. The
+	 * 'reference' page table is init_mm.pgd.
+	 *
+	 * NOTE! We MUST NOT take any locks for this case. We may
+	 * be in an interrupt or a critical region, and should
+	 * only copy the information from the master page table,
+	 * nothing more.
+	 *
+	 * Before doing this on-demand faulting, ensure that the
+	 * fault is not any of the following:
+	 * 1. A fault on a PTE with a reserved bit set.
+	 * 2. A fault caused by a user-mode access.  (Do not demand-
+	 *    fault kernel memory due to user-mode accesses).
+	 * 3. A fault caused by a page-level protection violation.
+	 *    (A demand fault would be on a non-present page which
+	 *     would have X86_PF_PROT==0).
+	 *
+	 * This is only needed to close a race condition on x86-32 in
+	 * the vmalloc mapping/unmapping code. See the comment above
+	 * vmalloc_fault() for details. On x86-64 the race does not
+	 * exist as the vmalloc mappings don't need to be synchronized
+	 * there.
+	 */
+	if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
+		if (vmalloc_fault(address) >= 0)
+			return;
+	}
+#endif
+
 	/* Was the fault spurious, caused by lazy TLB invalidation? */
 	if (spurious_kernel_fault(hw_error_code, address))
 		return;
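[Editor's note on the fault.c hunk above: the demand-fault precondition reads naturally as a single predicate. The helper below is an illustrative refactoring, not code from the patch; only the X86_PF_* bit test itself comes from the diff.]

#include <asm/trap_pf.h>	/* X86_PF_* error-code bits (header location is era-dependent) */

/*
 * Illustrative helper (assumption, not in the patch): a kernel-address
 * fault may be satisfied from the init_mm reference page table only if
 * it is a non-present (X86_PF_PROT clear), kernel-mode (X86_PF_USER
 * clear) access with no reserved-bit violation (X86_PF_RSVD clear).
 */
static inline bool may_demand_fault_kernel(unsigned long hw_error_code)
{
	return !(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT));
}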
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a4ac13cc3fdc..b5a3fa4033d3 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -217,11 +217,6 @@ static void sync_global_pgds(unsigned long start, unsigned long end)
 		sync_global_pgds_l4(start, end);
 }
 
-void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
-{
-	sync_global_pgds(start, end);
-}
-
 /*
  * NOTE: This function is marked __ref because it calls __init function
  * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
@@ -1257,14 +1252,19 @@ static void __init preallocate_vmalloc_pages(void)
 		if (!p4d)
 			goto failed;
 
-		/*
-		 * With 5-level paging the P4D level is not folded. So the PGDs
-		 * are now populated and there is no need to walk down to the
-		 * PUD level.
-		 */
 		if (pgtable_l5_enabled())
 			continue;
 
+		/*
+		 * The goal here is to allocate all possibly required
+		 * hardware page tables pointed to by the top hardware
+		 * level.
+		 *
+		 * On 4-level systems, the P4D layer is folded away and
+		 * the above code does no preallocation.  Below, go down
+		 * to the pud _software_ level to ensure the second
+		 * hardware level is allocated on 4-level systems too.
+		 */
 		lvl = "pud";
 		pud = pud_alloc(&init_mm, p4d, addr);
 		if (!pud)
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 84d85dbd1dad..9e5ccc56f8e0 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -574,7 +574,7 @@ static bool memremap_should_map_decrypted(resource_size_t phys_addr,
 		/* For SEV, these areas are encrypted */
 		if (sev_active())
 			break;
-		/* Fallthrough */
+		fallthrough;
 
 	case E820_TYPE_PRAM:
 		return true;
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index c5174b4e318b..683cd12f4793 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -321,7 +321,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
 					      u64 addr, u64 max_addr, u64 size)
 {
 	return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size,
-						   0, NULL, NUMA_NO_NODE);
+						   0, NULL, 0);
 }
 
 static int __init setup_emu2phys_nid(int *dfl_phys_nid)
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index d1b2a889f035..40baa90e74f4 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -1999,7 +1999,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
 	/*
 	 * Before changing the encryption attribute, we need to flush caches.
 	 */
-	cpa_flush(&cpa, 1);
+	cpa_flush(&cpa, !this_cpu_has(X86_FEATURE_SME_COHERENT));
 
 	ret = __change_page_attr_set_clr(&cpa, 1);
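[Editor's note on the ioremap.c hunk above: 'fallthrough' is the kernel's pseudo-keyword from <linux/compiler_attributes.h>, preferred over fall-through comments because the compiler can verify the intent under -Wimplicit-fallthrough. For context, its definition is roughly:]

/* Approximate definition, for context only; see include/linux/compiler_attributes.h. */
#if __has_attribute(__fallthrough__)
# define fallthrough	__attribute__((__fallthrough__))
#else
# define fallthrough	do {} while (0)	/* fallthrough */
#endif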
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 1a3569b43aa5..11666ba19b62 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -14,7 +14,6 @@
 #include <asm/nospec-branch.h>
 #include <asm/cache.h>
 #include <asm/apic.h>
-#include <asm/uv/uv.h>
 
 #include "mm_internal.h"
 
@@ -555,21 +554,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
 		load_new_mm_cr3(next->pgd, new_asid, true);
 
-		/*
-		 * NB: This gets called via leave_mm() in the idle path
-		 * where RCU functions differently.  Tracing normally
-		 * uses RCU, so we need to use the _rcuidle variant.
-		 *
-		 * (There is no good reason for this.  The idle code should
-		 *  be rearranged to call this before rcu_idle_enter().)
-		 */
-		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 	} else {
 		/* The new ASID is already up to date. */
 		load_new_mm_cr3(next->pgd, new_asid, false);
 
-		/* See above wrt _rcuidle. */
-		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
+		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
 	}
 
 	/* Make sure we write CR3 before loaded_mm. */
@@ -809,29 +799,6 @@ STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask,
 		trace_tlb_flush(TLB_REMOTE_SEND_IPI,
 				(info->end - info->start) >> PAGE_SHIFT);
 
-	if (is_uv_system()) {
-		/*
-		 * This whole special case is confused.  UV has a "Broadcast
-		 * Assist Unit", which seems to be a fancy way to send IPIs.
-		 * Back when x86 used an explicit TLB flush IPI, UV was
-		 * optimized to use its own mechanism.  These days, x86 uses
-		 * smp_call_function_many(), but UV still uses a manual IPI,
-		 * and that IPI's action is out of date -- it does a manual
-		 * flush instead of calling flush_tlb_func_remote().  This
-		 * means that the percpu tlb_gen variables won't be updated
-		 * and we'll do pointless flushes on future context switches.
-		 *
-		 * Rather than hooking native_flush_tlb_others() here, I think
-		 * that UV should be updated so that smp_call_function_many(),
-		 * etc, are optimal on UV.
-		 */
-		cpumask = uv_flush_tlb_others(cpumask, info);
-		if (cpumask)
-			smp_call_function_many(cpumask, flush_tlb_func_remote,
-					       (void *)info, 1);
-		return;
-	}
-
 	/*
 	 * If no page tables were freed, we can skip sending IPIs to
 	 * CPUs in lazy TLB mode. They will flush the CPU themselves
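[Editor's note on the tlb.c hunk above: with the UV special case gone, native_flush_tlb_others() takes the generic cross-call path on every platform. The sketch below paraphrases the surrounding kernel code of this era; it is for context only and is not part of the diff.]

	/*
	 * Paraphrased sketch: if page tables were freed, every target
	 * CPU must flush, even lazy ones; otherwise CPUs in lazy TLB
	 * mode can be skipped, since they flush when switching back to
	 * a real mm.
	 */
	if (info->freed_tables)
		smp_call_function_many(cpumask, flush_tlb_func_remote,
				       (void *)info, 1);
	else
		on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
				      (void *)info, 1, cpumask);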