diff options
Diffstat (limited to 'arch/x86/mm')
-rw-r--r-- | arch/x86/mm/Makefile | 3 | ||||
-rw-r--r-- | arch/x86/mm/extable.c | 96 | ||||
-rw-r--r-- | arch/x86/mm/hugetlbpage.c | 4 | ||||
-rw-r--r-- | arch/x86/mm/init.c | 8 | ||||
-rw-r--r-- | arch/x86/mm/init_32.c | 2 | ||||
-rw-r--r-- | arch/x86/mm/init_64.c | 4 | ||||
-rw-r--r-- | arch/x86/mm/ioremap.c | 4 | ||||
-rw-r--r-- | arch/x86/mm/pageattr.c | 12 | ||||
-rw-r--r-- | arch/x86/mm/pat.c | 109 | ||||
-rw-r--r-- | arch/x86/mm/setup_nx.c | 5 | ||||
-rw-r--r-- | arch/x86/mm/tlb.c | 116 |
11 files changed, 285 insertions, 78 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index f98913258c63..62c0043a5fd5 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -2,7 +2,7 @@ KCOV_INSTRUMENT_tlb.o := n obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o pgtable.o physaddr.o gup.o setup_nx.o + pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o # Make sure __phys_addr has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) @@ -12,7 +12,6 @@ CFLAGS_setup_nx.o := $(nostackp) CFLAGS_fault.o := -I$(src)/../include/asm/trace obj-$(CONFIG_X86_PAT) += pat_rbtree.o -obj-$(CONFIG_SMP) += tlb.o obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 82447b3fba38..4bb53b89f3c5 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -1,5 +1,6 @@ #include <linux/module.h> #include <asm/uaccess.h> +#include <asm/traps.h> typedef bool (*ex_handler_t)(const struct exception_table_entry *, struct pt_regs *, int); @@ -42,6 +43,43 @@ bool ex_handler_ext(const struct exception_table_entry *fixup, } EXPORT_SYMBOL(ex_handler_ext); +bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + WARN_ONCE(1, "unchecked MSR access error: RDMSR from 0x%x\n", + (unsigned int)regs->cx); + + /* Pretend that the read succeeded and returned 0. */ + regs->ip = ex_fixup_addr(fixup); + regs->ax = 0; + regs->dx = 0; + return true; +} +EXPORT_SYMBOL(ex_handler_rdmsr_unsafe); + +bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + WARN_ONCE(1, "unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x)\n", + (unsigned int)regs->cx, + (unsigned int)regs->dx, (unsigned int)regs->ax); + + /* Pretend that the write succeeded. 
*/ + regs->ip = ex_fixup_addr(fixup); + return true; +} +EXPORT_SYMBOL(ex_handler_wrmsr_unsafe); + +bool ex_handler_clear_fs(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + if (static_cpu_has(X86_BUG_NULL_SEG)) + asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS)); + asm volatile ("mov %0, %%fs" : : "rm" (0)); + return ex_handler_default(fixup, regs, trapnr); +} +EXPORT_SYMBOL(ex_handler_clear_fs); + bool ex_has_fault_handler(unsigned long ip) { const struct exception_table_entry *e; @@ -82,24 +120,46 @@ int fixup_exception(struct pt_regs *regs, int trapnr) return handler(e, regs, trapnr); } +extern unsigned int early_recursion_flag; + /* Restricted version used during very early boot */ -int __init early_fixup_exception(unsigned long *ip) +void __init early_fixup_exception(struct pt_regs *regs, int trapnr) { - const struct exception_table_entry *e; - unsigned long new_ip; - ex_handler_t handler; - - e = search_exception_tables(*ip); - if (!e) - return 0; - - new_ip = ex_fixup_addr(e); - handler = ex_fixup_handler(e); - - /* special handling not supported during early boot */ - if (handler != ex_handler_default) - return 0; - - *ip = new_ip; - return 1; + /* Ignore early NMIs. */ + if (trapnr == X86_TRAP_NMI) + return; + + if (early_recursion_flag > 2) + goto halt_loop; + + if (regs->cs != __KERNEL_CS) + goto fail; + + /* + * The full exception fixup machinery is available as soon as + * the early IDT is loaded. This means that it is the + * responsibility of extable users to either function correctly + * when handlers are invoked early or to simply avoid causing + * exceptions before they're ready to handle them. + * + * This is better than filtering which handlers can be used, + * because refusing to call a handler here is guaranteed to + * result in a hard-to-debug panic. + * + * Keep in mind that not all vectors actually get here. Early + * page faults, for example, are special. 
+ */ + if (fixup_exception(regs, trapnr)) + return; + +fail: + early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 0x%lx\n", + (unsigned)trapnr, (unsigned long)regs->cs, regs->ip, + regs->orig_ax, read_cr2()); + + show_regs(regs); + +halt_loop: + while (true) + halt(); } diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 740d7ac03a55..14a95054d4e0 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -162,7 +162,7 @@ static __init int setup_hugepagesz(char *opt) unsigned long ps = memparse(opt, &opt); if (ps == PMD_SIZE) { hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); - } else if (ps == PUD_SIZE && cpu_has_gbpages) { + } else if (ps == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES)) { hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); } else { printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n", @@ -177,7 +177,7 @@ __setup("hugepagesz=", setup_hugepagesz); static __init int gigantic_pages_init(void) { /* With compaction or CMA we can allocate gigantic pages at runtime */ - if (cpu_has_gbpages && !size_to_hstate(1UL << PUD_SHIFT)) + if (boot_cpu_has(X86_FEATURE_GBPAGES) && !size_to_hstate(1UL << PUD_SHIFT)) hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); return 0; } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 9d56f271d519..372aad2b3291 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -157,23 +157,23 @@ static void __init probe_page_size_mask(void) * This will simplify cpa(), which otherwise needs to support splitting * large pages into small in interrupt context, etc. 
*/ - if (cpu_has_pse && !debug_pagealloc_enabled()) + if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled()) page_size_mask |= 1 << PG_LEVEL_2M; #endif /* Enable PSE if available */ - if (cpu_has_pse) + if (boot_cpu_has(X86_FEATURE_PSE)) cr4_set_bits_and_update_boot(X86_CR4_PSE); /* Enable PGE if available */ - if (cpu_has_pge) { + if (boot_cpu_has(X86_FEATURE_PGE)) { cr4_set_bits_and_update_boot(X86_CR4_PGE); __supported_pte_mask |= _PAGE_GLOBAL; } else __supported_pte_mask &= ~_PAGE_GLOBAL; /* Enable 1 GB linear kernel mappings if available: */ - if (direct_gbpages && cpu_has_gbpages) { + if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) { printk(KERN_INFO "Using GB pages for direct mapping\n"); page_size_mask |= 1 << PG_LEVEL_1G; } else { diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index f2ee42d61894..84df150ee77e 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -284,7 +284,7 @@ kernel_physical_mapping_init(unsigned long start, */ mapping_iter = 1; - if (!cpu_has_pse) + if (!boot_cpu_has(X86_FEATURE_PSE)) use_pse = 0; repeat: diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 65cfbeefbec4..bce2e5d9edd4 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1223,7 +1223,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) struct vmem_altmap *altmap = to_vmem_altmap(start); int err; - if (cpu_has_pse) + if (boot_cpu_has(X86_FEATURE_PSE)) err = vmemmap_populate_hugepages(start, end, node, altmap); else if (altmap) { pr_err_once("%s: no cpu support for altmap allocations\n", @@ -1266,7 +1266,7 @@ void register_page_bootmem_memmap(unsigned long section_nr, } get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO); - if (!cpu_has_pse) { + if (!boot_cpu_has(X86_FEATURE_PSE)) { next = (addr + PAGE_SIZE) & PAGE_MASK; pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 0d8d53d1f5cc..f0894910bdd7 
100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -378,7 +378,7 @@ EXPORT_SYMBOL(iounmap); int __init arch_ioremap_pud_supported(void) { #ifdef CONFIG_X86_64 - return cpu_has_gbpages; + return boot_cpu_has(X86_FEATURE_GBPAGES); #else return 0; #endif @@ -386,7 +386,7 @@ int __init arch_ioremap_pud_supported(void) int __init arch_ioremap_pmd_supported(void) { - return cpu_has_pse; + return boot_cpu_has(X86_FEATURE_PSE); } /* diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 01be9ec3bf79..7a1f7bbf4105 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1055,7 +1055,7 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, /* * Map everything starting from the Gb boundary, possibly with 1G pages */ - while (cpu_has_gbpages && end - start >= PUD_SIZE) { + while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) { set_pud(pud, __pud(cpa->pfn << PAGE_SHIFT | _PAGE_PSE | massage_pgprot(pud_pgprot))); @@ -1125,8 +1125,14 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, int primary) { - if (cpa->pgd) + if (cpa->pgd) { + /* + * Right now, we only execute this code path when mapping + * the EFI virtual memory map regions, no other users + * provide a ->pgd value. This may change in the future. + */ return populate_pgd(cpa, vaddr); + } /* * Ignore all non primary paths. 
@@ -1460,7 +1466,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, * error case we fall back to cpa_flush_all (which uses * WBINVD): */ - if (!ret && cpu_has_clflush) { + if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) { if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { cpa_flush_array(addr, numpages, cache, cpa.flags, pages); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index faec01e7a17d..fb0604f11eec 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -40,11 +40,22 @@ static bool boot_cpu_done; static int __read_mostly __pat_enabled = IS_ENABLED(CONFIG_X86_PAT); +static void init_cache_modes(void); -static inline void pat_disable(const char *reason) +void pat_disable(const char *reason) { + if (!__pat_enabled) + return; + + if (boot_cpu_done) { + WARN_ONCE(1, "x86/PAT: PAT cannot be disabled after initialization\n"); + return; + } + __pat_enabled = 0; pr_info("x86/PAT: %s\n", reason); + + init_cache_modes(); } static int __init nopat(char *str) @@ -181,7 +192,7 @@ static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg) * configuration. * Using lower indices is preferred, so we start with highest index. */ -void pat_init_cache_modes(u64 pat) +static void __init_cache_modes(u64 pat) { enum page_cache_mode cache; char pat_msg[33]; @@ -202,14 +213,11 @@ static void pat_bsp_init(u64 pat) { u64 tmp_pat; - if (!cpu_has_pat) { + if (!boot_cpu_has(X86_FEATURE_PAT)) { pat_disable("PAT not supported by CPU."); return; } - if (!pat_enabled()) - goto done; - rdmsrl(MSR_IA32_CR_PAT, tmp_pat); if (!tmp_pat) { pat_disable("PAT MSR is 0, disabled."); @@ -218,16 +226,12 @@ static void pat_bsp_init(u64 pat) wrmsrl(MSR_IA32_CR_PAT, pat); -done: - pat_init_cache_modes(pat); + __init_cache_modes(pat); } static void pat_ap_init(u64 pat) { - if (!pat_enabled()) - return; - - if (!cpu_has_pat) { + if (!boot_cpu_has(X86_FEATURE_PAT)) { /* * If this happens we are on a secondary CPU, but switched to * PAT on the boot CPU. 
We have no way to undo PAT. @@ -238,18 +242,32 @@ static void pat_ap_init(u64 pat) wrmsrl(MSR_IA32_CR_PAT, pat); } -void pat_init(void) +static void init_cache_modes(void) { - u64 pat; - struct cpuinfo_x86 *c = &boot_cpu_data; + u64 pat = 0; + static int init_cm_done; - if (!pat_enabled()) { + if (init_cm_done) + return; + + if (boot_cpu_has(X86_FEATURE_PAT)) { + /* + * CPU supports PAT. Set PAT table to be consistent with + * PAT MSR. This case supports "nopat" boot option, and + * virtual machine environments which support PAT without + * MTRRs. In specific, Xen has unique setup to PAT MSR. + * + * If PAT MSR returns 0, it is considered invalid and emulates + * as No PAT. + */ + rdmsrl(MSR_IA32_CR_PAT, pat); + } + + if (!pat) { /* * No PAT. Emulate the PAT table that corresponds to the two - * cache bits, PWT (Write Through) and PCD (Cache Disable). This - * setup is the same as the BIOS default setup when the system - * has PAT but the "nopat" boot option has been specified. This - * emulated PAT table is used when MSR_IA32_CR_PAT returns 0. + * cache bits, PWT (Write Through) and PCD (Cache Disable). + * This setup is also the same as the BIOS default setup. * * PTE encoding: * @@ -266,10 +284,36 @@ void pat_init(void) */ pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) | PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC); + } + + __init_cache_modes(pat); + + init_cm_done = 1; +} + +/** + * pat_init - Initialize PAT MSR and PAT table + * + * This function initializes PAT MSR and PAT table with an OS-defined value + * to enable additional cache attributes, WC and WT. + * + * This function must be called on all CPUs using the specific sequence of + * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this + * procedure for PAT. 
+ */ +void pat_init(void) +{ + u64 pat; + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (!pat_enabled()) { + init_cache_modes(); + return; + } - } else if ((c->x86_vendor == X86_VENDOR_INTEL) && - (((c->x86 == 0x6) && (c->x86_model <= 0xd)) || - ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) { + if ((c->x86_vendor == X86_VENDOR_INTEL) && + (((c->x86 == 0x6) && (c->x86_model <= 0xd)) || + ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) { /* * PAT support with the lower four entries. Intel Pentium 2, * 3, M, and 4 are affected by PAT errata, which makes the @@ -734,25 +778,6 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, if (file->f_flags & O_DSYNC) pcm = _PAGE_CACHE_MODE_UC_MINUS; -#ifdef CONFIG_X86_32 - /* - * On the PPro and successors, the MTRRs are used to set - * memory types for physical addresses outside main memory, - * so blindly setting UC or PWT on those pages is wrong. - * For Pentiums and earlier, the surround logic should disable - * caching for the high addresses through the KEN pin, but - * we maintain the tradition of paranoia in this code. - */ - if (!pat_enabled() && - !(boot_cpu_has(X86_FEATURE_MTRR) || - boot_cpu_has(X86_FEATURE_K6_MTRR) || - boot_cpu_has(X86_FEATURE_CYRIX_ARR) || - boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) && - (pfn << PAGE_SHIFT) >= __pa(high_memory)) { - pcm = _PAGE_CACHE_MODE_UC; - } -#endif - *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) | cachemode2protval(pcm)); return 1; diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index 8bea84724a7d..f65a33f505b6 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -32,8 +32,9 @@ early_param("noexec", noexec_setup); void x86_configure_nx(void) { - /* If disable_nx is set, clear NX on all new mappings going forward. 
*/ - if (disable_nx) + if (boot_cpu_has(X86_FEATURE_NX) && !disable_nx) + __supported_pte_mask |= _PAGE_NX; + else __supported_pte_mask &= ~_PAGE_NX; } diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index fe9b9f776361..5643fd0b1a7d 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -28,6 +28,8 @@ * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi */ +#ifdef CONFIG_SMP + struct flush_tlb_info { struct mm_struct *flush_mm; unsigned long flush_start; @@ -57,6 +59,118 @@ void leave_mm(int cpu) } EXPORT_SYMBOL_GPL(leave_mm); +#endif /* CONFIG_SMP */ + +void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned long flags; + + local_irq_save(flags); + switch_mm_irqs_off(prev, next, tsk); + local_irq_restore(flags); +} + +void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned cpu = smp_processor_id(); + + if (likely(prev != next)) { +#ifdef CONFIG_SMP + this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); + this_cpu_write(cpu_tlbstate.active_mm, next); +#endif + cpumask_set_cpu(cpu, mm_cpumask(next)); + + /* + * Re-load page tables. + * + * This logic has an ordering constraint: + * + * CPU 0: Write to a PTE for 'next' + * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. + * CPU 1: set bit 1 in next's mm_cpumask + * CPU 1: load from the PTE that CPU 0 writes (implicit) + * + * We need to prevent an outcome in which CPU 1 observes + * the new PTE value and CPU 0 observes bit 1 clear in + * mm_cpumask. (If that occurs, then the IPI will never + * be sent, and CPU 0's TLB will contain a stale entry.) + * + * The bad outcome can occur if either CPU's load is + * reordered before that CPU's store, so both CPUs must + * execute full barriers to prevent this from happening. + * + * Thus, switch_mm needs a full barrier between the + * store to mm_cpumask and any operation that could load + * from next->pgd. 
TLB fills are special and can happen + * due to instruction fetches or for no reason at all, + * and neither LOCK nor MFENCE orders them. + * Fortunately, load_cr3() is serializing and gives the + * ordering guarantee we need. + * + */ + load_cr3(next->pgd); + + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + + /* Stop flush ipis for the previous mm */ + cpumask_clear_cpu(cpu, mm_cpumask(prev)); + + /* Load per-mm CR4 state */ + load_mm_cr4(next); + +#ifdef CONFIG_MODIFY_LDT_SYSCALL + /* + * Load the LDT, if the LDT is different. + * + * It's possible that prev->context.ldt doesn't match + * the LDT register. This can happen if leave_mm(prev) + * was called and then modify_ldt changed + * prev->context.ldt but suppressed an IPI to this CPU. + * In this case, prev->context.ldt != NULL, because we + * never set context.ldt to NULL while the mm still + * exists. That means that next->context.ldt != + * prev->context.ldt, because mms never share an LDT. + */ + if (unlikely(prev->context.ldt != next->context.ldt)) + load_mm_ldt(next); +#endif + } +#ifdef CONFIG_SMP + else { + this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); + BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); + + if (!cpumask_test_cpu(cpu, mm_cpumask(next))) { + /* + * On established mms, the mm_cpumask is only changed + * from irq context, from ptep_clear_flush() while in + * lazy tlb mode, and here. Irqs are blocked during + * schedule, protecting us from simultaneous changes. + */ + cpumask_set_cpu(cpu, mm_cpumask(next)); + + /* + * We were in lazy tlb mode and leave_mm disabled + * tlb flush IPI delivery. We must reload CR3 + * to make sure to use no freed page tables. + * + * As above, load_cr3() is serializing and orders TLB + * fills with respect to the mm_cpumask write. 
+ */ + load_cr3(next->pgd); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + load_mm_cr4(next); + load_mm_ldt(next); + } + } +#endif +} + +#ifdef CONFIG_SMP + /* * The flush IPI assumes that a thread switch happens in this order: * [cpu0: the cpu that switches] @@ -353,3 +467,5 @@ static int __init create_tlb_single_page_flush_ceiling(void) return 0; } late_initcall(create_tlb_single_page_flush_ceiling); + +#endif /* CONFIG_SMP */ |