Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/cpu_entry_area.c  |   4
-rw-r--r--  arch/x86/mm/dump_pagetables.c |  35
-rw-r--r--  arch/x86/mm/fault.c           | 195
-rw-r--r--  arch/x86/mm/highmem_32.c      |  50
-rw-r--r--  arch/x86/mm/hugetlbpage.c     |  23
-rw-r--r--  arch/x86/mm/init.c            |  48
-rw-r--r--  arch/x86/mm/init_64.c         |  24
-rw-r--r--  arch/x86/mm/ioremap.c         |  10
-rw-r--r--  arch/x86/mm/kmmio.c           |   4
-rw-r--r--  arch/x86/mm/mem_encrypt.c     |   2
-rw-r--r--  arch/x86/mm/mmap.c            |   2
-rw-r--r--  arch/x86/mm/mmio-mod.c        |   4
-rw-r--r--  arch/x86/mm/numa.c            |  11
-rw-r--r--  arch/x86/mm/numa_32.c         |  34
-rw-r--r--  arch/x86/mm/pat/set_memory.c  |  19
-rw-r--r--  arch/x86/mm/pgtable.c         |  16
-rw-r--r--  arch/x86/mm/pgtable_32.c      |   2
-rw-r--r--  arch/x86/mm/pti.c             |   8
-rw-r--r--  arch/x86/mm/tlb.c             | 421
19 files changed, 540 insertions(+), 372 deletions(-)
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 56f9189bbadb..5199d8a1daf1 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -17,7 +17,7 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks); DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks); #endif -#if defined(CONFIG_X86_32) && defined(CONFIG_DOUBLEFAULT) +#ifdef CONFIG_X86_32 DECLARE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack); #endif @@ -114,12 +114,10 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu) #else static inline void percpu_setup_exception_stacks(unsigned int cpu) { -#ifdef CONFIG_DOUBLEFAULT struct cpu_entry_area *cea = get_cpu_entry_area(cpu); cea_map_percpu_pages(&cea->doublefault_stack, &per_cpu(doublefault_stack, cpu), 1, PAGE_KERNEL); -#endif } #endif diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 69309cd56fdf..ea9010113f69 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -249,10 +249,22 @@ static void note_wx(struct pg_state *st, unsigned long addr) (void *)st->start_address); } -static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2) +static void effective_prot(struct ptdump_state *pt_st, int level, u64 val) { - return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) | - ((prot1 | prot2) & _PAGE_NX); + struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); + pgprotval_t prot = val & PTE_FLAGS_MASK; + pgprotval_t effective; + + if (level > 0) { + pgprotval_t higher_prot = st->prot_levels[level - 1]; + + effective = (higher_prot & prot & (_PAGE_USER | _PAGE_RW)) | + ((higher_prot | prot) & _PAGE_NX); + } else { + effective = prot; + } + + st->prot_levels[level] = effective; } /* @@ -261,7 +273,7 @@ static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2) * print what we collected so far. */ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, - unsigned long val) + u64 val) { struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); pgprotval_t new_prot, new_eff; @@ -270,16 +282,10 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, struct seq_file *m = st->seq; new_prot = val & PTE_FLAGS_MASK; - - if (level > 0) { - new_eff = effective_prot(st->prot_levels[level - 1], - new_prot); - } else { - new_eff = new_prot; - } - - if (level >= 0) - st->prot_levels[level] = new_eff; + if (!val) + new_eff = 0; + else + new_eff = st->prot_levels[level]; /* * If we have a "break" in the series, we need to flush the state that @@ -374,6 +380,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, struct pg_state st = { .ptdump = { .note_page = note_page, + .effective_prot = effective_prot, .range = ptdump_ranges }, .level = -1, diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a51df516b87b..c5437f2964ee 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -30,6 +30,7 @@ #include <asm/desc.h> /* store_idt(), ... */ #include <asm/cpu_entry_area.h> /* exception stack */ #include <asm/pgtable_areas.h> /* VMALLOC_START, ... 
*/ +#include <asm/kvm_para.h> /* kvm_handle_async_pf */ #define CREATE_TRACE_POINTS #include <asm/trace/exceptions.h> @@ -190,16 +191,13 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) return pmd_k; } -static void vmalloc_sync(void) +void arch_sync_kernel_mappings(unsigned long start, unsigned long end) { - unsigned long address; - - if (SHARED_KERNEL_PMD) - return; + unsigned long addr; - for (address = VMALLOC_START & PMD_MASK; - address >= TASK_SIZE_MAX && address < VMALLOC_END; - address += PMD_SIZE) { + for (addr = start & PMD_MASK; + addr >= TASK_SIZE_MAX && addr < VMALLOC_END; + addr += PMD_SIZE) { struct page *page; spin_lock(&pgd_lock); @@ -210,61 +208,13 @@ static void vmalloc_sync(void) pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); - vmalloc_sync_one(page_address(page), address); + vmalloc_sync_one(page_address(page), addr); spin_unlock(pgt_lock); } spin_unlock(&pgd_lock); } } -void vmalloc_sync_mappings(void) -{ - vmalloc_sync(); -} - -void vmalloc_sync_unmappings(void) -{ - vmalloc_sync(); -} - -/* - * 32-bit: - * - * Handle a fault on the vmalloc or module mapping area - */ -static noinline int vmalloc_fault(unsigned long address) -{ - unsigned long pgd_paddr; - pmd_t *pmd_k; - pte_t *pte_k; - - /* Make sure we are in vmalloc area: */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; - - /* - * Synchronize this task's top level page-table - * with the 'reference' page table. - * - * Do _not_ use "current" here. We might be inside - * an interrupt in the middle of a task switch.. - */ - pgd_paddr = read_cr3_pa(); - pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); - if (!pmd_k) - return -1; - - if (pmd_large(*pmd_k)) - return 0; - - pte_k = pte_offset_kernel(pmd_k, address); - if (!pte_present(*pte_k)) - return -1; - - return 0; -} -NOKPROBE_SYMBOL(vmalloc_fault); - /* * Did it hit the DOS screen memory VA from vm86 mode? */ @@ -329,96 +279,6 @@ out: #else /* CONFIG_X86_64: */ -void vmalloc_sync_mappings(void) -{ - /* - * 64-bit mappings might allocate new p4d/pud pages - * that need to be propagated to all tasks' PGDs. - */ - sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); -} - -void vmalloc_sync_unmappings(void) -{ - /* - * Unmappings never allocate or free p4d/pud pages. - * No work is required here. - */ -} - -/* - * 64-bit: - * - * Handle a fault on the vmalloc area - */ -static noinline int vmalloc_fault(unsigned long address) -{ - pgd_t *pgd, *pgd_k; - p4d_t *p4d, *p4d_k; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - /* Make sure we are in vmalloc area: */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; - - /* - * Copy kernel mappings over when needed. This can also - * happen within a race in page table update. In the later - * case just flush: - */ - pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address); - pgd_k = pgd_offset_k(address); - if (pgd_none(*pgd_k)) - return -1; - - if (pgtable_l5_enabled()) { - if (pgd_none(*pgd)) { - set_pgd(pgd, *pgd_k); - arch_flush_lazy_mmu_mode(); - } else { - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_k)); - } - } - - /* With 4-level paging, copying happens on the p4d level. 
*/ - p4d = p4d_offset(pgd, address); - p4d_k = p4d_offset(pgd_k, address); - if (p4d_none(*p4d_k)) - return -1; - - if (p4d_none(*p4d) && !pgtable_l5_enabled()) { - set_p4d(p4d, *p4d_k); - arch_flush_lazy_mmu_mode(); - } else { - BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_k)); - } - - BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4); - - pud = pud_offset(p4d, address); - if (pud_none(*pud)) - return -1; - - if (pud_large(*pud)) - return 0; - - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) - return -1; - - if (pmd_large(*pmd)) - return 0; - - pte = pte_offset_kernel(pmd, address); - if (!pte_present(*pte)) - return -1; - - return 0; -} -NOKPROBE_SYMBOL(vmalloc_fault); - #ifdef CONFIG_CPU_SUP_AMD static const char errata93_warning[] = KERN_ERR @@ -1257,29 +1117,6 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, */ WARN_ON_ONCE(hw_error_code & X86_PF_PK); - /* - * We can fault-in kernel-space virtual memory on-demand. The - * 'reference' page table is init_mm.pgd. - * - * NOTE! We MUST NOT take any locks for this case. We may - * be in an interrupt or a critical region, and should - * only copy the information from the master page table, - * nothing more. - * - * Before doing this on-demand faulting, ensure that the - * fault is not any of the following: - * 1. A fault on a PTE with a reserved bit set. - * 2. A fault caused by a user-mode access. (Do not demand- - * fault kernel memory due to user-mode accesses). - * 3. A fault caused by a page-level protection violation. - * (A demand fault would be on a non-present page which - * would have X86_PF_PROT==0). - */ - if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { - if (vmalloc_fault(address) >= 0) - return; - } - /* Was the fault spurious, caused by lazy TLB invalidation? */ if (spurious_kernel_fault(hw_error_code, address)) return; @@ -1523,6 +1360,24 @@ do_page_fault(struct pt_regs *regs, unsigned long hw_error_code, unsigned long address) { prefetchw(¤t->mm->mmap_sem); + /* + * KVM has two types of events that are, logically, interrupts, but + * are unfortunately delivered using the #PF vector. These events are + * "you just accessed valid memory, but the host doesn't have it right + * now, so I'll put you to sleep if you continue" and "that memory + * you tried to access earlier is available now." + * + * We are relying on the interrupted context being sane (valid RSP, + * relevant locks not held, etc.), which is fine as long as the + * interrupted context had IF=1. We are also relying on the KVM + * async pf type field and CR2 being read consistently instead of + * getting values from real and async page faults mixed up. + * + * Fingers crossed. 
+ */ + if (kvm_handle_async_pf(regs, (u32)address)) + return; + trace_page_fault_entries(regs, hw_error_code, address); if (unlikely(kmmio_fault(regs, address))) diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 0a1898b8552e..075fe51317b0 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -4,44 +4,11 @@ #include <linux/swap.h> /* for totalram_pages */ #include <linux/memblock.h> -void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} -EXPORT_SYMBOL(kmap); - -void kunmap(struct page *page) -{ - if (in_interrupt()) - BUG(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} -EXPORT_SYMBOL(kunmap); - -/* - * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because - * no global lock is needed and because the kmap code must perform a global TLB - * invalidation when the kmap pool wraps. - * - * However when holding an atomic kmap it is not legal to sleep, so atomic - * kmaps are appropriate for short, tight code paths only. - */ -void *kmap_atomic_prot(struct page *page, pgprot_t prot) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; int idx, type; - preempt_disable(); - pagefault_disable(); - - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -51,13 +18,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic_prot); - -void *kmap_atomic(struct page *page) -{ - return kmap_atomic_prot(page, kmap_prot); -} -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high_prot); /* * This is the same as kmap_atomic() but can map memory that doesn't @@ -69,7 +30,7 @@ void *kmap_atomic_pfn(unsigned long pfn) } EXPORT_SYMBOL_GPL(kmap_atomic_pfn); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; @@ -99,11 +60,8 @@ void __kunmap_atomic(void *kvaddr) BUG_ON(vaddr >= (unsigned long)high_memory); } #endif - - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); void __init set_highmem_pages_init(void) { diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 5bfd5aef5378..cf5781142716 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -181,28 +181,21 @@ get_unmapped_area: #endif /* CONFIG_HUGETLB_PAGE */ #ifdef CONFIG_X86_64 -static __init int setup_hugepagesz(char *opt) +bool __init arch_hugetlb_valid_size(unsigned long size) { - unsigned long ps = memparse(opt, &opt); - if (ps == PMD_SIZE) { - hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); - } else if (ps == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES)) { - hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); - } else { - hugetlb_bad_size(); - printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n", - ps >> 20); - return 0; - } - return 1; + if (size == PMD_SIZE) + return true; + else if (size == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES)) + return true; + else + return false; } -__setup("hugepagesz=", setup_hugepagesz); #ifdef CONFIG_CONTIG_ALLOC static __init int gigantic_pages_init(void) { /* With compaction or CMA we can allocate gigantic pages at runtime */ - if (boot_cpu_has(X86_FEATURE_GBPAGES) && !size_to_hstate(1UL << PUD_SHIFT)) + if (boot_cpu_has(X86_FEATURE_GBPAGES)) hugetlb_add_hstate(PUD_SHIFT - 
PAGE_SHIFT); return 0; } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 1bba16c5742b..112d3b98a3b6 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -49,7 +49,7 @@ * Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2. */ -uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { +static uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { [_PAGE_CACHE_MODE_WB ] = 0 | 0 , [_PAGE_CACHE_MODE_WC ] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD, @@ -57,9 +57,16 @@ uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { [_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD, }; -EXPORT_SYMBOL(__cachemode2pte_tbl); -uint8_t __pte2cachemode_tbl[8] = { +unsigned long cachemode2protval(enum page_cache_mode pcm) +{ + if (likely(pcm == 0)) + return 0; + return __cachemode2pte_tbl[pcm]; +} +EXPORT_SYMBOL(cachemode2protval); + +static uint8_t __pte2cachemode_tbl[8] = { [__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB, [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, @@ -69,7 +76,22 @@ uint8_t __pte2cachemode_tbl[8] = { [__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, }; -EXPORT_SYMBOL(__pte2cachemode_tbl); + +/* Check that the write-protect PAT entry is set for write-protect */ +bool x86_has_pat_wp(void) +{ + return __pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] == _PAGE_CACHE_MODE_WP; +} + +enum page_cache_mode pgprot2cachemode(pgprot_t pgprot) +{ + unsigned long masked; + + masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK; + if (likely(masked == 0)) + return 0; + return __pte2cachemode_tbl[__pte2cm_idx(masked)]; +} static unsigned long __initdata pgt_buf_start; static unsigned long __initdata pgt_buf_end; @@ -121,8 +143,6 @@ __ref void *alloc_low_pages(unsigned int num) } else { pfn = pgt_buf_end; pgt_buf_end += num; - printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n", - pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1); } for (i = 0; i < num; i++) { @@ -172,6 +192,19 @@ struct map_range { static int page_size_mask; +/* + * Save some of cr4 feature set we're using (e.g. Pentium 4MB + * enable and PPro Global page enable), so that any CPU's that boot + * up after us can get the correct flags. Invoked on the boot CPU. 
+ */ +static inline void cr4_set_bits_and_update_boot(unsigned long mask) +{ + mmu_cr4_features |= mask; + if (trampoline_cr4_features) + *trampoline_cr4_features = mmu_cr4_features; + cr4_set_bits(mask); +} + static void __init probe_page_size_mask(void) { /* @@ -949,7 +982,7 @@ void __init zone_sizes_init(void) max_zone_pfns[ZONE_HIGHMEM] = max_pfn; #endif - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); } __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { @@ -957,7 +990,6 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { .next_asid = 1, .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ }; -EXPORT_PER_CPU_SYMBOL(cpu_tlbstate); void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3b289c2f75cd..add03c35aa34 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -54,6 +54,7 @@ #include <asm/init.h> #include <asm/uv/uv.h> #include <asm/setup.h> +#include <asm/ftrace.h> #include "mm_internal.h" @@ -217,6 +218,11 @@ void sync_global_pgds(unsigned long start, unsigned long end) sync_global_pgds_l4(start, end); } +void arch_sync_kernel_mappings(unsigned long start, unsigned long end) +{ + sync_global_pgds(start, end); +} + /* * NOTE: This function is marked __ref because it calls __init function * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. @@ -298,7 +304,7 @@ static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte) * It's enough to flush this one mapping. * (PGE mappings get flushed as well) */ - __flush_tlb_one_kernel(vaddr); + flush_tlb_one_kernel(vaddr); } void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte) @@ -367,7 +373,7 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size, pgprot_t prot; pgprot_val(prot) = pgprot_val(PAGE_KERNEL_LARGE) | - pgprot_val(pgprot_4k_2_large(cachemode2pgprot(cache))); + protval_4k_2_large(cachemode2protval(cache)); BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK)); for (; size; phys += PMD_SIZE, size -= PMD_SIZE) { pgd = pgd_offset_k((unsigned long)__va(phys)); @@ -1259,6 +1265,18 @@ void __init mem_init(void) mem_init_print_info(NULL); } +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask) +{ + /* + * More CPUs always led to greater speedups on tested systems, up to + * all the nodes' CPUs. Use all since the system is otherwise idle + * now. 
+ */ + return max_t(int, cpumask_weight(node_cpumask), 1); +} +#endif + int kernel_set_to_readonly; void mark_rodata_ro(void) @@ -1291,6 +1309,8 @@ void mark_rodata_ro(void) all_end = roundup((unsigned long)_brk_end, PMD_SIZE); set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT); + set_ftrace_ops_ro(); + #ifdef CONFIG_CPA_DEBUG printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end); set_memory_rw(start, (end-start) >> PAGE_SHIFT); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 18c637c0dc6f..986d57534fd6 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -778,10 +778,8 @@ void __init *early_memremap_encrypted(resource_size_t phys_addr, void __init *early_memremap_encrypted_wp(resource_size_t phys_addr, unsigned long size) { - /* Be sure the write-protect PAT entry is set for write-protect */ - if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) + if (!x86_has_pat_wp()) return NULL; - return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC_WP); } @@ -799,10 +797,8 @@ void __init *early_memremap_decrypted(resource_size_t phys_addr, void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, unsigned long size) { - /* Be sure the write-protect PAT entry is set for write-protect */ - if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) + if (!x86_has_pat_wp()) return NULL; - return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC_WP); } #endif /* CONFIG_AMD_MEM_ENCRYPT */ @@ -889,5 +885,5 @@ void __init __early_set_fixmap(enum fixed_addresses idx, set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); else pte_clear(&init_mm, addr, pte); - __flush_tlb_one_kernel(addr); + flush_tlb_one_kernel(addr); } diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 9994353fb75d..be020a7bc414 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -130,7 +130,7 @@ static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old) pmdval_t v = pmd_val(*pmd); if (clear) { *old = v; - new_pmd = pmd_mknotpresent(*pmd); + new_pmd = pmd_mkinvalid(*pmd); } else { /* Presume this has been called with clear==true previously */ new_pmd = __pmd(*old); @@ -173,7 +173,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear) return -1; } - __flush_tlb_one_kernel(f->addr); + flush_tlb_one_kernel(f->addr); return 0; } diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index a03614bd3e1a..4a781cf99e92 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -134,7 +134,7 @@ static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size, size = (size <= PMD_SIZE) ? 
0 : size - PMD_SIZE; } while (size); - __native_flush_tlb(); + flush_tlb_local(); } void __init sme_unmap_bootdata(char *real_mode_data) diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index cb91eccc4960..c90c20904a60 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -18,7 +18,9 @@ #include <linux/sched/signal.h> #include <linux/sched/mm.h> #include <linux/compat.h> +#include <linux/elf-randomize.h> #include <asm/elf.h> +#include <asm/io.h> #include "physaddr.h" diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 109325d77b3e..43fd19b3f118 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -372,7 +372,7 @@ static void enter_uniprocessor(void) int cpu; int err; - if (downed_cpus == NULL && + if (!cpumask_available(downed_cpus) && !alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) { pr_notice("Failed to allocate mask\n"); goto out; @@ -402,7 +402,7 @@ static void leave_uniprocessor(void) int cpu; int err; - if (downed_cpus == NULL || cpumask_weight(downed_cpus) == 0) + if (!cpumask_available(downed_cpus) || cpumask_weight(downed_cpus) == 0) return; pr_notice("Re-enabling CPUs...\n"); for_each_cpu(cpu, downed_cpus) { diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 59ba008504dc..8ee952038c80 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -517,8 +517,10 @@ static void __init numa_clear_kernel_node_hotplug(void) * reserve specific pages for Sandy Bridge graphics. ] */ for_each_memblock(reserved, mb_region) { - if (mb_region->nid != MAX_NUMNODES) - node_set(mb_region->nid, reserved_nodemask); + int nid = memblock_get_region_node(mb_region); + + if (nid != MAX_NUMNODES) + node_set(nid, reserved_nodemask); } /* @@ -735,12 +737,9 @@ void __init x86_numa_init(void) static void __init init_memory_less_node(int nid) { - unsigned long zones_size[MAX_NR_ZONES] = {0}; - unsigned long zholes_size[MAX_NR_ZONES] = {0}; - /* Allocate and initialize node data. Memory-less node is now online.*/ alloc_node_data(nid); - free_area_init_node(nid, zones_size, 0, zholes_size); + free_area_init_memoryless_node(nid); /* * All zonelists will be built later in start_kernel() after per cpu diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index f2bd3d61e16b..104544359d69 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -27,40 +27,6 @@ #include "numa_internal.h" -#ifdef CONFIG_DISCONTIGMEM -/* - * 4) physnode_map - the mapping between a pfn and owning node - * physnode_map keeps track of the physical memory layout of a generic - * numa node on a 64Mb break (each element of the array will - * represent 64Mb of memory and will be marked by the node id. so, - * if the first gig is on node 0, and the second gig is on node 1 - * physnode_map will contain: - * - * physnode_map[0-15] = 0; - * physnode_map[16-31] = 1; - * physnode_map[32- ] = -1; - */ -s8 physnode_map[MAX_SECTIONS] __read_mostly = { [0 ... 
(MAX_SECTIONS - 1)] = -1}; -EXPORT_SYMBOL(physnode_map); - -void memory_present(int nid, unsigned long start, unsigned long end) -{ - unsigned long pfn; - - printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n", - nid, start, end); - printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid); - printk(KERN_DEBUG " "); - start = round_down(start, PAGES_PER_SECTION); - end = round_up(end, PAGES_PER_SECTION); - for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { - physnode_map[pfn / PAGES_PER_SECTION] = nid; - printk(KERN_CONT "%lx ", pfn); - } - printk(KERN_CONT "\n"); -} -#endif - extern unsigned long highend_pfn, highstart_pfn; void __init initmem_init(void) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 59eca6a94ce7..77e04304a2a7 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -43,7 +43,8 @@ struct cpa_data { unsigned long pfn; unsigned int flags; unsigned int force_split : 1, - force_static_prot : 1; + force_static_prot : 1, + force_flush_all : 1; struct page **pages; }; @@ -68,6 +69,11 @@ static DEFINE_SPINLOCK(cpa_lock); #define CPA_PAGES_ARRAY 4 #define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */ +static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm) +{ + return __pgprot(cachemode2protval(pcm)); +} + #ifdef CONFIG_PROC_FS static unsigned long direct_pages_count[PG_LEVEL_NUM]; @@ -340,7 +346,7 @@ static void __cpa_flush_tlb(void *data) unsigned int i; for (i = 0; i < cpa->numpages; i++) - __flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i))); + flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i))); } static void cpa_flush(struct cpa_data *data, int cache) @@ -355,10 +361,10 @@ static void cpa_flush(struct cpa_data *data, int cache) return; } - if (cpa->numpages <= tlb_single_page_flush_ceiling) - on_each_cpu(__cpa_flush_tlb, cpa, 1); - else + if (cpa->force_flush_all || cpa->numpages > tlb_single_page_flush_ceiling) flush_tlb_all(); + else + on_each_cpu(__cpa_flush_tlb, cpa, 1); if (!cache) return; @@ -1598,6 +1604,8 @@ static int cpa_process_alias(struct cpa_data *cpa) alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); alias_cpa.curpage = 0; + cpa->force_flush_all = 1; + ret = __change_page_attr_set_clr(&alias_cpa, 0); if (ret) return ret; @@ -1618,6 +1626,7 @@ static int cpa_process_alias(struct cpa_data *cpa) alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); alias_cpa.curpage = 0; + cpa->force_flush_all = 1; /* * The high mapping range is imprecise, so ignore the * return value. 
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 7bd2c3a52297..d88e9064c28e 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -19,6 +19,14 @@ EXPORT_SYMBOL(physical_mask); #define PGTABLE_HIGHMEM 0 #endif +#ifndef CONFIG_PARAVIRT +static inline +void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) +{ + tlb_remove_page(tlb, table); +} +#endif + gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM; pgtable_t pte_alloc_one(struct mm_struct *mm) @@ -706,11 +714,9 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) if (pud_present(*pud) && !pud_huge(*pud)) return 0; - prot = pgprot_4k_2_large(prot); - set_pte((pte_t *)pud, pfn_pte( (u64)addr >> PAGE_SHIFT, - __pgprot(pgprot_val(prot) | _PAGE_PSE))); + __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); return 1; } @@ -738,11 +744,9 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) if (pmd_present(*pmd) && !pmd_huge(*pmd)) return 0; - prot = pgprot_4k_2_large(prot); - set_pte((pte_t *)pmd, pfn_pte( (u64)addr >> PAGE_SHIFT, - __pgprot(pgprot_val(prot) | _PAGE_PSE))); + __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); return 1; } diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 0e6700eaa4f9..e1ce59dc558f 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -64,7 +64,7 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval) * It's enough to flush this one mapping. * (PGE mappings get flushed as well) */ - __flush_tlb_one_kernel(vaddr); + flush_tlb_one_kernel(vaddr); } unsigned long __FIXADDR_TOP = 0xfffff000; diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 843aa10a4cb6..da0fb17a1a36 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -448,13 +448,7 @@ static void __init pti_clone_user_shared(void) * the sp1 and sp2 slots. * * This is done for all possible CPUs during boot to ensure - * that it's propagated to all mms. If we were to add one of - * these mappings during CPU hotplug, we would need to take - * some measure to make sure that every mm that subsequently - * ran on that CPU would have the relevant PGD entry in its - * pagetables. The usual vmalloc_fault() mechanism would not - * work for page faults taken in entry_SYSCALL_64 before RSP - * is set up. + * that it's propagated to all mms. */ unsigned long va = (unsigned long)&per_cpu(cpu_tss_rw, cpu); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 66f96f21a7b6..1a3569b43aa5 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -18,6 +18,16 @@ #include "mm_internal.h" +#ifdef CONFIG_PARAVIRT +# define STATIC_NOPV +#else +# define STATIC_NOPV static +# define __flush_tlb_local native_flush_tlb_local +# define __flush_tlb_global native_flush_tlb_global +# define __flush_tlb_one_user(addr) native_flush_tlb_one_user(addr) +# define __flush_tlb_others(msk, info) native_flush_tlb_others(msk, info) +#endif + /* * TLB flushing, formerly SMP-only * c/o Linus Torvalds. @@ -39,6 +49,126 @@ #define LAST_USER_MM_IBPB 0x1UL /* + * The x86 feature is called PCID (Process Context IDentifier). It is similar + * to what is traditionally called ASID on the RISC processors. + * + * We don't use the traditional ASID implementation, where each process/mm gets + * its own ASID and flush/restart when we run out of ASID space. + * + * Instead we have a small per-cpu array of ASIDs and cache the last few mm's + * that came by on this CPU, allowing cheaper switch_mm between processes on + * this CPU. 
+ * + * We end up with different spaces for different things. To avoid confusion we + * use different names for each of them: + * + * ASID - [0, TLB_NR_DYN_ASIDS-1] + * the canonical identifier for an mm + * + * kPCID - [1, TLB_NR_DYN_ASIDS] + * the value we write into the PCID part of CR3; corresponds to the + * ASID+1, because PCID 0 is special. + * + * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS] + * for KPTI each mm has two address spaces and thus needs two + * PCID values, but we can still do with a single ASID denomination + * for each mm. Corresponds to kPCID + 2048. + * + */ + +/* There are 12 bits of space for ASIDS in CR3 */ +#define CR3_HW_ASID_BITS 12 + +/* + * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for + * user/kernel switches + */ +#ifdef CONFIG_PAGE_TABLE_ISOLATION +# define PTI_CONSUMED_PCID_BITS 1 +#else +# define PTI_CONSUMED_PCID_BITS 0 +#endif + +#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS) + +/* + * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account + * for them being zero-based. Another -1 is because PCID 0 is reserved for + * use by non-PCID-aware users. + */ +#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2) + +/* + * Given @asid, compute kPCID + */ +static inline u16 kern_pcid(u16 asid) +{ + VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); + +#ifdef CONFIG_PAGE_TABLE_ISOLATION + /* + * Make sure that the dynamic ASID space does not confict with the + * bit we are using to switch between user and kernel ASIDs. + */ + BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT)); + + /* + * The ASID being passed in here should have respected the + * MAX_ASID_AVAILABLE and thus never have the switch bit set. + */ + VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT)); +#endif + /* + * The dynamically-assigned ASIDs that get passed in are small + * (<TLB_NR_DYN_ASIDS). They never have the high switch bit set, + * so do not bother to clear it. + * + * If PCID is on, ASID-aware code paths put the ASID+1 into the + * PCID bits. This serves two purposes. It prevents a nasty + * situation in which PCID-unaware code saves CR3, loads some other + * value (with PCID == 0), and then restores CR3, thus corrupting + * the TLB for ASID 0 if the saved ASID was nonzero. It also means + * that any bugs involving loading a PCID-enabled CR3 with + * CR4.PCIDE off will trigger deterministically. + */ + return asid + 1; +} + +/* + * Given @asid, compute uPCID + */ +static inline u16 user_pcid(u16 asid) +{ + u16 ret = kern_pcid(asid); +#ifdef CONFIG_PAGE_TABLE_ISOLATION + ret |= 1 << X86_CR3_PTI_PCID_USER_BIT; +#endif + return ret; +} + +static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) +{ + if (static_cpu_has(X86_FEATURE_PCID)) { + return __sme_pa(pgd) | kern_pcid(asid); + } else { + VM_WARN_ON_ONCE(asid != 0); + return __sme_pa(pgd); + } +} + +static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) +{ + VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); + /* + * Use boot_cpu_has() instead of this_cpu_has() as this function + * might be called during early boot. This should work even after + * boot because all CPU's the have same capabilities: + */ + VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID)); + return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH; +} + +/* * We get here when we do something requiring a TLB invalidation * but could not go invalidate all of the contexts. 
We do the * necessary invalidation by clearing out the 'ctx_id' which @@ -110,6 +240,32 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, *need_flush = true; } +/* + * Given an ASID, flush the corresponding user ASID. We can delay this + * until the next time we switch to it. + * + * See SWITCH_TO_USER_CR3. + */ +static inline void invalidate_user_asid(u16 asid) +{ + /* There is no user ASID if address space separation is off */ + if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + return; + + /* + * We only have a single ASID if PCID is off and the CR3 + * write will have flushed it. + */ + if (!cpu_feature_enabled(X86_FEATURE_PCID)) + return; + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + __set_bit(kern_pcid(asid), + (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask)); +} + static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush) { unsigned long new_mm_cr3; @@ -161,34 +317,6 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, local_irq_restore(flags); } -static void sync_current_stack_to_mm(struct mm_struct *mm) -{ - unsigned long sp = current_stack_pointer; - pgd_t *pgd = pgd_offset(mm, sp); - - if (pgtable_l5_enabled()) { - if (unlikely(pgd_none(*pgd))) { - pgd_t *pgd_ref = pgd_offset_k(sp); - - set_pgd(pgd, *pgd_ref); - } - } else { - /* - * "pgd" is faked. The top level entries are "p4d"s, so sync - * the p4d. This compiles to approximately the same code as - * the 5-level case. - */ - p4d_t *p4d = p4d_offset(pgd, sp); - - if (unlikely(p4d_none(*p4d))) { - pgd_t *pgd_ref = pgd_offset_k(sp); - p4d_t *p4d_ref = p4d_offset(pgd_ref, sp); - - set_p4d(p4d, *p4d_ref); - } - } -} - static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next) { unsigned long next_tif = task_thread_info(next)->flags; @@ -272,6 +400,26 @@ static void cond_ibpb(struct task_struct *next) } } +#ifdef CONFIG_PERF_EVENTS +static inline void cr4_update_pce_mm(struct mm_struct *mm) +{ + if (static_branch_unlikely(&rdpmc_always_available_key) || + (!static_branch_unlikely(&rdpmc_never_available_key) && + atomic_read(&mm->context.perf_rdpmc_allowed))) + cr4_set_bits_irqsoff(X86_CR4_PCE); + else + cr4_clear_bits_irqsoff(X86_CR4_PCE); +} + +void cr4_update_pce(void *ignored) +{ + cr4_update_pce_mm(this_cpu_read(cpu_tlbstate.loaded_mm)); +} + +#else +static inline void cr4_update_pce_mm(struct mm_struct *mm) { } +#endif + void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { @@ -377,15 +525,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, */ cond_ibpb(tsk); - if (IS_ENABLED(CONFIG_VMAP_STACK)) { - /* - * If our current stack is in vmalloc space and isn't - * mapped in the new pgd, we'll double-fault. Forcibly - * map it. - */ - sync_current_stack_to_mm(next); - } - /* * Stop remote flushes for the previous mm. 
* Skip kernel threads; we never send init_mm TLB flushing IPIs, @@ -440,7 +579,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); if (next != real_prev) { - load_mm_cr4_irqsoff(next); + cr4_update_pce_mm(next); switch_ldt(real_prev, next); } } @@ -617,7 +756,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, unsigned long addr = f->start; while (addr < f->end) { - __flush_tlb_one_user(addr); + flush_tlb_one_user(addr); addr += 1UL << f->stride_shift; } if (local) @@ -625,7 +764,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, trace_tlb_flush(reason, nr_invalidate); } else { /* Full flush. */ - local_flush_tlb(); + flush_tlb_local(); if (local) count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); trace_tlb_flush(reason, TLB_FLUSH_ALL); @@ -660,8 +799,8 @@ static bool tlb_is_not_lazy(int cpu, void *data) return !per_cpu(cpu_tlbstate.is_lazy, cpu); } -void native_flush_tlb_others(const struct cpumask *cpumask, - const struct flush_tlb_info *info) +STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info) { count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); if (info->end == TLB_FLUSH_ALL) @@ -711,6 +850,12 @@ void native_flush_tlb_others(const struct cpumask *cpumask, (void *)info, 1, cpumask); } +void flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info) +{ + __flush_tlb_others(cpumask, info); +} + /* * See Documentation/x86/tlb.rst for details. We choose 33 * because it is large enough to cover the vast majority (at @@ -821,7 +966,7 @@ static void do_kernel_range_flush(void *info) /* flush range by one by one 'invlpg' */ for (addr = f->start; addr < f->end; addr += PAGE_SIZE) - __flush_tlb_one_kernel(addr); + flush_tlb_one_kernel(addr); } void flush_tlb_kernel_range(unsigned long start, unsigned long end) @@ -844,6 +989,164 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) } /* + * This can be used from process context to figure out what the value of + * CR3 is without needing to do a (slow) __read_cr3(). + * + * It's intended to be used for code like KVM that sneakily changes CR3 + * and needs to restore it. It needs to be used very carefully. + */ +unsigned long __get_current_cr3_fast(void) +{ + unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd, + this_cpu_read(cpu_tlbstate.loaded_mm_asid)); + + /* For now, be very restrictive about when this can be called. */ + VM_WARN_ON(in_nmi() || preemptible()); + + VM_BUG_ON(cr3 != __read_cr3()); + return cr3; +} +EXPORT_SYMBOL_GPL(__get_current_cr3_fast); + +/* + * Flush one page in the kernel mapping + */ +void flush_tlb_one_kernel(unsigned long addr) +{ + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); + + /* + * If PTI is off, then __flush_tlb_one_user() is just INVLPG or its + * paravirt equivalent. Even with PCID, this is sufficient: we only + * use PCID if we also use global PTEs for the kernel mapping, and + * INVLPG flushes global translations across all address spaces. + * + * If PTI is on, then the kernel is mapped with non-global PTEs, and + * __flush_tlb_one_user() will flush the given address for the current + * kernel address space and for its usermode counterpart, but it does + * not flush it for other address spaces. + */ + flush_tlb_one_user(addr); + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + /* + * See above. We need to propagate the flush to all other address + * spaces. 
In principle, we only need to propagate it to kernelmode + * address spaces, but the extra bookkeeping we would need is not + * worth it. + */ + this_cpu_write(cpu_tlbstate.invalidate_other, true); +} + +/* + * Flush one page in the user mapping + */ +STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr) +{ + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + /* + * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1. + * Just use invalidate_user_asid() in case we are called early. + */ + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) + invalidate_user_asid(loaded_mm_asid); + else + invpcid_flush_one(user_pcid(loaded_mm_asid), addr); +} + +void flush_tlb_one_user(unsigned long addr) +{ + __flush_tlb_one_user(addr); +} + +/* + * Flush everything + */ +STATIC_NOPV void native_flush_tlb_global(void) +{ + unsigned long cr4, flags; + + if (static_cpu_has(X86_FEATURE_INVPCID)) { + /* + * Using INVPCID is considerably faster than a pair of writes + * to CR4 sandwiched inside an IRQ flag save/restore. + * + * Note, this works with CR4.PCIDE=0 or 1. + */ + invpcid_flush_all(); + return; + } + + /* + * Read-modify-write to CR4 - protect it from preemption and + * from interrupts. (Use the raw variant because this code can + * be called from deep inside debugging code.) + */ + raw_local_irq_save(flags); + + cr4 = this_cpu_read(cpu_tlbstate.cr4); + /* toggle PGE */ + native_write_cr4(cr4 ^ X86_CR4_PGE); + /* write old PGE again and flush TLBs */ + native_write_cr4(cr4); + + raw_local_irq_restore(flags); +} + +/* + * Flush the entire current user mapping + */ +STATIC_NOPV void native_flush_tlb_local(void) +{ + /* + * Preemption or interrupts must be disabled to protect the access + * to the per CPU variable and to prevent being preempted between + * read_cr3() and write_cr3(). + */ + WARN_ON_ONCE(preemptible()); + + invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid)); + + /* If current->mm == NULL then the read_cr3() "borrows" an mm */ + native_write_cr3(__native_read_cr3()); +} + +void flush_tlb_local(void) +{ + __flush_tlb_local(); +} + +/* + * Flush everything + */ +void __flush_tlb_all(void) +{ + /* + * This is to catch users with enabled preemption and the PGE feature + * and don't trigger the warning in __native_flush_tlb(). + */ + VM_WARN_ON_ONCE(preemptible()); + + if (boot_cpu_has(X86_FEATURE_PGE)) { + __flush_tlb_global(); + } else { + /* + * !PGE -> !PCID (setup_pcid()), thus every flush is total. + */ + flush_tlb_local(); + } +} +EXPORT_SYMBOL_GPL(__flush_tlb_all); + +/* * arch_tlbbatch_flush() performs a full TLB flush regardless of the active mm. * This means that the 'struct flush_tlb_info' that describes which mappings to * flush is actually fixed. We therefore set a single fixed struct and use it in @@ -874,6 +1177,38 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) put_cpu(); } +/* + * Blindly accessing user memory from NMI context can be dangerous + * if we're in the middle of switching the current user task or + * switching the loaded mm. It can also be dangerous if we + * interrupted some kernel code that was temporarily using a + * different mm. 
+ */ +bool nmi_uaccess_okay(void) +{ + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + struct mm_struct *current_mm = current->mm; + + VM_WARN_ON_ONCE(!loaded_mm); + + /* + * The condition we want to check is + * current_mm->pgd == __va(read_cr3_pa()). This may be slow, though, + * if we're running in a VM with shadow paging, and nmi_uaccess_okay() + * is supposed to be reasonably fast. + * + * Instead, we check the almost equivalent but somewhat conservative + * condition below, and we rely on the fact that switch_mm_irqs_off() + * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3. + */ + if (loaded_mm != current_mm) + return false; + + VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa())); + + return true; +} + static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { |