diff options
Diffstat (limited to 'arch/x86/mm')
-rw-r--r-- | arch/x86/mm/Makefile | 2 | ||||
-rw-r--r-- | arch/x86/mm/dump_pagetables.c | 93 | ||||
-rw-r--r-- | arch/x86/mm/extable.c | 44 | ||||
-rw-r--r-- | arch/x86/mm/fault.c | 75 | ||||
-rw-r--r-- | arch/x86/mm/hugetlbpage.c | 27 | ||||
-rw-r--r-- | arch/x86/mm/ident_map.c | 12 | ||||
-rw-r--r-- | arch/x86/mm/init.c | 2 | ||||
-rw-r--r-- | arch/x86/mm/ioremap.c | 287 | ||||
-rw-r--r-- | arch/x86/mm/kasan_init_64.c | 6 | ||||
-rw-r--r-- | arch/x86/mm/mem_encrypt.c | 593 | ||||
-rw-r--r-- | arch/x86/mm/mem_encrypt_boot.S | 149 | ||||
-rw-r--r-- | arch/x86/mm/mmap.c | 12 | ||||
-rw-r--r-- | arch/x86/mm/mpx.c | 33 | ||||
-rw-r--r-- | arch/x86/mm/numa_emulation.c | 55 | ||||
-rw-r--r-- | arch/x86/mm/pageattr.c | 67 | ||||
-rw-r--r-- | arch/x86/mm/pat.c | 9 | ||||
-rw-r--r-- | arch/x86/mm/pgtable.c | 8 | ||||
-rw-r--r-- | arch/x86/mm/tlb.c | 331 |
18 files changed, 1569 insertions, 236 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 0fbdcb64f9f8..72bf8c01c6e3 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -39,3 +39,5 @@ obj-$(CONFIG_X86_INTEL_MPX) += mpx.o obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 0470826d2bdc..5e3ac6fe6c9e 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -13,12 +13,12 @@ */ #include <linux/debugfs.h> +#include <linux/kasan.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/seq_file.h> -#include <asm/kasan.h> #include <asm/pgtable.h> /* @@ -138,7 +138,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) { pgprotval_t pr = pgprot_val(prot); static const char * const level_name[] = - { "cr3", "pgd", "pud", "pmd", "pte" }; + { "cr3", "pgd", "p4d", "pud", "pmd", "pte" }; if (!pgprot_val(prot)) { /* Not present */ @@ -162,12 +162,12 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) pt_dump_cont_printf(m, dmsg, " "); /* Bit 7 has a different meaning on level 3 vs 4 */ - if (level <= 3 && pr & _PAGE_PSE) + if (level <= 4 && pr & _PAGE_PSE) pt_dump_cont_printf(m, dmsg, "PSE "); else pt_dump_cont_printf(m, dmsg, " "); - if ((level == 4 && pr & _PAGE_PAT) || - ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE)) + if ((level == 5 && pr & _PAGE_PAT) || + ((level == 4 || level == 3) && pr & _PAGE_PAT_LARGE)) pt_dump_cont_printf(m, dmsg, "PAT "); else pt_dump_cont_printf(m, dmsg, " "); @@ -188,11 +188,12 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) */ static unsigned long normalize_addr(unsigned long u) { -#ifdef CONFIG_X86_64 - return (signed long)(u << 16) >> 16; -#else - return u; -#endif + int shift; + if (!IS_ENABLED(CONFIG_X86_64)) + return u; + + shift = 64 - (__VIRTUAL_MASK_SHIFT + 1); + return (signed long)(u << shift) >> shift; } /* @@ -297,32 +298,62 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, for (i = 0; i < PTRS_PER_PTE; i++) { prot = pte_flags(*start); st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); - note_page(m, st, __pgprot(prot), 4); + note_page(m, st, __pgprot(prot), 5); start++; } } +#ifdef CONFIG_KASAN + +/* + * This is an optimization for KASAN=y case. Since all kasan page tables + * eventually point to the kasan_zero_page we could call note_page() + * right away without walking through lower level page tables. This saves + * us dozens of seconds (minutes for 5-level config) while checking for + * W+X mapping or reading kernel_page_tables debugfs file. + */ +static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st, + void *pt) +{ + if (__pa(pt) == __pa(kasan_zero_pmd) || +#ifdef CONFIG_X86_5LEVEL + __pa(pt) == __pa(kasan_zero_p4d) || +#endif + __pa(pt) == __pa(kasan_zero_pud)) { + pgprotval_t prot = pte_flags(kasan_zero_pte[0]); + note_page(m, st, __pgprot(prot), 5); + return true; + } + return false; +} +#else +static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st, + void *pt) +{ + return false; +} +#endif #if PTRS_PER_PMD > 1 static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P) { int i; - pmd_t *start; + pmd_t *start, *pmd_start; pgprotval_t prot; - start = (pmd_t *)pud_page_vaddr(addr); + pmd_start = start = (pmd_t *)pud_page_vaddr(addr); for (i = 0; i < PTRS_PER_PMD; i++) { st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); if (!pmd_none(*start)) { if (pmd_large(*start) || !pmd_present(*start)) { prot = pmd_flags(*start); - note_page(m, st, __pgprot(prot), 3); - } else { + note_page(m, st, __pgprot(prot), 4); + } else if (!kasan_page_table(m, st, pmd_start)) { walk_pte_level(m, st, *start, P + i * PMD_LEVEL_MULT); } } else - note_page(m, st, __pgprot(0), 3); + note_page(m, st, __pgprot(0), 4); start++; } } @@ -335,39 +366,27 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, #if PTRS_PER_PUD > 1 -/* - * This is an optimization for CONFIG_DEBUG_WX=y + CONFIG_KASAN=y - * KASAN fills page tables with the same values. Since there is no - * point in checking page table more than once we just skip repeated - * entries. This saves us dozens of seconds during boot. - */ -static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx) -{ - return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud)); -} - static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P) { int i; - pud_t *start; + pud_t *start, *pud_start; pgprotval_t prot; pud_t *prev_pud = NULL; - start = (pud_t *)p4d_page_vaddr(addr); + pud_start = start = (pud_t *)p4d_page_vaddr(addr); for (i = 0; i < PTRS_PER_PUD; i++) { st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); - if (!pud_none(*start) && - !pud_already_checked(prev_pud, start, st->check_wx)) { + if (!pud_none(*start)) { if (pud_large(*start) || !pud_present(*start)) { prot = pud_flags(*start); - note_page(m, st, __pgprot(prot), 2); - } else { + note_page(m, st, __pgprot(prot), 3); + } else if (!kasan_page_table(m, st, pud_start)) { walk_pmd_level(m, st, *start, P + i * PUD_LEVEL_MULT); } } else - note_page(m, st, __pgprot(0), 2); + note_page(m, st, __pgprot(0), 3); prev_pud = start; start++; @@ -385,10 +404,10 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P) { int i; - p4d_t *start; + p4d_t *start, *p4d_start; pgprotval_t prot; - start = (p4d_t *)pgd_page_vaddr(addr); + p4d_start = start = (p4d_t *)pgd_page_vaddr(addr); for (i = 0; i < PTRS_PER_P4D; i++) { st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT); @@ -396,7 +415,7 @@ static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, if (p4d_large(*start) || !p4d_present(*start)) { prot = p4d_flags(*start); note_page(m, st, __pgprot(prot), 2); - } else { + } else if (!kasan_page_table(m, st, p4d_start)) { walk_pud_level(m, st, *start, P + i * P4D_LEVEL_MULT); } diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 0ea8afcb929c..c076f710de4c 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -36,6 +36,48 @@ bool ex_handler_fault(const struct exception_table_entry *fixup, } EXPORT_SYMBOL_GPL(ex_handler_fault); +/* + * Handler for UD0 exception following a failed test against the + * result of a refcount inc/dec/add/sub. + */ +bool ex_handler_refcount(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + /* First unconditionally saturate the refcount. */ + *(int *)regs->cx = INT_MIN / 2; + + /* + * Strictly speaking, this reports the fixup destination, not + * the fault location, and not the actually overflowing + * instruction, which is the instruction before the "js", but + * since that instruction could be a variety of lengths, just + * report the location after the overflow, which should be close + * enough for finding the overflow, as it's at least back in + * the function, having returned from .text.unlikely. + */ + regs->ip = ex_fixup_addr(fixup); + + /* + * This function has been called because either a negative refcount + * value was seen by any of the refcount functions, or a zero + * refcount value was seen by refcount_dec(). + * + * If we crossed from INT_MAX to INT_MIN, OF (Overflow Flag: result + * wrapped around) will be set. Additionally, seeing the refcount + * reach 0 will set ZF (Zero Flag: result was zero). In each of + * these cases we want a report, since it's a boundary condition. + * + */ + if (regs->flags & (X86_EFLAGS_OF | X86_EFLAGS_ZF)) { + bool zero = regs->flags & X86_EFLAGS_ZF; + + refcount_error_report(regs, zero ? "hit zero" : "overflow"); + } + + return true; +} +EXPORT_SYMBOL_GPL(ex_handler_refcount); + bool ex_handler_ext(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr) { @@ -142,7 +184,7 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr) * undefined. I'm not sure which CPUs do this, but at least * the 486 DX works this way. */ - if ((regs->cs & 0xFFFF) != __KERNEL_CS) + if (regs->cs != __KERNEL_CS) goto fail; /* diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 2a1fa10c6a98..b836a7274e12 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -396,14 +396,18 @@ static void dump_pagetable(unsigned long address) pte_t *pte; #ifdef CONFIG_X86_PAE - printk("*pdpt = %016Lx ", pgd_val(*pgd)); + pr_info("*pdpt = %016Lx ", pgd_val(*pgd)); if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) goto out; +#define pr_pde pr_cont +#else +#define pr_pde pr_info #endif p4d = p4d_offset(pgd, address); pud = pud_offset(p4d, address); pmd = pmd_offset(pud, address); - printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); + pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); +#undef pr_pde /* * We must not directly access the pte in the highpte @@ -415,9 +419,9 @@ static void dump_pagetable(unsigned long address) goto out; pte = pte_offset_kernel(pmd, address); - printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); + pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); out: - printk("\n"); + pr_cont("\n"); } #else /* CONFIG_X86_64: */ @@ -565,7 +569,7 @@ static void dump_pagetable(unsigned long address) if (bad_address(pgd)) goto bad; - printk("PGD %lx ", pgd_val(*pgd)); + pr_info("PGD %lx ", pgd_val(*pgd)); if (!pgd_present(*pgd)) goto out; @@ -574,7 +578,7 @@ static void dump_pagetable(unsigned long address) if (bad_address(p4d)) goto bad; - printk("P4D %lx ", p4d_val(*p4d)); + pr_cont("P4D %lx ", p4d_val(*p4d)); if (!p4d_present(*p4d) || p4d_large(*p4d)) goto out; @@ -582,7 +586,7 @@ static void dump_pagetable(unsigned long address) if (bad_address(pud)) goto bad; - printk("PUD %lx ", pud_val(*pud)); + pr_cont("PUD %lx ", pud_val(*pud)); if (!pud_present(*pud) || pud_large(*pud)) goto out; @@ -590,7 +594,7 @@ static void dump_pagetable(unsigned long address) if (bad_address(pmd)) goto bad; - printk("PMD %lx ", pmd_val(*pmd)); + pr_cont("PMD %lx ", pmd_val(*pmd)); if (!pmd_present(*pmd) || pmd_large(*pmd)) goto out; @@ -598,12 +602,12 @@ static void dump_pagetable(unsigned long address) if (bad_address(pte)) goto bad; - printk("PTE %lx", pte_val(*pte)); + pr_cont("PTE %lx", pte_val(*pte)); out: - printk("\n"); + pr_cont("\n"); return; bad: - printk("BAD\n"); + pr_info("BAD\n"); } #endif /* CONFIG_X86_64 */ @@ -1254,10 +1258,6 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs) * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate * routines. - * - * This function must have noinline because both callers - * {,trace_}do_page_fault() have notrace on. Having this an actual function - * guarantees there's a function trace entry. */ static noinline void __do_page_fault(struct pt_regs *regs, unsigned long error_code, @@ -1490,27 +1490,6 @@ good_area: } NOKPROBE_SYMBOL(__do_page_fault); -dotraplinkage void notrace -do_page_fault(struct pt_regs *regs, unsigned long error_code) -{ - unsigned long address = read_cr2(); /* Get the faulting address */ - enum ctx_state prev_state; - - /* - * We must have this function tagged with __kprobes, notrace and call - * read_cr2() before calling anything else. To avoid calling any kind - * of tracing machinery before we've observed the CR2 value. - * - * exception_{enter,exit}() contain all sorts of tracepoints. - */ - - prev_state = exception_enter(); - __do_page_fault(regs, error_code, address); - exception_exit(prev_state); -} -NOKPROBE_SYMBOL(do_page_fault); - -#ifdef CONFIG_TRACING static nokprobe_inline void trace_page_fault_entries(unsigned long address, struct pt_regs *regs, unsigned long error_code) @@ -1521,22 +1500,24 @@ trace_page_fault_entries(unsigned long address, struct pt_regs *regs, trace_page_fault_kernel(address, regs, error_code); } +/* + * We must have this function blacklisted from kprobes, tagged with notrace + * and call read_cr2() before calling anything else. To avoid calling any + * kind of tracing machinery before we've observed the CR2 value. + * + * exception_{enter,exit}() contains all sorts of tracepoints. + */ dotraplinkage void notrace -trace_do_page_fault(struct pt_regs *regs, unsigned long error_code) +do_page_fault(struct pt_regs *regs, unsigned long error_code) { - /* - * The exception_enter and tracepoint processing could - * trigger another page faults (user space callchain - * reading) and destroy the original cr2 value, so read - * the faulting address now. - */ - unsigned long address = read_cr2(); + unsigned long address = read_cr2(); /* Get the faulting address */ enum ctx_state prev_state; prev_state = exception_enter(); - trace_page_fault_entries(address, regs, error_code); + if (trace_pagefault_enabled()) + trace_page_fault_entries(address, regs, error_code); + __do_page_fault(regs, error_code, address); exception_exit(prev_state); } -NOKPROBE_SYMBOL(trace_do_page_fault); -#endif /* CONFIG_TRACING */ +NOKPROBE_SYMBOL(do_page_fault); diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 2824607df108..6d06cf33e3de 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -18,6 +18,7 @@ #include <asm/tlbflush.h> #include <asm/pgalloc.h> #include <asm/elf.h> +#include <asm/mpx.h> #if 0 /* This is just for testing */ struct page * @@ -85,25 +86,38 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, info.flags = 0; info.length = len; info.low_limit = get_mmap_base(1); + + /* + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area + * in the full address space. + */ info.high_limit = in_compat_syscall() ? - tasksize_32bit() : tasksize_64bit(); + task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW); + info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; return vm_unmapped_area(&info); } static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, - unsigned long addr0, unsigned long len, + unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct hstate *h = hstate_file(file); struct vm_unmapped_area_info info; - unsigned long addr; info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = get_mmap_base(0); + + /* + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area + * in the full address space. + */ + if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall()) + info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; addr = vm_unmapped_area(&info); @@ -118,7 +132,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = TASK_SIZE; + info.high_limit = TASK_SIZE_LOW; addr = vm_unmapped_area(&info); } @@ -135,6 +149,11 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (len & ~huge_page_mask(h)) return -EINVAL; + + addr = mpx_unmapped_area_check(addr, len, flags); + if (IS_ERR_VALUE(addr)) + return addr; + if (len > TASK_SIZE) return -ENOMEM; diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index adab1595f4bd..31cea988fa36 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -51,7 +51,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, if (!pmd) return -ENOMEM; ident_pmd_init(info, pmd, addr, next); - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + set_pud(pud, __pud(__pa(pmd) | info->kernpg_flag)); } return 0; @@ -79,7 +79,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page, if (!pud) return -ENOMEM; ident_pud_init(info, pud, addr, next); - set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); + set_p4d(p4d, __p4d(__pa(pud) | info->kernpg_flag)); } return 0; @@ -93,6 +93,10 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, unsigned long next; int result; + /* Set the default pagetable flags if not supplied */ + if (!info->kernpg_flag) + info->kernpg_flag = _KERNPG_TABLE; + for (; addr < end; addr = next) { pgd_t *pgd = pgd_page + pgd_index(addr); p4d_t *p4d; @@ -116,14 +120,14 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, if (result) return result; if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); + set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag)); } else { /* * With p4d folded, pgd is equal to p4d. * The pgd entry has to point to the pud page table in this case. */ pud_t *pud = pud_offset(p4d, 0); - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + set_pgd(pgd, __pgd(__pa(pud) | info->kernpg_flag)); } } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index bf3f1065d6ad..7777ccc0e9f9 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -815,7 +815,7 @@ void __init zone_sizes_init(void) DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { .loaded_mm = &init_mm, - .state = 0, + .next_asid = 1, .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ }; EXPORT_SYMBOL_GPL(cpu_tlbstate); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 4c1b5fd0c7ad..34f0e1847dd6 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -13,6 +13,8 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mmiotrace.h> +#include <linux/mem_encrypt.h> +#include <linux/efi.h> #include <asm/set_memory.h> #include <asm/e820/api.h> @@ -21,6 +23,7 @@ #include <asm/tlbflush.h> #include <asm/pgalloc.h> #include <asm/pat.h> +#include <asm/setup.h> #include "physaddr.h" @@ -106,12 +109,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, } /* - * Don't remap the low PCI/ISA area, it's always mapped.. - */ - if (is_ISA_range(phys_addr, last_addr)) - return (__force void __iomem *)phys_to_virt(phys_addr); - - /* * Don't allow anybody to remap normal RAM that we're using.. */ pfn = phys_addr >> PAGE_SHIFT; @@ -340,13 +337,17 @@ void iounmap(volatile void __iomem *addr) return; /* - * __ioremap special-cases the PCI/ISA range by not instantiating a - * vm_area and by simply returning an address into the kernel mapping - * of ISA space. So handle that here. + * The PCI/ISA range special-casing was removed from __ioremap() + * so this check, in theory, can be removed. However, there are + * cases where iounmap() is called for addresses not obtained via + * ioremap() (vga16fb for example). Add a warning so that these + * cases can be caught and fixed. */ if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) && - (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) + (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) { + WARN(1, "iounmap() called for ISA range not obtained using ioremap()\n"); return; + } addr = (volatile void __iomem *) (PAGE_MASK & (unsigned long __force)addr); @@ -399,12 +400,10 @@ void *xlate_dev_mem_ptr(phys_addr_t phys) unsigned long offset = phys & ~PAGE_MASK; void *vaddr; - /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */ - if (page_is_ram(start >> PAGE_SHIFT)) - return __va(phys); + /* memremap() maps if RAM, otherwise falls back to ioremap() */ + vaddr = memremap(start, PAGE_SIZE, MEMREMAP_WB); - vaddr = ioremap_cache(start, PAGE_SIZE); - /* Only add the offset on success and return NULL if the ioremap() failed: */ + /* Only add the offset on success and return NULL if memremap() failed */ if (vaddr) vaddr += offset; @@ -413,11 +412,263 @@ void *xlate_dev_mem_ptr(phys_addr_t phys) void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) { - if (page_is_ram(phys >> PAGE_SHIFT)) - return; + memunmap((void *)((unsigned long)addr & PAGE_MASK)); +} + +/* + * Examine the physical address to determine if it is an area of memory + * that should be mapped decrypted. If the memory is not part of the + * kernel usable area it was accessed and created decrypted, so these + * areas should be mapped decrypted. And since the encryption key can + * change across reboots, persistent memory should also be mapped + * decrypted. + */ +static bool memremap_should_map_decrypted(resource_size_t phys_addr, + unsigned long size) +{ + int is_pmem; + + /* + * Check if the address is part of a persistent memory region. + * This check covers areas added by E820, EFI and ACPI. + */ + is_pmem = region_intersects(phys_addr, size, IORESOURCE_MEM, + IORES_DESC_PERSISTENT_MEMORY); + if (is_pmem != REGION_DISJOINT) + return true; + + /* + * Check if the non-volatile attribute is set for an EFI + * reserved area. + */ + if (efi_enabled(EFI_BOOT)) { + switch (efi_mem_type(phys_addr)) { + case EFI_RESERVED_TYPE: + if (efi_mem_attributes(phys_addr) & EFI_MEMORY_NV) + return true; + break; + default: + break; + } + } + + /* Check if the address is outside kernel usable area */ + switch (e820__get_entry_type(phys_addr, phys_addr + size - 1)) { + case E820_TYPE_RESERVED: + case E820_TYPE_ACPI: + case E820_TYPE_NVS: + case E820_TYPE_UNUSABLE: + case E820_TYPE_PRAM: + return true; + default: + break; + } + + return false; +} + +/* + * Examine the physical address to determine if it is EFI data. Check + * it against the boot params structure and EFI tables and memory types. + */ +static bool memremap_is_efi_data(resource_size_t phys_addr, + unsigned long size) +{ + u64 paddr; + + /* Check if the address is part of EFI boot/runtime data */ + if (!efi_enabled(EFI_BOOT)) + return false; + + paddr = boot_params.efi_info.efi_memmap_hi; + paddr <<= 32; + paddr |= boot_params.efi_info.efi_memmap; + if (phys_addr == paddr) + return true; + + paddr = boot_params.efi_info.efi_systab_hi; + paddr <<= 32; + paddr |= boot_params.efi_info.efi_systab; + if (phys_addr == paddr) + return true; + + if (efi_is_table_address(phys_addr)) + return true; + + switch (efi_mem_type(phys_addr)) { + case EFI_BOOT_SERVICES_DATA: + case EFI_RUNTIME_SERVICES_DATA: + return true; + default: + break; + } + + return false; +} + +/* + * Examine the physical address to determine if it is boot data by checking + * it against the boot params setup_data chain. + */ +static bool memremap_is_setup_data(resource_size_t phys_addr, + unsigned long size) +{ + struct setup_data *data; + u64 paddr, paddr_next; + + paddr = boot_params.hdr.setup_data; + while (paddr) { + unsigned int len; + + if (phys_addr == paddr) + return true; + + data = memremap(paddr, sizeof(*data), + MEMREMAP_WB | MEMREMAP_DEC); + + paddr_next = data->next; + len = data->len; + + memunmap(data); + + if ((phys_addr > paddr) && (phys_addr < (paddr + len))) + return true; + + paddr = paddr_next; + } + + return false; +} + +/* + * Examine the physical address to determine if it is boot data by checking + * it against the boot params setup_data chain (early boot version). + */ +static bool __init early_memremap_is_setup_data(resource_size_t phys_addr, + unsigned long size) +{ + struct setup_data *data; + u64 paddr, paddr_next; + + paddr = boot_params.hdr.setup_data; + while (paddr) { + unsigned int len; + + if (phys_addr == paddr) + return true; + + data = early_memremap_decrypted(paddr, sizeof(*data)); + + paddr_next = data->next; + len = data->len; + + early_memunmap(data, sizeof(*data)); + + if ((phys_addr > paddr) && (phys_addr < (paddr + len))) + return true; + + paddr = paddr_next; + } + + return false; +} + +/* + * Architecture function to determine if RAM remap is allowed. By default, a + * RAM remap will map the data as encrypted. Determine if a RAM remap should + * not be done so that the data will be mapped decrypted. + */ +bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size, + unsigned long flags) +{ + if (!sme_active()) + return true; + + if (flags & MEMREMAP_ENC) + return true; + + if (flags & MEMREMAP_DEC) + return false; + + if (memremap_is_setup_data(phys_addr, size) || + memremap_is_efi_data(phys_addr, size) || + memremap_should_map_decrypted(phys_addr, size)) + return false; + + return true; +} + +/* + * Architecture override of __weak function to adjust the protection attributes + * used when remapping memory. By default, early_memremap() will map the data + * as encrypted. Determine if an encrypted mapping should not be done and set + * the appropriate protection attributes. + */ +pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr, + unsigned long size, + pgprot_t prot) +{ + if (!sme_active()) + return prot; + + if (early_memremap_is_setup_data(phys_addr, size) || + memremap_is_efi_data(phys_addr, size) || + memremap_should_map_decrypted(phys_addr, size)) + prot = pgprot_decrypted(prot); + else + prot = pgprot_encrypted(prot); + + return prot; +} + +bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size) +{ + return arch_memremap_can_ram_remap(phys_addr, size, 0); +} + +#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT +/* Remap memory with encryption */ +void __init *early_memremap_encrypted(resource_size_t phys_addr, + unsigned long size) +{ + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC); +} + +/* + * Remap memory with encryption and write-protected - cannot be called + * before pat_init() is called + */ +void __init *early_memremap_encrypted_wp(resource_size_t phys_addr, + unsigned long size) +{ + /* Be sure the write-protect PAT entry is set for write-protect */ + if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) + return NULL; + + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC_WP); +} + +/* Remap memory without encryption */ +void __init *early_memremap_decrypted(resource_size_t phys_addr, + unsigned long size) +{ + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC); +} + +/* + * Remap memory without encryption and write-protected - cannot be called + * before pat_init() is called + */ +void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, + unsigned long size) +{ + /* Be sure the write-protect PAT entry is set for write-protect */ + if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) + return NULL; - iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK)); + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC_WP); } +#endif /* CONFIG_ARCH_USE_MEMREMAP_PROT */ static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 02c9d7553409..bc84b73684b7 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -11,8 +11,8 @@ #include <asm/e820/types.h> #include <asm/tlbflush.h> #include <asm/sections.h> +#include <asm/pgtable.h> -extern pgd_t early_top_pgt[PTRS_PER_PGD]; extern struct range pfn_mapped[E820_MAX_ENTRIES]; static int __init map_range(struct range *range) @@ -87,7 +87,7 @@ static struct notifier_block kasan_die_notifier = { void __init kasan_early_init(void) { int i; - pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL; + pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL | _PAGE_ENC; pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE; pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE; p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE; @@ -153,7 +153,7 @@ void __init kasan_init(void) */ memset(kasan_zero_page, 0, PAGE_SIZE); for (i = 0; i < PTRS_PER_PTE; i++) { - pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO); + pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO | _PAGE_ENC); set_pte(&kasan_zero_pte[i], pte); } /* Flush TLBs again to be sure that write protection applied. */ diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c new file mode 100644 index 000000000000..0fbd09269757 --- /dev/null +++ b/arch/x86/mm/mem_encrypt.c @@ -0,0 +1,593 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/dma-mapping.h> +#include <linux/swiotlb.h> +#include <linux/mem_encrypt.h> + +#include <asm/tlbflush.h> +#include <asm/fixmap.h> +#include <asm/setup.h> +#include <asm/bootparam.h> +#include <asm/set_memory.h> +#include <asm/cacheflush.h> +#include <asm/sections.h> +#include <asm/processor-flags.h> +#include <asm/msr.h> +#include <asm/cmdline.h> + +static char sme_cmdline_arg[] __initdata = "mem_encrypt"; +static char sme_cmdline_on[] __initdata = "on"; +static char sme_cmdline_off[] __initdata = "off"; + +/* + * Since SME related variables are set early in the boot process they must + * reside in the .data section so as not to be zeroed out when the .bss + * section is later cleared. + */ +unsigned long sme_me_mask __section(.data) = 0; +EXPORT_SYMBOL_GPL(sme_me_mask); + +/* Buffer used for early in-place encryption by BSP, no locking needed */ +static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE); + +/* + * This routine does not change the underlying encryption setting of the + * page(s) that map this memory. It assumes that eventually the memory is + * meant to be accessed as either encrypted or decrypted but the contents + * are currently not in the desired state. + * + * This routine follows the steps outlined in the AMD64 Architecture + * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place. + */ +static void __init __sme_early_enc_dec(resource_size_t paddr, + unsigned long size, bool enc) +{ + void *src, *dst; + size_t len; + + if (!sme_me_mask) + return; + + local_flush_tlb(); + wbinvd(); + + /* + * There are limited number of early mapping slots, so map (at most) + * one page at time. + */ + while (size) { + len = min_t(size_t, sizeof(sme_early_buffer), size); + + /* + * Create mappings for the current and desired format of + * the memory. Use a write-protected mapping for the source. + */ + src = enc ? early_memremap_decrypted_wp(paddr, len) : + early_memremap_encrypted_wp(paddr, len); + + dst = enc ? early_memremap_encrypted(paddr, len) : + early_memremap_decrypted(paddr, len); + + /* + * If a mapping can't be obtained to perform the operation, + * then eventual access of that area in the desired mode + * will cause a crash. + */ + BUG_ON(!src || !dst); + + /* + * Use a temporary buffer, of cache-line multiple size, to + * avoid data corruption as documented in the APM. + */ + memcpy(sme_early_buffer, src, len); + memcpy(dst, sme_early_buffer, len); + + early_memunmap(dst, len); + early_memunmap(src, len); + + paddr += len; + size -= len; + } +} + +void __init sme_early_encrypt(resource_size_t paddr, unsigned long size) +{ + __sme_early_enc_dec(paddr, size, true); +} + +void __init sme_early_decrypt(resource_size_t paddr, unsigned long size) +{ + __sme_early_enc_dec(paddr, size, false); +} + +static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size, + bool map) +{ + unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET; + pmdval_t pmd_flags, pmd; + + /* Use early_pmd_flags but remove the encryption mask */ + pmd_flags = __sme_clr(early_pmd_flags); + + do { + pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0; + __early_make_pgtable((unsigned long)vaddr, pmd); + + vaddr += PMD_SIZE; + paddr += PMD_SIZE; + size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE; + } while (size); + + __native_flush_tlb(); +} + +void __init sme_unmap_bootdata(char *real_mode_data) +{ + struct boot_params *boot_data; + unsigned long cmdline_paddr; + + if (!sme_active()) + return; + + /* Get the command line address before unmapping the real_mode_data */ + boot_data = (struct boot_params *)real_mode_data; + cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32); + + __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false); + + if (!cmdline_paddr) + return; + + __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false); +} + +void __init sme_map_bootdata(char *real_mode_data) +{ + struct boot_params *boot_data; + unsigned long cmdline_paddr; + + if (!sme_active()) + return; + + __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true); + + /* Get the command line address after mapping the real_mode_data */ + boot_data = (struct boot_params *)real_mode_data; + cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32); + + if (!cmdline_paddr) + return; + + __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true); +} + +void __init sme_early_init(void) +{ + unsigned int i; + + if (!sme_me_mask) + return; + + early_pmd_flags = __sme_set(early_pmd_flags); + + __supported_pte_mask = __sme_set(__supported_pte_mask); + + /* Update the protection map with memory encryption mask */ + for (i = 0; i < ARRAY_SIZE(protection_map); i++) + protection_map[i] = pgprot_encrypted(protection_map[i]); +} + +/* Architecture __weak replacement functions */ +void __init mem_encrypt_init(void) +{ + if (!sme_me_mask) + return; + + /* Call into SWIOTLB to update the SWIOTLB DMA buffers */ + swiotlb_update_mem_attributes(); + + pr_info("AMD Secure Memory Encryption (SME) active\n"); +} + +void swiotlb_set_mem_attributes(void *vaddr, unsigned long size) +{ + WARN(PAGE_ALIGN(size) != size, + "size is not page-aligned (%#lx)\n", size); + + /* Make the SWIOTLB buffer area decrypted */ + set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT); +} + +static void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start, + unsigned long end) +{ + unsigned long pgd_start, pgd_end, pgd_size; + pgd_t *pgd_p; + + pgd_start = start & PGDIR_MASK; + pgd_end = end & PGDIR_MASK; + + pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1); + pgd_size *= sizeof(pgd_t); + + pgd_p = pgd_base + pgd_index(start); + + memset(pgd_p, 0, pgd_size); +} + +#define PGD_FLAGS _KERNPG_TABLE_NOENC +#define P4D_FLAGS _KERNPG_TABLE_NOENC +#define PUD_FLAGS _KERNPG_TABLE_NOENC +#define PMD_FLAGS (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL) + +static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area, + unsigned long vaddr, pmdval_t pmd_val) +{ + pgd_t *pgd_p; + p4d_t *p4d_p; + pud_t *pud_p; + pmd_t *pmd_p; + + pgd_p = pgd_base + pgd_index(vaddr); + if (native_pgd_val(*pgd_p)) { + if (IS_ENABLED(CONFIG_X86_5LEVEL)) + p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); + else + pud_p = (pud_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); + } else { + pgd_t pgd; + + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d_p = pgtable_area; + memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D); + pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D; + + pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS); + } else { + pud_p = pgtable_area; + memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); + pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; + + pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS); + } + native_set_pgd(pgd_p, pgd); + } + + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d_p += p4d_index(vaddr); + if (native_p4d_val(*p4d_p)) { + pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK); + } else { + p4d_t p4d; + + pud_p = pgtable_area; + memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); + pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; + + p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS); + native_set_p4d(p4d_p, p4d); + } + } + + pud_p += pud_index(vaddr); + if (native_pud_val(*pud_p)) { + if (native_pud_val(*pud_p) & _PAGE_PSE) + goto out; + + pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK); + } else { + pud_t pud; + + pmd_p = pgtable_area; + memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); + pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD; + + pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS); + native_set_pud(pud_p, pud); + } + + pmd_p += pmd_index(vaddr); + if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE)) + native_set_pmd(pmd_p, native_make_pmd(pmd_val)); + +out: + return pgtable_area; +} + +static unsigned long __init sme_pgtable_calc(unsigned long len) +{ + unsigned long p4d_size, pud_size, pmd_size; + unsigned long total; + + /* + * Perform a relatively simplistic calculation of the pagetable + * entries that are needed. That mappings will be covered by 2MB + * PMD entries so we can conservatively calculate the required + * number of P4D, PUD and PMD structures needed to perform the + * mappings. Incrementing the count for each covers the case where + * the addresses cross entries. + */ + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; + p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D; + pud_size = (ALIGN(len, P4D_SIZE) / P4D_SIZE) + 1; + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; + } else { + p4d_size = 0; + pud_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; + } + pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1; + pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; + + total = p4d_size + pud_size + pmd_size; + + /* + * Now calculate the added pagetable structures needed to populate + * the new pagetables. + */ + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE; + p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D; + pud_size = ALIGN(total, P4D_SIZE) / P4D_SIZE; + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; + } else { + p4d_size = 0; + pud_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE; + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; + } + pmd_size = ALIGN(total, PUD_SIZE) / PUD_SIZE; + pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; + + total += p4d_size + pud_size + pmd_size; + + return total; +} + +void __init sme_encrypt_kernel(void) +{ + unsigned long workarea_start, workarea_end, workarea_len; + unsigned long execute_start, execute_end, execute_len; + unsigned long kernel_start, kernel_end, kernel_len; + unsigned long pgtable_area_len; + unsigned long paddr, pmd_flags; + unsigned long decrypted_base; + void *pgtable_area; + pgd_t *pgd; + + if (!sme_active()) + return; + + /* + * Prepare for encrypting the kernel by building new pagetables with + * the necessary attributes needed to encrypt the kernel in place. + * + * One range of virtual addresses will map the memory occupied + * by the kernel as encrypted. + * + * Another range of virtual addresses will map the memory occupied + * by the kernel as decrypted and write-protected. + * + * The use of write-protect attribute will prevent any of the + * memory from being cached. + */ + + /* Physical addresses gives us the identity mapped virtual addresses */ + kernel_start = __pa_symbol(_text); + kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE); + kernel_len = kernel_end - kernel_start; + + /* Set the encryption workarea to be immediately after the kernel */ + workarea_start = kernel_end; + + /* + * Calculate required number of workarea bytes needed: + * executable encryption area size: + * stack page (PAGE_SIZE) + * encryption routine page (PAGE_SIZE) + * intermediate copy buffer (PMD_PAGE_SIZE) + * pagetable structures for the encryption of the kernel + * pagetable structures for workarea (in case not currently mapped) + */ + execute_start = workarea_start; + execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE; + execute_len = execute_end - execute_start; + + /* + * One PGD for both encrypted and decrypted mappings and a set of + * PUDs and PMDs for each of the encrypted and decrypted mappings. + */ + pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD; + pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2; + + /* PUDs and PMDs needed in the current pagetables for the workarea */ + pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len); + + /* + * The total workarea includes the executable encryption area and + * the pagetable area. + */ + workarea_len = execute_len + pgtable_area_len; + workarea_end = workarea_start + workarea_len; + + /* + * Set the address to the start of where newly created pagetable + * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable + * structures are created when the workarea is added to the current + * pagetables and when the new encrypted and decrypted kernel + * mappings are populated. + */ + pgtable_area = (void *)execute_end; + + /* + * Make sure the current pagetable structure has entries for + * addressing the workarea. + */ + pgd = (pgd_t *)native_read_cr3_pa(); + paddr = workarea_start; + while (paddr < workarea_end) { + pgtable_area = sme_populate_pgd(pgd, pgtable_area, + paddr, + paddr + PMD_FLAGS); + + paddr += PMD_PAGE_SIZE; + } + + /* Flush the TLB - no globals so cr3 is enough */ + native_write_cr3(__native_read_cr3()); + + /* + * A new pagetable structure is being built to allow for the kernel + * to be encrypted. It starts with an empty PGD that will then be + * populated with new PUDs and PMDs as the encrypted and decrypted + * kernel mappings are created. + */ + pgd = pgtable_area; + memset(pgd, 0, sizeof(*pgd) * PTRS_PER_PGD); + pgtable_area += sizeof(*pgd) * PTRS_PER_PGD; + + /* Add encrypted kernel (identity) mappings */ + pmd_flags = PMD_FLAGS | _PAGE_ENC; + paddr = kernel_start; + while (paddr < kernel_end) { + pgtable_area = sme_populate_pgd(pgd, pgtable_area, + paddr, + paddr + pmd_flags); + + paddr += PMD_PAGE_SIZE; + } + + /* + * A different PGD index/entry must be used to get different + * pagetable entries for the decrypted mapping. Choose the next + * PGD index and convert it to a virtual address to be used as + * the base of the mapping. + */ + decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1); + decrypted_base <<= PGDIR_SHIFT; + + /* Add decrypted, write-protected kernel (non-identity) mappings */ + pmd_flags = (PMD_FLAGS & ~_PAGE_CACHE_MASK) | (_PAGE_PAT | _PAGE_PWT); + paddr = kernel_start; + while (paddr < kernel_end) { + pgtable_area = sme_populate_pgd(pgd, pgtable_area, + paddr + decrypted_base, + paddr + pmd_flags); + + paddr += PMD_PAGE_SIZE; + } + + /* Add decrypted workarea mappings to both kernel mappings */ + paddr = workarea_start; + while (paddr < workarea_end) { + pgtable_area = sme_populate_pgd(pgd, pgtable_area, + paddr, + paddr + PMD_FLAGS); + + pgtable_area = sme_populate_pgd(pgd, pgtable_area, + paddr + decrypted_base, + paddr + PMD_FLAGS); + + paddr += PMD_PAGE_SIZE; + } + + /* Perform the encryption */ + sme_encrypt_execute(kernel_start, kernel_start + decrypted_base, + kernel_len, workarea_start, (unsigned long)pgd); + + /* + * At this point we are running encrypted. Remove the mappings for + * the decrypted areas - all that is needed for this is to remove + * the PGD entry/entries. + */ + sme_clear_pgd(pgd, kernel_start + decrypted_base, + kernel_end + decrypted_base); + + sme_clear_pgd(pgd, workarea_start + decrypted_base, + workarea_end + decrypted_base); + + /* Flush the TLB - no globals so cr3 is enough */ + native_write_cr3(__native_read_cr3()); +} + +void __init __nostackprotector sme_enable(struct boot_params *bp) +{ + const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off; + unsigned int eax, ebx, ecx, edx; + bool active_by_default; + unsigned long me_mask; + char buffer[16]; + u64 msr; + + /* Check for the SME support leaf */ + eax = 0x80000000; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (eax < 0x8000001f) + return; + + /* + * Check for the SME feature: + * CPUID Fn8000_001F[EAX] - Bit 0 + * Secure Memory Encryption support + * CPUID Fn8000_001F[EBX] - Bits 5:0 + * Pagetable bit position used to indicate encryption + */ + eax = 0x8000001f; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (!(eax & 1)) + return; + + me_mask = 1UL << (ebx & 0x3f); + + /* Check if SME is enabled */ + msr = __rdmsr(MSR_K8_SYSCFG); + if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + return; + + /* + * Fixups have not been applied to phys_base yet and we're running + * identity mapped, so we must obtain the address to the SME command + * line argument data using rip-relative addressing. + */ + asm ("lea sme_cmdline_arg(%%rip), %0" + : "=r" (cmdline_arg) + : "p" (sme_cmdline_arg)); + asm ("lea sme_cmdline_on(%%rip), %0" + : "=r" (cmdline_on) + : "p" (sme_cmdline_on)); + asm ("lea sme_cmdline_off(%%rip), %0" + : "=r" (cmdline_off) + : "p" (sme_cmdline_off)); + + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT)) + active_by_default = true; + else + active_by_default = false; + + cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr | + ((u64)bp->ext_cmd_line_ptr << 32)); + + cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)); + + if (!strncmp(buffer, cmdline_on, sizeof(buffer))) + sme_me_mask = me_mask; + else if (!strncmp(buffer, cmdline_off, sizeof(buffer))) + sme_me_mask = 0; + else + sme_me_mask = active_by_default ? me_mask : 0; +} diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S new file mode 100644 index 000000000000..730e6d541df1 --- /dev/null +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -0,0 +1,149 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/pgtable.h> +#include <asm/page.h> +#include <asm/processor-flags.h> +#include <asm/msr-index.h> + + .text + .code64 +ENTRY(sme_encrypt_execute) + + /* + * Entry parameters: + * RDI - virtual address for the encrypted kernel mapping + * RSI - virtual address for the decrypted kernel mapping + * RDX - length of kernel + * RCX - virtual address of the encryption workarea, including: + * - stack page (PAGE_SIZE) + * - encryption routine page (PAGE_SIZE) + * - intermediate copy buffer (PMD_PAGE_SIZE) + * R8 - physcial address of the pagetables to use for encryption + */ + + push %rbp + movq %rsp, %rbp /* RBP now has original stack pointer */ + + /* Set up a one page stack in the non-encrypted memory area */ + movq %rcx, %rax /* Workarea stack page */ + leaq PAGE_SIZE(%rax), %rsp /* Set new stack pointer */ + addq $PAGE_SIZE, %rax /* Workarea encryption routine */ + + push %r12 + movq %rdi, %r10 /* Encrypted kernel */ + movq %rsi, %r11 /* Decrypted kernel */ + movq %rdx, %r12 /* Kernel length */ + + /* Copy encryption routine into the workarea */ + movq %rax, %rdi /* Workarea encryption routine */ + leaq __enc_copy(%rip), %rsi /* Encryption routine */ + movq $(.L__enc_copy_end - __enc_copy), %rcx /* Encryption routine length */ + rep movsb + + /* Setup registers for call */ + movq %r10, %rdi /* Encrypted kernel */ + movq %r11, %rsi /* Decrypted kernel */ + movq %r8, %rdx /* Pagetables used for encryption */ + movq %r12, %rcx /* Kernel length */ + movq %rax, %r8 /* Workarea encryption routine */ + addq $PAGE_SIZE, %r8 /* Workarea intermediate copy buffer */ + + call *%rax /* Call the encryption routine */ + + pop %r12 + + movq %rbp, %rsp /* Restore original stack pointer */ + pop %rbp + + ret +ENDPROC(sme_encrypt_execute) + +ENTRY(__enc_copy) +/* + * Routine used to encrypt kernel. + * This routine must be run outside of the kernel proper since + * the kernel will be encrypted during the process. So this + * routine is defined here and then copied to an area outside + * of the kernel where it will remain and run decrypted + * during execution. + * + * On entry the registers must be: + * RDI - virtual address for the encrypted kernel mapping + * RSI - virtual address for the decrypted kernel mapping + * RDX - address of the pagetables to use for encryption + * RCX - length of kernel + * R8 - intermediate copy buffer + * + * RAX - points to this routine + * + * The kernel will be encrypted by copying from the non-encrypted + * kernel space to an intermediate buffer and then copying from the + * intermediate buffer back to the encrypted kernel space. The physical + * addresses of the two kernel space mappings are the same which + * results in the kernel being encrypted "in place". + */ + /* Enable the new page tables */ + mov %rdx, %cr3 + + /* Flush any global TLBs */ + mov %cr4, %rdx + andq $~X86_CR4_PGE, %rdx + mov %rdx, %cr4 + orq $X86_CR4_PGE, %rdx + mov %rdx, %cr4 + + /* Set the PAT register PA5 entry to write-protect */ + push %rcx + movl $MSR_IA32_CR_PAT, %ecx + rdmsr + push %rdx /* Save original PAT value */ + andl $0xffff00ff, %edx /* Clear PA5 */ + orl $0x00000500, %edx /* Set PA5 to WP */ + wrmsr + pop %rdx /* RDX contains original PAT value */ + pop %rcx + + movq %rcx, %r9 /* Save kernel length */ + movq %rdi, %r10 /* Save encrypted kernel address */ + movq %rsi, %r11 /* Save decrypted kernel address */ + + wbinvd /* Invalidate any cache entries */ + + /* Copy/encrypt 2MB at a time */ +1: + movq %r11, %rsi /* Source - decrypted kernel */ + movq %r8, %rdi /* Dest - intermediate copy buffer */ + movq $PMD_PAGE_SIZE, %rcx /* 2MB length */ + rep movsb + + movq %r8, %rsi /* Source - intermediate copy buffer */ + movq %r10, %rdi /* Dest - encrypted kernel */ + movq $PMD_PAGE_SIZE, %rcx /* 2MB length */ + rep movsb + + addq $PMD_PAGE_SIZE, %r11 + addq $PMD_PAGE_SIZE, %r10 + subq $PMD_PAGE_SIZE, %r9 /* Kernel length decrement */ + jnz 1b /* Kernel length not zero? */ + + /* Restore PAT register */ + push %rdx /* Save original PAT value */ + movl $MSR_IA32_CR_PAT, %ecx + rdmsr + pop %rdx /* Restore original PAT value */ + wrmsr + + ret +.L__enc_copy_end: +ENDPROC(__enc_copy) diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index a88cfbfbd078..a99679826846 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -37,21 +37,21 @@ struct va_alignment __read_mostly va_align = { .flags = -1, }; -unsigned long tasksize_32bit(void) +unsigned long task_size_32bit(void) { return IA32_PAGE_OFFSET; } -unsigned long tasksize_64bit(void) +unsigned long task_size_64bit(int full_addr_space) { - return TASK_SIZE_MAX; + return full_addr_space ? TASK_SIZE_MAX : DEFAULT_MAP_WINDOW; } static unsigned long stack_maxrandom_size(unsigned long task_size) { unsigned long max = 0; if (current->flags & PF_RANDOMIZE) { - max = (-1UL) & __STACK_RND_MASK(task_size == tasksize_32bit()); + max = (-1UL) & __STACK_RND_MASK(task_size == task_size_32bit()); max <<= PAGE_SHIFT; } @@ -141,7 +141,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) mm->get_unmapped_area = arch_get_unmapped_area_topdown; arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base, - arch_rnd(mmap64_rnd_bits), tasksize_64bit()); + arch_rnd(mmap64_rnd_bits), task_size_64bit(0)); #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES /* @@ -151,7 +151,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * mmap_base, the compat syscall uses mmap_compat_base. */ arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base, - arch_rnd(mmap32_rnd_bits), tasksize_32bit()); + arch_rnd(mmap32_rnd_bits), task_size_32bit()); #endif } diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 1c34b767c84c..9ceaa955d2ba 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -355,10 +355,19 @@ int mpx_enable_management(void) */ bd_base = mpx_get_bounds_dir(); down_write(&mm->mmap_sem); + + /* MPX doesn't support addresses above 47 bits yet. */ + if (find_vma(mm, DEFAULT_MAP_WINDOW)) { + pr_warn_once("%s (%d): MPX cannot handle addresses " + "above 47-bits. Disabling.", + current->comm, current->pid); + ret = -ENXIO; + goto out; + } mm->context.bd_addr = bd_base; if (mm->context.bd_addr == MPX_INVALID_BOUNDS_DIR) ret = -ENXIO; - +out: up_write(&mm->mmap_sem); return ret; } @@ -1030,3 +1039,25 @@ void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, if (ret) force_sig(SIGSEGV, current); } + +/* MPX cannot handle addresses above 47 bits yet. */ +unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len, + unsigned long flags) +{ + if (!kernel_managing_mpx_tables(current->mm)) + return addr; + if (addr + len <= DEFAULT_MAP_WINDOW) + return addr; + if (flags & MAP_FIXED) + return -ENOMEM; + + /* + * Requested len is larger than the whole area we're allowed to map in. + * Resetting hinting address wouldn't do much good -- fail early. + */ + if (len > DEFAULT_MAP_WINDOW) + return -ENOMEM; + + /* Look for unmap area within DEFAULT_MAP_WINDOW */ + return 0; +} diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index a8f90ce3dedf..d805162e6045 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -75,13 +75,15 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei, /* * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr - * to max_addr. The return value is the number of nodes allocated. + * to max_addr. + * + * Returns zero on success or negative on error. */ static int __init split_nodes_interleave(struct numa_meminfo *ei, struct numa_meminfo *pi, u64 addr, u64 max_addr, int nr_nodes) { - nodemask_t physnode_mask = NODE_MASK_NONE; + nodemask_t physnode_mask = numa_nodes_parsed; u64 size; int big; int nid = 0; @@ -116,9 +118,6 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, return -1; } - for (i = 0; i < pi->nr_blks; i++) - node_set(pi->blk[i].nid, physnode_mask); - /* * Continue to fill physical nodes with fake nodes until there is no * memory left on any of them. @@ -200,13 +199,15 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) /* * Sets up fake nodes of `size' interleaved over physical nodes ranging from - * `addr' to `max_addr'. The return value is the number of nodes allocated. + * `addr' to `max_addr'. + * + * Returns zero on success or negative on error. */ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, struct numa_meminfo *pi, u64 addr, u64 max_addr, u64 size) { - nodemask_t physnode_mask = NODE_MASK_NONE; + nodemask_t physnode_mask = numa_nodes_parsed; u64 min_size; int nid = 0; int i, ret; @@ -231,9 +232,6 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, } size &= FAKE_NODE_MIN_HASH_MASK; - for (i = 0; i < pi->nr_blks; i++) - node_set(pi->blk[i].nid, physnode_mask); - /* * Fill physical nodes with fake nodes of size until there is no memory * left on any of them. @@ -280,6 +278,22 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, return 0; } +int __init setup_emu2phys_nid(int *dfl_phys_nid) +{ + int i, max_emu_nid = 0; + + *dfl_phys_nid = NUMA_NO_NODE; + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { + if (emu_nid_to_phys[i] != NUMA_NO_NODE) { + max_emu_nid = i; + if (*dfl_phys_nid == NUMA_NO_NODE) + *dfl_phys_nid = emu_nid_to_phys[i]; + } + } + + return max_emu_nid; +} + /** * numa_emulation - Emulate NUMA nodes * @numa_meminfo: NUMA configuration to massage @@ -376,23 +390,18 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) * Determine the max emulated nid and the default phys nid to use * for unmapped nodes. */ - max_emu_nid = 0; - dfl_phys_nid = NUMA_NO_NODE; - for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { - if (emu_nid_to_phys[i] != NUMA_NO_NODE) { - max_emu_nid = i; - if (dfl_phys_nid == NUMA_NO_NODE) - dfl_phys_nid = emu_nid_to_phys[i]; - } - } - if (dfl_phys_nid == NUMA_NO_NODE) { - pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n"); - goto no_emu; - } + max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid); /* commit */ *numa_meminfo = ei; + /* Make sure numa_nodes_parsed only contains emulated nodes */ + nodes_clear(numa_nodes_parsed); + for (i = 0; i < ARRAY_SIZE(ei.blk); i++) + if (ei.blk[i].start != ei.blk[i].end && + ei.blk[i].nid != NUMA_NO_NODE) + node_set(ei.blk[i].nid, numa_nodes_parsed); + /* * Transform __apicid_to_node table to use emulated nids by * reverse-mapping phys_nid. The maps should always exist but fall diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 757b0bcdf712..dfb7d657cf43 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1775,6 +1775,70 @@ int set_memory_4k(unsigned long addr, int numpages) __pgprot(0), 1, 0, NULL); } +static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) +{ + struct cpa_data cpa; + unsigned long start; + int ret; + + /* Nothing to do if the SME is not active */ + if (!sme_active()) + return 0; + + /* Should not be working on unaligned addresses */ + if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr)) + addr &= PAGE_MASK; + + start = addr; + + memset(&cpa, 0, sizeof(cpa)); + cpa.vaddr = &addr; + cpa.numpages = numpages; + cpa.mask_set = enc ? __pgprot(_PAGE_ENC) : __pgprot(0); + cpa.mask_clr = enc ? __pgprot(0) : __pgprot(_PAGE_ENC); + cpa.pgd = init_mm.pgd; + + /* Must avoid aliasing mappings in the highmem code */ + kmap_flush_unused(); + vm_unmap_aliases(); + + /* + * Before changing the encryption attribute, we need to flush caches. + */ + if (static_cpu_has(X86_FEATURE_CLFLUSH)) + cpa_flush_range(start, numpages, 1); + else + cpa_flush_all(1); + + ret = __change_page_attr_set_clr(&cpa, 1); + + /* + * After changing the encryption attribute, we need to flush TLBs + * again in case any speculative TLB caching occurred (but no need + * to flush caches again). We could just use cpa_flush_all(), but + * in case TLB flushing gets optimized in the cpa_flush_range() + * path use the same logic as above. + */ + if (static_cpu_has(X86_FEATURE_CLFLUSH)) + cpa_flush_range(start, numpages, 0); + else + cpa_flush_all(0); + + return ret; +} + +int set_memory_encrypted(unsigned long addr, int numpages) +{ + return __set_memory_enc_dec(addr, numpages, true); +} +EXPORT_SYMBOL_GPL(set_memory_encrypted); + +int set_memory_decrypted(unsigned long addr, int numpages) +{ + return __set_memory_enc_dec(addr, numpages, false); +} +EXPORT_SYMBOL_GPL(set_memory_decrypted); + int set_pages_uc(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); @@ -2020,6 +2084,9 @@ int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, if (!(page_flags & _PAGE_RW)) cpa.mask_clr = __pgprot(_PAGE_RW); + if (!(page_flags & _PAGE_ENC)) + cpa.mask_clr = pgprot_encrypted(cpa.mask_clr); + cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); retval = __change_page_attr_set_clr(&cpa, 0); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 45979502f64b..fe7d57a8fb60 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -293,7 +293,7 @@ void init_cache_modes(void) * pat_init - Initialize PAT MSR and PAT table * * This function initializes PAT MSR and PAT table with an OS-defined value - * to enable additional cache attributes, WC and WT. + * to enable additional cache attributes, WC, WT and WP. * * This function must be called on all CPUs using the specific sequence of * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this @@ -352,7 +352,7 @@ void pat_init(void) * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS * 011 3 UC : _PAGE_CACHE_MODE_UC * 100 4 WB : Reserved - * 101 5 WC : Reserved + * 101 5 WP : _PAGE_CACHE_MODE_WP * 110 6 UC-: Reserved * 111 7 WT : _PAGE_CACHE_MODE_WT * @@ -360,7 +360,7 @@ void pat_init(void) * corresponding types in the presence of PAT errata. */ pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | - PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, WT); + PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT); } if (!boot_cpu_done) { @@ -744,6 +744,9 @@ EXPORT_SYMBOL(arch_io_free_memtype_wc); pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { + if (!phys_mem_access_encrypted(pfn << PAGE_SHIFT, size)) + vma_prot = pgprot_decrypted(vma_prot); + return vma_prot; } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 508a708eb9a6..218834a3e9ad 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -56,7 +56,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { pgtable_page_dtor(pte); paravirt_release_pte(page_to_pfn(pte)); - tlb_remove_page(tlb, pte); + tlb_remove_table(tlb, pte); } #if CONFIG_PGTABLE_LEVELS > 2 @@ -72,21 +72,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) tlb->need_flush_all = 1; #endif pgtable_pmd_page_dtor(page); - tlb_remove_page(tlb, page); + tlb_remove_table(tlb, page); } #if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); - tlb_remove_page(tlb, virt_to_page(pud)); + tlb_remove_table(tlb, virt_to_page(pud)); } #if CONFIG_PGTABLE_LEVELS > 4 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) { paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); - tlb_remove_page(tlb, virt_to_page(p4d)); + tlb_remove_table(tlb, virt_to_page(p4d)); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 014d07a80053..ce104b962a17 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -28,6 +28,42 @@ * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi */ +atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); + +static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, + u16 *new_asid, bool *need_flush) +{ + u16 asid; + + if (!static_cpu_has(X86_FEATURE_PCID)) { + *new_asid = 0; + *need_flush = true; + return; + } + + for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { + if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) != + next->context.ctx_id) + continue; + + *new_asid = asid; + *need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) < + next_tlb_gen); + return; + } + + /* + * We don't currently own an ASID slot on this CPU. + * Allocate a slot. + */ + *new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1; + if (*new_asid >= TLB_NR_DYN_ASIDS) { + *new_asid = 0; + this_cpu_write(cpu_tlbstate.next_asid, 1); + } + *need_flush = true; +} + void leave_mm(int cpu) { struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); @@ -43,12 +79,11 @@ void leave_mm(int cpu) if (loaded_mm == &init_mm) return; - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) - BUG(); + /* Warn if we're not lazy. */ + WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))); switch_mm(NULL, &init_mm, NULL); } -EXPORT_SYMBOL_GPL(leave_mm); void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) @@ -63,115 +98,219 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { - unsigned cpu = smp_processor_id(); struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); + u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + unsigned cpu = smp_processor_id(); + u64 next_tlb_gen; /* - * NB: The scheduler will call us with prev == next when - * switching from lazy TLB mode to normal mode if active_mm - * isn't changing. When this happens, there is no guarantee - * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next. + * NB: The scheduler will call us with prev == next when switching + * from lazy TLB mode to normal mode if active_mm isn't changing. + * When this happens, we don't assume that CR3 (and hence + * cpu_tlbstate.loaded_mm) matches next. * * NB: leave_mm() calls us with prev == NULL and tsk == NULL. */ - this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); + /* We don't want flush_tlb_func_* to run concurrently with us. */ + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) + WARN_ON_ONCE(!irqs_disabled()); + + /* + * Verify that CR3 is what we think it is. This will catch + * hypothetical buggy code that directly switches to swapper_pg_dir + * without going through leave_mm() / switch_mm_irqs_off() or that + * does something like write_cr3(read_cr3_pa()). + */ + VM_BUG_ON(__read_cr3() != (__sme_pa(real_prev->pgd) | prev_asid)); if (real_prev == next) { - /* - * There's nothing to do: we always keep the per-mm control - * regs in sync with cpu_tlbstate.loaded_mm. Just - * sanity-check mm_cpumask. - */ - if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next)))) - cpumask_set_cpu(cpu, mm_cpumask(next)); - return; - } + VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != + next->context.ctx_id); + + if (cpumask_test_cpu(cpu, mm_cpumask(next))) { + /* + * There's nothing to do: we weren't lazy, and we + * aren't changing our mm. We don't need to flush + * anything, nor do we need to update CR3, CR4, or + * LDTR. + */ + return; + } + + /* Resume remote flushes and then read tlb_gen. */ + cpumask_set_cpu(cpu, mm_cpumask(next)); + next_tlb_gen = atomic64_read(&next->context.tlb_gen); + + if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) < + next_tlb_gen) { + /* + * Ideally, we'd have a flush_tlb() variant that + * takes the known CR3 value as input. This would + * be faster on Xen PV and on hypothetical CPUs + * on which INVPCID is fast. + */ + this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen, + next_tlb_gen); + write_cr3(__sme_pa(next->pgd) | prev_asid); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, + TLB_FLUSH_ALL); + } - if (IS_ENABLED(CONFIG_VMAP_STACK)) { /* - * If our current stack is in vmalloc space and isn't - * mapped in the new pgd, we'll double-fault. Forcibly - * map it. + * We just exited lazy mode, which means that CR4 and/or LDTR + * may be stale. (Changes to the required CR4 and LDTR states + * are not reflected in tlb_gen.) */ - unsigned int stack_pgd_index = pgd_index(current_stack_pointer()); - - pgd_t *pgd = next->pgd + stack_pgd_index; - - if (unlikely(pgd_none(*pgd))) - set_pgd(pgd, init_mm.pgd[stack_pgd_index]); - } + } else { + u16 new_asid; + bool need_flush; + + if (IS_ENABLED(CONFIG_VMAP_STACK)) { + /* + * If our current stack is in vmalloc space and isn't + * mapped in the new pgd, we'll double-fault. Forcibly + * map it. + */ + unsigned int index = pgd_index(current_stack_pointer()); + pgd_t *pgd = next->pgd + index; + + if (unlikely(pgd_none(*pgd))) + set_pgd(pgd, init_mm.pgd[index]); + } - this_cpu_write(cpu_tlbstate.loaded_mm, next); + /* Stop remote flushes for the previous mm */ + if (cpumask_test_cpu(cpu, mm_cpumask(real_prev))) + cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); - WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); - cpumask_set_cpu(cpu, mm_cpumask(next)); + VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); - /* - * Re-load page tables. - * - * This logic has an ordering constraint: - * - * CPU 0: Write to a PTE for 'next' - * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. - * CPU 1: set bit 1 in next's mm_cpumask - * CPU 1: load from the PTE that CPU 0 writes (implicit) - * - * We need to prevent an outcome in which CPU 1 observes - * the new PTE value and CPU 0 observes bit 1 clear in - * mm_cpumask. (If that occurs, then the IPI will never - * be sent, and CPU 0's TLB will contain a stale entry.) - * - * The bad outcome can occur if either CPU's load is - * reordered before that CPU's store, so both CPUs must - * execute full barriers to prevent this from happening. - * - * Thus, switch_mm needs a full barrier between the - * store to mm_cpumask and any operation that could load - * from next->pgd. TLB fills are special and can happen - * due to instruction fetches or for no reason at all, - * and neither LOCK nor MFENCE orders them. - * Fortunately, load_cr3() is serializing and gives the - * ordering guarantee we need. - */ - load_cr3(next->pgd); - - /* - * This gets called via leave_mm() in the idle path where RCU - * functions differently. Tracing normally uses RCU, so we have to - * call the tracepoint specially here. - */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + /* + * Start remote flushes and then read tlb_gen. + */ + cpumask_set_cpu(cpu, mm_cpumask(next)); + next_tlb_gen = atomic64_read(&next->context.tlb_gen); + + choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); + + if (need_flush) { + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); + write_cr3(__sme_pa(next->pgd) | new_asid); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, + TLB_FLUSH_ALL); + } else { + /* The new ASID is already up to date. */ + write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); + } - /* Stop flush ipis for the previous mm */ - WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) && - real_prev != &init_mm); - cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); + this_cpu_write(cpu_tlbstate.loaded_mm, next); + this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); + } - /* Load per-mm CR4 and LDTR state */ load_mm_cr4(next); switch_ldt(real_prev, next); } +/* + * flush_tlb_func_common()'s memory ordering requirement is that any + * TLB fills that happen after we flush the TLB are ordered after we + * read active_mm's tlb_gen. We don't need any explicit barriers + * because all x86 flush operations are serializing and the + * atomic64_read operation won't be reordered by the compiler. + */ static void flush_tlb_func_common(const struct flush_tlb_info *f, bool local, enum tlb_flush_reason reason) { + /* + * We have three different tlb_gen values in here. They are: + * + * - mm_tlb_gen: the latest generation. + * - local_tlb_gen: the generation that this CPU has already caught + * up to. + * - f->new_tlb_gen: the generation that the requester of the flush + * wants us to catch up to. + */ + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen); + u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); + /* This code cannot presently handle being reentered. */ VM_WARN_ON(!irqs_disabled()); - if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) { - leave_mm(smp_processor_id()); + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != + loaded_mm->context.ctx_id); + + if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) { + /* + * We're in lazy mode -- don't flush. We can get here on + * remote flushes due to races and on local flushes if a + * kernel thread coincidentally flushes the mm it's lazily + * still using. + */ return; } - if (f->end == TLB_FLUSH_ALL) { - local_flush_tlb(); - if (local) - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - trace_tlb_flush(reason, TLB_FLUSH_ALL); - } else { + if (unlikely(local_tlb_gen == mm_tlb_gen)) { + /* + * There's nothing to do: we're already up to date. This can + * happen if two concurrent flushes happen -- the first flush to + * be handled can catch us all the way up, leaving no work for + * the second flush. + */ + trace_tlb_flush(reason, 0); + return; + } + + WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen); + WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen); + + /* + * If we get to this point, we know that our TLB is out of date. + * This does not strictly imply that we need to flush (it's + * possible that f->new_tlb_gen <= local_tlb_gen), but we're + * going to need to flush in the very near future, so we might + * as well get it over with. + * + * The only question is whether to do a full or partial flush. + * + * We do a partial flush if requested and two extra conditions + * are met: + * + * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that + * we've always done all needed flushes to catch up to + * local_tlb_gen. If, for example, local_tlb_gen == 2 and + * f->new_tlb_gen == 3, then we know that the flush needed to bring + * us up to date for tlb_gen 3 is the partial flush we're + * processing. + * + * As an example of why this check is needed, suppose that there + * are two concurrent flushes. The first is a full flush that + * changes context.tlb_gen from 1 to 2. The second is a partial + * flush that changes context.tlb_gen from 2 to 3. If they get + * processed on this CPU in reverse order, we'll see + * local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL. + * If we were to use __flush_tlb_single() and set local_tlb_gen to + * 3, we'd be break the invariant: we'd update local_tlb_gen above + * 1 without the full flush that's needed for tlb_gen 2. + * + * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimiation. + * Partial TLB flushes are not all that much cheaper than full TLB + * flushes, so it seems unlikely that it would be a performance win + * to do a partial flush if that won't bring our TLB fully up to + * date. By doing a full flush instead, we can increase + * local_tlb_gen all the way to mm_tlb_gen and we can probably + * avoid another flush in the very near future. + */ + if (f->end != TLB_FLUSH_ALL && + f->new_tlb_gen == local_tlb_gen + 1 && + f->new_tlb_gen == mm_tlb_gen) { + /* Partial flush */ unsigned long addr; unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT; + addr = f->start; while (addr < f->end) { __flush_tlb_single(addr); @@ -180,7 +319,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, if (local) count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages); trace_tlb_flush(reason, nr_pages); + } else { + /* Full flush. */ + local_flush_tlb(); + if (local) + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + trace_tlb_flush(reason, TLB_FLUSH_ALL); } + + /* Both paths above update our state to mm_tlb_gen. */ + this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen); } static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason) @@ -214,6 +362,21 @@ void native_flush_tlb_others(const struct cpumask *cpumask, (info->end - info->start) >> PAGE_SHIFT); if (is_uv_system()) { + /* + * This whole special case is confused. UV has a "Broadcast + * Assist Unit", which seems to be a fancy way to send IPIs. + * Back when x86 used an explicit TLB flush IPI, UV was + * optimized to use its own mechanism. These days, x86 uses + * smp_call_function_many(), but UV still uses a manual IPI, + * and that IPI's action is out of date -- it does a manual + * flush instead of calling flush_tlb_func_remote(). This + * means that the percpu tlb_gen variables won't be updated + * and we'll do pointless flushes on future context switches. + * + * Rather than hooking native_flush_tlb_others() here, I think + * that UV should be updated so that smp_call_function_many(), + * etc, are optimal on UV. + */ unsigned int cpu; cpu = smp_processor_id(); @@ -250,8 +413,8 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, cpu = get_cpu(); - /* Synchronize with switch_mm. */ - smp_mb(); + /* This is also a barrier that synchronizes with switch_mm(). */ + info.new_tlb_gen = inc_mm_tlb_gen(mm); /* Should we flush just the requested range? */ if ((end != TLB_FLUSH_ALL) && @@ -273,6 +436,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) flush_tlb_others(mm_cpumask(mm), &info); + put_cpu(); } @@ -281,8 +445,6 @@ static void do_flush_tlb_all(void *info) { count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); __flush_tlb_all(); - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) - leave_mm(smp_processor_id()); } void flush_tlb_all(void) @@ -335,6 +497,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) flush_tlb_others(&batch->cpumask, &info); + cpumask_clear(&batch->cpumask); put_cpu(); |