diff options
Diffstat (limited to 'arch/x86/mm')
27 files changed, 889 insertions, 1444 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 7ba7f3d7f477..27e9e90a8d35 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -10,7 +10,7 @@ CFLAGS_REMOVE_mem_encrypt.o = -pg endif obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o pgtable.o physaddr.o setup_nx.o tlb.o + pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o # Make sure __phys_addr has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) @@ -29,8 +29,6 @@ obj-$(CONFIG_X86_PTDUMP) += debug_pagetables.o obj-$(CONFIG_HIGHMEM) += highmem_32.o -obj-$(CONFIG_KMEMCHECK) += kmemcheck/ - KASAN_SANITIZE_kasan_init_$(BITS).o := n obj-$(CONFIG_KASAN) += kasan_init_$(BITS).o @@ -43,9 +41,10 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o obj-$(CONFIG_ACPI_NUMA) += srat.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o -obj-$(CONFIG_X86_INTEL_MPX) += mpx.o -obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o -obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o +obj-$(CONFIG_X86_INTEL_MPX) += mpx.o +obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o +obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o +obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c new file mode 100644 index 000000000000..b9283cc27622 --- /dev/null +++ b/arch/x86/mm/cpu_entry_area.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/spinlock.h> +#include <linux/percpu.h> + +#include <asm/cpu_entry_area.h> +#include <asm/pgtable.h> +#include <asm/fixmap.h> +#include <asm/desc.h> + +static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage); + +#ifdef CONFIG_X86_64 +static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); +#endif + +struct cpu_entry_area *get_cpu_entry_area(int cpu) +{ + unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE; + BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0); + + return (struct cpu_entry_area *) va; +} +EXPORT_SYMBOL(get_cpu_entry_area); + +void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags) +{ + unsigned long va = (unsigned long) cea_vaddr; + + set_pte_vaddr(va, pfn_pte(pa >> PAGE_SHIFT, flags)); +} + +static void __init +cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot) +{ + for ( ; pages; pages--, cea_vaddr+= PAGE_SIZE, ptr += PAGE_SIZE) + cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); +} + +static void percpu_setup_debug_store(int cpu) +{ +#ifdef CONFIG_CPU_SUP_INTEL + int npages; + void *cea; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return; + + cea = &get_cpu_entry_area(cpu)->cpu_debug_store; + npages = sizeof(struct debug_store) / PAGE_SIZE; + BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0); + cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages, + PAGE_KERNEL); + + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers; + /* + * Force the population of PMDs for not yet allocated per cpu + * memory like debug store buffers. + */ + npages = sizeof(struct debug_store_buffers) / PAGE_SIZE; + for (; npages; npages--, cea += PAGE_SIZE) + cea_set_pte(cea, 0, PAGE_NONE); +#endif +} + +/* Setup the fixmap mappings only once per-processor */ +static void __init setup_cpu_entry_area(int cpu) +{ +#ifdef CONFIG_X86_64 + extern char _entry_trampoline[]; + + /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */ + pgprot_t gdt_prot = PAGE_KERNEL_RO; + pgprot_t tss_prot = PAGE_KERNEL_RO; +#else + /* + * On native 32-bit systems, the GDT cannot be read-only because + * our double fault handler uses a task gate, and entering through + * a task gate needs to change an available TSS to busy. If the + * GDT is read-only, that will triple fault. The TSS cannot be + * read-only because the CPU writes to it on task switches. + * + * On Xen PV, the GDT must be read-only because the hypervisor + * requires it. + */ + pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? + PAGE_KERNEL_RO : PAGE_KERNEL; + pgprot_t tss_prot = PAGE_KERNEL; +#endif + + cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu), + gdt_prot); + + cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page, + per_cpu_ptr(&entry_stack_storage, cpu), 1, + PAGE_KERNEL); + + /* + * The Intel SDM says (Volume 3, 7.2.1): + * + * Avoid placing a page boundary in the part of the TSS that the + * processor reads during a task switch (the first 104 bytes). The + * processor may not correctly perform address translations if a + * boundary occurs in this area. During a task switch, the processor + * reads and writes into the first 104 bytes of each TSS (using + * contiguous physical addresses beginning with the physical address + * of the first byte of the TSS). So, after TSS access begins, if + * part of the 104 bytes is not physically contiguous, the processor + * will access incorrect information without generating a page-fault + * exception. + * + * There are also a lot of errata involving the TSS spanning a page + * boundary. Assert that we're not doing that. + */ + BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ + offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); + BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); + cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss, + &per_cpu(cpu_tss_rw, cpu), + sizeof(struct tss_struct) / PAGE_SIZE, tss_prot); + +#ifdef CONFIG_X86_32 + per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); +#endif + +#ifdef CONFIG_X86_64 + BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); + BUILD_BUG_ON(sizeof(exception_stacks) != + sizeof(((struct cpu_entry_area *)0)->exception_stacks)); + cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks, + &per_cpu(exception_stacks, cpu), + sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL); + + cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline, + __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); +#endif + percpu_setup_debug_store(cpu); +} + +static __init void setup_cpu_entry_area_ptes(void) +{ +#ifdef CONFIG_X86_32 + unsigned long start, end; + + BUILD_BUG_ON(CPU_ENTRY_AREA_PAGES * PAGE_SIZE < CPU_ENTRY_AREA_MAP_SIZE); + BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK); + + start = CPU_ENTRY_AREA_BASE; + end = start + CPU_ENTRY_AREA_MAP_SIZE; + + /* Careful here: start + PMD_SIZE might wrap around */ + for (; start < end && start >= CPU_ENTRY_AREA_BASE; start += PMD_SIZE) + populate_extra_pte(start); +#endif +} + +void __init setup_cpu_entry_areas(void) +{ + unsigned int cpu; + + setup_cpu_entry_area_ptes(); + + for_each_possible_cpu(cpu) + setup_cpu_entry_area(cpu); +} diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c index bfcffdf6c577..421f2664ffa0 100644 --- a/arch/x86/mm/debug_pagetables.c +++ b/arch/x86/mm/debug_pagetables.c @@ -5,7 +5,7 @@ static int ptdump_show(struct seq_file *m, void *v) { - ptdump_walk_pgd_level(m, NULL); + ptdump_walk_pgd_level_debugfs(m, NULL, false); return 0; } @@ -22,21 +22,89 @@ static const struct file_operations ptdump_fops = { .release = single_release, }; -static struct dentry *pe; +static int ptdump_show_curknl(struct seq_file *m, void *v) +{ + if (current->mm->pgd) { + down_read(¤t->mm->mmap_sem); + ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false); + up_read(¤t->mm->mmap_sem); + } + return 0; +} + +static int ptdump_open_curknl(struct inode *inode, struct file *filp) +{ + return single_open(filp, ptdump_show_curknl, NULL); +} + +static const struct file_operations ptdump_curknl_fops = { + .owner = THIS_MODULE, + .open = ptdump_open_curknl, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +static struct dentry *pe_curusr; + +static int ptdump_show_curusr(struct seq_file *m, void *v) +{ + if (current->mm->pgd) { + down_read(¤t->mm->mmap_sem); + ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true); + up_read(¤t->mm->mmap_sem); + } + return 0; +} + +static int ptdump_open_curusr(struct inode *inode, struct file *filp) +{ + return single_open(filp, ptdump_show_curusr, NULL); +} + +static const struct file_operations ptdump_curusr_fops = { + .owner = THIS_MODULE, + .open = ptdump_open_curusr, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + +static struct dentry *dir, *pe_knl, *pe_curknl; static int __init pt_dump_debug_init(void) { - pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL, - &ptdump_fops); - if (!pe) + dir = debugfs_create_dir("page_tables", NULL); + if (!dir) return -ENOMEM; + pe_knl = debugfs_create_file("kernel", 0400, dir, NULL, + &ptdump_fops); + if (!pe_knl) + goto err; + + pe_curknl = debugfs_create_file("current_kernel", 0400, + dir, NULL, &ptdump_curknl_fops); + if (!pe_curknl) + goto err; + +#ifdef CONFIG_PAGE_TABLE_ISOLATION + pe_curusr = debugfs_create_file("current_user", 0400, + dir, NULL, &ptdump_curusr_fops); + if (!pe_curusr) + goto err; +#endif return 0; +err: + debugfs_remove_recursive(dir); + return -ENOMEM; } static void __exit pt_dump_debug_exit(void) { - debugfs_remove_recursive(pe); + debugfs_remove_recursive(dir); } module_init(pt_dump_debug_init); diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 5e3ac6fe6c9e..f56902c1f04b 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -44,68 +44,97 @@ struct addr_marker { unsigned long max_lines; }; -/* indices for address_markers; keep sync'd w/ address_markers below */ +/* Address space markers hints */ + +#ifdef CONFIG_X86_64 + enum address_markers_idx { USER_SPACE_NR = 0, -#ifdef CONFIG_X86_64 KERNEL_SPACE_NR, LOW_KERNEL_NR, +#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL) + LDT_NR, +#endif VMALLOC_START_NR, VMEMMAP_START_NR, #ifdef CONFIG_KASAN KASAN_SHADOW_START_NR, KASAN_SHADOW_END_NR, #endif -# ifdef CONFIG_X86_ESPFIX64 +#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL) + LDT_NR, +#endif + CPU_ENTRY_AREA_NR, +#ifdef CONFIG_X86_ESPFIX64 ESPFIX_START_NR, -# endif +#endif +#ifdef CONFIG_EFI + EFI_END_NR, +#endif HIGH_KERNEL_NR, MODULES_VADDR_NR, MODULES_END_NR, -#else + FIXADDR_START_NR, + END_OF_SPACE_NR, +}; + +static struct addr_marker address_markers[] = { + [USER_SPACE_NR] = { 0, "User Space" }, + [KERNEL_SPACE_NR] = { (1UL << 63), "Kernel Space" }, + [LOW_KERNEL_NR] = { 0UL, "Low Kernel Mapping" }, + [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" }, + [VMEMMAP_START_NR] = { 0UL, "Vmemmap" }, +#ifdef CONFIG_KASAN + [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" }, + [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" }, +#endif +#ifdef CONFIG_MODIFY_LDT_SYSCALL + [LDT_NR] = { LDT_BASE_ADDR, "LDT remap" }, +#endif + [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" }, +#ifdef CONFIG_X86_ESPFIX64 + [ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, +#endif +#ifdef CONFIG_EFI + [EFI_END_NR] = { EFI_VA_END, "EFI Runtime Services" }, +#endif + [HIGH_KERNEL_NR] = { __START_KERNEL_map, "High Kernel Mapping" }, + [MODULES_VADDR_NR] = { MODULES_VADDR, "Modules" }, + [MODULES_END_NR] = { MODULES_END, "End Modules" }, + [FIXADDR_START_NR] = { FIXADDR_START, "Fixmap Area" }, + [END_OF_SPACE_NR] = { -1, NULL } +}; + +#else /* CONFIG_X86_64 */ + +enum address_markers_idx { + USER_SPACE_NR = 0, KERNEL_SPACE_NR, VMALLOC_START_NR, VMALLOC_END_NR, -# ifdef CONFIG_HIGHMEM +#ifdef CONFIG_HIGHMEM PKMAP_BASE_NR, -# endif - FIXADDR_START_NR, #endif + CPU_ENTRY_AREA_NR, + FIXADDR_START_NR, + END_OF_SPACE_NR, }; -/* Address space markers hints */ static struct addr_marker address_markers[] = { - { 0, "User Space" }, -#ifdef CONFIG_X86_64 - { 0x8000000000000000UL, "Kernel Space" }, - { 0/* PAGE_OFFSET */, "Low Kernel Mapping" }, - { 0/* VMALLOC_START */, "vmalloc() Area" }, - { 0/* VMEMMAP_START */, "Vmemmap" }, -#ifdef CONFIG_KASAN - { KASAN_SHADOW_START, "KASAN shadow" }, - { KASAN_SHADOW_END, "KASAN shadow end" }, + [USER_SPACE_NR] = { 0, "User Space" }, + [KERNEL_SPACE_NR] = { PAGE_OFFSET, "Kernel Mapping" }, + [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" }, + [VMALLOC_END_NR] = { 0UL, "vmalloc() End" }, +#ifdef CONFIG_HIGHMEM + [PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" }, #endif -# ifdef CONFIG_X86_ESPFIX64 - { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, -# endif -# ifdef CONFIG_EFI - { EFI_VA_END, "EFI Runtime Services" }, -# endif - { __START_KERNEL_map, "High Kernel Mapping" }, - { MODULES_VADDR, "Modules" }, - { MODULES_END, "End Modules" }, -#else - { PAGE_OFFSET, "Kernel Mapping" }, - { 0/* VMALLOC_START */, "vmalloc() Area" }, - { 0/*VMALLOC_END*/, "vmalloc() End" }, -# ifdef CONFIG_HIGHMEM - { 0/*PKMAP_BASE*/, "Persistent kmap() Area" }, -# endif - { 0/*FIXADDR_START*/, "Fixmap Area" }, -#endif - { -1, NULL } /* End of list */ + [CPU_ENTRY_AREA_NR] = { 0UL, "CPU entry area" }, + [FIXADDR_START_NR] = { 0UL, "Fixmap area" }, + [END_OF_SPACE_NR] = { -1, NULL } }; +#endif /* !CONFIG_X86_64 */ + /* Multipliers for offsets within the PTEs */ #define PTE_LEVEL_MULT (PAGE_SIZE) #define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) @@ -140,7 +169,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) static const char * const level_name[] = { "cr3", "pgd", "p4d", "pud", "pmd", "pte" }; - if (!pgprot_val(prot)) { + if (!(pr & _PAGE_PRESENT)) { /* Not present */ pt_dump_cont_printf(m, dmsg, " "); } else { @@ -447,7 +476,7 @@ static inline bool is_hypervisor_range(int idx) } static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, - bool checkwx) + bool checkwx, bool dmesg) { #ifdef CONFIG_X86_64 pgd_t *start = (pgd_t *) &init_top_pgt; @@ -460,7 +489,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, if (pgd) { start = pgd; - st.to_dmesg = true; + st.to_dmesg = dmesg; } st.check_wx = checkwx; @@ -498,13 +527,37 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) { - ptdump_walk_pgd_level_core(m, pgd, false); + ptdump_walk_pgd_level_core(m, pgd, false, true); +} + +void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + if (user && static_cpu_has(X86_FEATURE_PTI)) + pgd = kernel_to_user_pgdp(pgd); +#endif + ptdump_walk_pgd_level_core(m, pgd, false, false); +} +EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs); + +static void ptdump_walk_user_pgd_level_checkwx(void) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + pgd_t *pgd = (pgd_t *) &init_top_pgt; + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + pr_info("x86/mm: Checking user space page tables\n"); + pgd = kernel_to_user_pgdp(pgd); + ptdump_walk_pgd_level_core(NULL, pgd, true, false); +#endif } -EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level); void ptdump_walk_pgd_level_checkwx(void) { - ptdump_walk_pgd_level_core(NULL, NULL, true); + ptdump_walk_pgd_level_core(NULL, NULL, true, false); + ptdump_walk_user_pgd_level_checkwx(); } static int __init pt_dump_init(void) @@ -525,8 +578,8 @@ static int __init pt_dump_init(void) address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE; # endif address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; + address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE; #endif - return 0; } __initcall(pt_dump_init); diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 88754bfd425f..9fe656c42aa5 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -83,7 +83,7 @@ bool ex_handler_refcount(const struct exception_table_entry *fixup, return true; } -EXPORT_SYMBOL_GPL(ex_handler_refcount); +EXPORT_SYMBOL(ex_handler_refcount); /* * Handler for when we fail to restore a task's FPU state. We should never get diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 3109ba6c6ede..06fe3d51d385 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -20,7 +20,6 @@ #include <asm/cpufeature.h> /* boot_cpu_has, ... */ #include <asm/traps.h> /* dotraplinkage, ... */ #include <asm/pgalloc.h> /* pgd_*(), ... */ -#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ #include <asm/fixmap.h> /* VSYSCALL_ADDR */ #include <asm/vsyscall.h> /* emulate_vsyscall */ #include <asm/vm86.h> /* struct vm86 */ @@ -702,7 +701,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, else printk(KERN_CONT "paging request"); - printk(KERN_CONT " at %p\n", (void *) address); + printk(KERN_CONT " at %px\n", (void *) address); printk(KERN_ALERT "IP: %pS\n", (void *)regs->ip); dump_pagetable(address); @@ -861,7 +860,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, if (!printk_ratelimit()) return; - printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx", + printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx", task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, tsk->comm, task_pid_nr(tsk), address, (void *)regs->ip, (void *)regs->sp, error_code); @@ -1256,8 +1255,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * Detect and handle instructions that would cause a page fault for * both a tracked kernel page and a userspace page. */ - if (kmemcheck_active(regs)) - kmemcheck_hide(regs); prefetchw(&mm->mmap_sem); if (unlikely(kmmio_fault(regs, address))) @@ -1280,9 +1277,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { if (vmalloc_fault(address) >= 0) return; - - if (kmemcheck_fault(regs, address, error_code)) - return; } /* Can handle a stale RO->RW TLB: */ diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index a22c2b95e513..8ca324d07282 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -20,6 +20,7 @@ #include <asm/kaslr.h> #include <asm/hypervisor.h> #include <asm/cpufeature.h> +#include <asm/pti.h> /* * We need to define the tracepoints somewhere, and tlb.c @@ -92,8 +93,7 @@ __ref void *alloc_low_pages(unsigned int num) unsigned int order; order = get_order((unsigned long)num << PAGE_SHIFT); - return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK | - __GFP_ZERO, order); + return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order); } if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) { @@ -161,15 +161,20 @@ struct map_range { static int page_size_mask; +static void enable_global_pages(void) +{ + if (!static_cpu_has(X86_FEATURE_PTI)) + __supported_pte_mask |= _PAGE_GLOBAL; +} + static void __init probe_page_size_mask(void) { /* - * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will - * use small pages. + * For pagealloc debugging, identity mapping will use small pages. * This will simplify cpa(), which otherwise needs to support splitting * large pages into small in interrupt context, etc. */ - if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled() && !IS_ENABLED(CONFIG_KMEMCHECK)) + if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled()) page_size_mask |= 1 << PG_LEVEL_2M; else direct_gbpages = 0; @@ -179,11 +184,11 @@ static void __init probe_page_size_mask(void) cr4_set_bits_and_update_boot(X86_CR4_PSE); /* Enable PGE if available */ + __supported_pte_mask &= ~_PAGE_GLOBAL; if (boot_cpu_has(X86_FEATURE_PGE)) { cr4_set_bits_and_update_boot(X86_CR4_PGE); - __supported_pte_mask |= _PAGE_GLOBAL; - } else - __supported_pte_mask &= ~_PAGE_GLOBAL; + enable_global_pages(); + } /* Enable 1 GB linear kernel mappings if available: */ if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) { @@ -196,34 +201,44 @@ static void __init probe_page_size_mask(void) static void setup_pcid(void) { -#ifdef CONFIG_X86_64 - if (boot_cpu_has(X86_FEATURE_PCID)) { - if (boot_cpu_has(X86_FEATURE_PGE)) { - /* - * This can't be cr4_set_bits_and_update_boot() -- - * the trampoline code can't handle CR4.PCIDE and - * it wouldn't do any good anyway. Despite the name, - * cr4_set_bits_and_update_boot() doesn't actually - * cause the bits in question to remain set all the - * way through the secondary boot asm. - * - * Instead, we brute-force it and set CR4.PCIDE - * manually in start_secondary(). - */ - cr4_set_bits(X86_CR4_PCIDE); - } else { - /* - * flush_tlb_all(), as currently implemented, won't - * work if PCID is on but PGE is not. Since that - * combination doesn't exist on real hardware, there's - * no reason to try to fully support it, but it's - * polite to avoid corrupting data if we're on - * an improperly configured VM. - */ - setup_clear_cpu_cap(X86_FEATURE_PCID); - } + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + if (!boot_cpu_has(X86_FEATURE_PCID)) + return; + + if (boot_cpu_has(X86_FEATURE_PGE)) { + /* + * This can't be cr4_set_bits_and_update_boot() -- the + * trampoline code can't handle CR4.PCIDE and it wouldn't + * do any good anyway. Despite the name, + * cr4_set_bits_and_update_boot() doesn't actually cause + * the bits in question to remain set all the way through + * the secondary boot asm. + * + * Instead, we brute-force it and set CR4.PCIDE manually in + * start_secondary(). + */ + cr4_set_bits(X86_CR4_PCIDE); + + /* + * INVPCID's single-context modes (2/3) only work if we set + * X86_CR4_PCIDE, *and* we INVPCID support. It's unusable + * on systems that have X86_CR4_PCIDE clear, or that have + * no INVPCID support at all. + */ + if (boot_cpu_has(X86_FEATURE_INVPCID)) + setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE); + } else { + /* + * flush_tlb_all(), as currently implemented, won't work if + * PCID is on but PGE is not. Since that combination + * doesn't exist on real hardware, there's no reason to try + * to fully support it, but it's polite to avoid corrupting + * data if we're on an improperly configured VM. + */ + setup_clear_cpu_cap(X86_FEATURE_PCID); } -#endif } #ifdef CONFIG_X86_32 @@ -624,6 +639,7 @@ void __init init_mem_mapping(void) { unsigned long end; + pti_check_boottime_disable(); probe_page_size_mask(); setup_pcid(); @@ -847,7 +863,7 @@ void __init zone_sizes_init(void) free_area_init_nodes(max_zone_pfns); } -DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { +__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { .loaded_mm = &init_mm, .next_asid = 1, .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8a64a6f2848d..135c9a7898c7 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -50,6 +50,7 @@ #include <asm/setup.h> #include <asm/set_memory.h> #include <asm/page_types.h> +#include <asm/cpu_entry_area.h> #include <asm/init.h> #include "mm_internal.h" @@ -766,6 +767,7 @@ void __init mem_init(void) mem_init_print_info(NULL); printk(KERN_INFO "virtual kernel memory layout:\n" " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" + " cpu_entry : 0x%08lx - 0x%08lx (%4ld kB)\n" #ifdef CONFIG_HIGHMEM " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" #endif @@ -777,6 +779,10 @@ void __init mem_init(void) FIXADDR_START, FIXADDR_TOP, (FIXADDR_TOP - FIXADDR_START) >> 10, + CPU_ENTRY_AREA_BASE, + CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE, + CPU_ENTRY_AREA_MAP_SIZE >> 10, + #ifdef CONFIG_HIGHMEM PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, (LAST_PKMAP*PAGE_SIZE) >> 10, diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index adcea90a2046..4a837289f2ad 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -184,7 +184,7 @@ static __ref void *spp_getpage(void) void *ptr; if (after_bootmem) - ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); + ptr = (void *) get_zeroed_page(GFP_ATOMIC); else ptr = alloc_bootmem_pages(PAGE_SIZE); @@ -1173,12 +1173,18 @@ void __init mem_init(void) /* clear_bss() already clear the empty_zero_page */ - register_page_bootmem_info(); - /* this will put all memory onto the freelists */ free_all_bootmem(); after_bootmem = 1; + /* + * Must be done after boot memory is put on freelist, because here we + * might set fields in deferred struct pages that have not yet been + * initialized, and free_all_bootmem() initializes all the reserved + * deferred pages for us. + */ + register_page_bootmem_info(); + /* Register memory areas for /proc/kcore */ kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_OTHER); @@ -1399,7 +1405,6 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, vmemmap_verify((pte_t *)pmd, node, addr, next); continue; } - pr_warn_once("vmemmap: falling back to regular page backing\n"); if (vmemmap_populate_basepages(addr, next, node)) return -ENOMEM; } diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 99dfed6dfef8..47388f0c0e59 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -15,6 +15,7 @@ #include <asm/tlbflush.h> #include <asm/sections.h> #include <asm/pgtable.h> +#include <asm/cpu_entry_area.h> extern struct range pfn_mapped[E820_MAX_ENTRIES]; @@ -277,6 +278,7 @@ void __init kasan_early_init(void) void __init kasan_init(void) { int i; + void *shadow_cpu_entry_begin, *shadow_cpu_entry_end; #ifdef CONFIG_KASAN_INLINE register_die_notifier(&kasan_die_notifier); @@ -321,16 +323,33 @@ void __init kasan_init(void) map_range(&pfn_mapped[i]); } + shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE; + shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin); + shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin, + PAGE_SIZE); + + shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE + + CPU_ENTRY_AREA_MAP_SIZE); + shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end); + shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end, + PAGE_SIZE); + kasan_populate_zero_shadow( kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), - kasan_mem_to_shadow((void *)__START_KERNEL_map)); + shadow_cpu_entry_begin); + + kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin, + (unsigned long)shadow_cpu_entry_end, 0); + + kasan_populate_zero_shadow(shadow_cpu_entry_end, + kasan_mem_to_shadow((void *)__START_KERNEL_map)); kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext), (unsigned long)kasan_mem_to_shadow(_end), early_pfn_to_nid(__pa(_stext))); kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), - (void *)KASAN_SHADOW_END); + (void *)KASAN_SHADOW_END); load_cr3(init_top_pgt); __flush_tlb_all(); diff --git a/arch/x86/mm/kmemcheck/Makefile b/arch/x86/mm/kmemcheck/Makefile deleted file mode 100644 index 520b3bce4095..000000000000 --- a/arch/x86/mm/kmemcheck/Makefile +++ /dev/null @@ -1 +0,0 @@ -obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c deleted file mode 100644 index 872ec4159a68..000000000000 --- a/arch/x86/mm/kmemcheck/error.c +++ /dev/null @@ -1,228 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/interrupt.h> -#include <linux/kdebug.h> -#include <linux/kmemcheck.h> -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/ptrace.h> -#include <linux/stacktrace.h> -#include <linux/string.h> - -#include "error.h" -#include "shadow.h" - -enum kmemcheck_error_type { - KMEMCHECK_ERROR_INVALID_ACCESS, - KMEMCHECK_ERROR_BUG, -}; - -#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT) - -struct kmemcheck_error { - enum kmemcheck_error_type type; - - union { - /* KMEMCHECK_ERROR_INVALID_ACCESS */ - struct { - /* Kind of access that caused the error */ - enum kmemcheck_shadow state; - /* Address and size of the erroneous read */ - unsigned long address; - unsigned int size; - }; - }; - - struct pt_regs regs; - struct stack_trace trace; - unsigned long trace_entries[32]; - - /* We compress it to a char. */ - unsigned char shadow_copy[SHADOW_COPY_SIZE]; - unsigned char memory_copy[SHADOW_COPY_SIZE]; -}; - -/* - * Create a ring queue of errors to output. We can't call printk() directly - * from the kmemcheck traps, since this may call the console drivers and - * result in a recursive fault. - */ -static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE]; -static unsigned int error_count; -static unsigned int error_rd; -static unsigned int error_wr; -static unsigned int error_missed_count; - -static struct kmemcheck_error *error_next_wr(void) -{ - struct kmemcheck_error *e; - - if (error_count == ARRAY_SIZE(error_fifo)) { - ++error_missed_count; - return NULL; - } - - e = &error_fifo[error_wr]; - if (++error_wr == ARRAY_SIZE(error_fifo)) - error_wr = 0; - ++error_count; - return e; -} - -static struct kmemcheck_error *error_next_rd(void) -{ - struct kmemcheck_error *e; - - if (error_count == 0) - return NULL; - - e = &error_fifo[error_rd]; - if (++error_rd == ARRAY_SIZE(error_fifo)) - error_rd = 0; - --error_count; - return e; -} - -void kmemcheck_error_recall(void) -{ - static const char *desc[] = { - [KMEMCHECK_SHADOW_UNALLOCATED] = "unallocated", - [KMEMCHECK_SHADOW_UNINITIALIZED] = "uninitialized", - [KMEMCHECK_SHADOW_INITIALIZED] = "initialized", - [KMEMCHECK_SHADOW_FREED] = "freed", - }; - - static const char short_desc[] = { - [KMEMCHECK_SHADOW_UNALLOCATED] = 'a', - [KMEMCHECK_SHADOW_UNINITIALIZED] = 'u', - [KMEMCHECK_SHADOW_INITIALIZED] = 'i', - [KMEMCHECK_SHADOW_FREED] = 'f', - }; - - struct kmemcheck_error *e; - unsigned int i; - - e = error_next_rd(); - if (!e) - return; - - switch (e->type) { - case KMEMCHECK_ERROR_INVALID_ACCESS: - printk(KERN_WARNING "WARNING: kmemcheck: Caught %d-bit read from %s memory (%p)\n", - 8 * e->size, e->state < ARRAY_SIZE(desc) ? - desc[e->state] : "(invalid shadow state)", - (void *) e->address); - - printk(KERN_WARNING); - for (i = 0; i < SHADOW_COPY_SIZE; ++i) - printk(KERN_CONT "%02x", e->memory_copy[i]); - printk(KERN_CONT "\n"); - - printk(KERN_WARNING); - for (i = 0; i < SHADOW_COPY_SIZE; ++i) { - if (e->shadow_copy[i] < ARRAY_SIZE(short_desc)) - printk(KERN_CONT " %c", short_desc[e->shadow_copy[i]]); - else - printk(KERN_CONT " ?"); - } - printk(KERN_CONT "\n"); - printk(KERN_WARNING "%*c\n", 2 + 2 - * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^'); - break; - case KMEMCHECK_ERROR_BUG: - printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n"); - break; - } - - __show_regs(&e->regs, 1); - print_stack_trace(&e->trace, 0); -} - -static void do_wakeup(unsigned long data) -{ - while (error_count > 0) - kmemcheck_error_recall(); - - if (error_missed_count > 0) { - printk(KERN_WARNING "kmemcheck: Lost %d error reports because " - "the queue was too small\n", error_missed_count); - error_missed_count = 0; - } -} - -static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0); - -/* - * Save the context of an error report. - */ -void kmemcheck_error_save(enum kmemcheck_shadow state, - unsigned long address, unsigned int size, struct pt_regs *regs) -{ - static unsigned long prev_ip; - - struct kmemcheck_error *e; - void *shadow_copy; - void *memory_copy; - - /* Don't report several adjacent errors from the same EIP. */ - if (regs->ip == prev_ip) - return; - prev_ip = regs->ip; - - e = error_next_wr(); - if (!e) - return; - - e->type = KMEMCHECK_ERROR_INVALID_ACCESS; - - e->state = state; - e->address = address; - e->size = size; - - /* Save regs */ - memcpy(&e->regs, regs, sizeof(*regs)); - - /* Save stack trace */ - e->trace.nr_entries = 0; - e->trace.entries = e->trace_entries; - e->trace.max_entries = ARRAY_SIZE(e->trace_entries); - e->trace.skip = 0; - save_stack_trace_regs(regs, &e->trace); - - /* Round address down to nearest 16 bytes */ - shadow_copy = kmemcheck_shadow_lookup(address - & ~(SHADOW_COPY_SIZE - 1)); - BUG_ON(!shadow_copy); - - memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE); - - kmemcheck_show_addr(address); - memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1)); - memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE); - kmemcheck_hide_addr(address); - - tasklet_hi_schedule_first(&kmemcheck_tasklet); -} - -/* - * Save the context of a kmemcheck bug. - */ -void kmemcheck_error_save_bug(struct pt_regs *regs) -{ - struct kmemcheck_error *e; - - e = error_next_wr(); - if (!e) - return; - - e->type = KMEMCHECK_ERROR_BUG; - - memcpy(&e->regs, regs, sizeof(*regs)); - - e->trace.nr_entries = 0; - e->trace.entries = e->trace_entries; - e->trace.max_entries = ARRAY_SIZE(e->trace_entries); - e->trace.skip = 1; - save_stack_trace(&e->trace); - - tasklet_hi_schedule_first(&kmemcheck_tasklet); -} diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h deleted file mode 100644 index 39f80d7a874d..000000000000 --- a/arch/x86/mm/kmemcheck/error.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H -#define ARCH__X86__MM__KMEMCHECK__ERROR_H - -#include <linux/ptrace.h> - -#include "shadow.h" - -void kmemcheck_error_save(enum kmemcheck_shadow state, - unsigned long address, unsigned int size, struct pt_regs *regs); - -void kmemcheck_error_save_bug(struct pt_regs *regs); - -void kmemcheck_error_recall(void); - -#endif diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c deleted file mode 100644 index 4515bae36bbe..000000000000 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ /dev/null @@ -1,658 +0,0 @@ -/** - * kmemcheck - a heavyweight memory checker for the linux kernel - * Copyright (C) 2007, 2008 Vegard Nossum <vegardno@ifi.uio.no> - * (With a lot of help from Ingo Molnar and Pekka Enberg.) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2) as - * published by the Free Software Foundation. - */ - -#include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/kallsyms.h> -#include <linux/kernel.h> -#include <linux/kmemcheck.h> -#include <linux/mm.h> -#include <linux/page-flags.h> -#include <linux/percpu.h> -#include <linux/ptrace.h> -#include <linux/string.h> -#include <linux/types.h> - -#include <asm/cacheflush.h> -#include <asm/kmemcheck.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> - -#include "error.h" -#include "opcode.h" -#include "pte.h" -#include "selftest.h" -#include "shadow.h" - - -#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT -# define KMEMCHECK_ENABLED 0 -#endif - -#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT -# define KMEMCHECK_ENABLED 1 -#endif - -#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT -# define KMEMCHECK_ENABLED 2 -#endif - -int kmemcheck_enabled = KMEMCHECK_ENABLED; - -int __init kmemcheck_init(void) -{ -#ifdef CONFIG_SMP - /* - * Limit SMP to use a single CPU. We rely on the fact that this code - * runs before SMP is set up. - */ - if (setup_max_cpus > 1) { - printk(KERN_INFO - "kmemcheck: Limiting number of CPUs to 1.\n"); - setup_max_cpus = 1; - } -#endif - - if (!kmemcheck_selftest()) { - printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n"); - kmemcheck_enabled = 0; - return -EINVAL; - } - - printk(KERN_INFO "kmemcheck: Initialized\n"); - return 0; -} - -early_initcall(kmemcheck_init); - -/* - * We need to parse the kmemcheck= option before any memory is allocated. - */ -static int __init param_kmemcheck(char *str) -{ - int val; - int ret; - - if (!str) - return -EINVAL; - - ret = kstrtoint(str, 0, &val); - if (ret) - return ret; - kmemcheck_enabled = val; - return 0; -} - -early_param("kmemcheck", param_kmemcheck); - -int kmemcheck_show_addr(unsigned long address) -{ - pte_t *pte; - - pte = kmemcheck_pte_lookup(address); - if (!pte) - return 0; - - set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); - __flush_tlb_one(address); - return 1; -} - -int kmemcheck_hide_addr(unsigned long address) -{ - pte_t *pte; - - pte = kmemcheck_pte_lookup(address); - if (!pte) - return 0; - - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); - __flush_tlb_one(address); - return 1; -} - -struct kmemcheck_context { - bool busy; - int balance; - - /* - * There can be at most two memory operands to an instruction, but - * each address can cross a page boundary -- so we may need up to - * four addresses that must be hidden/revealed for each fault. - */ - unsigned long addr[4]; - unsigned long n_addrs; - unsigned long flags; - - /* Data size of the instruction that caused a fault. */ - unsigned int size; -}; - -static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context); - -bool kmemcheck_active(struct pt_regs *regs) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - - return data->balance > 0; -} - -/* Save an address that needs to be shown/hidden */ -static void kmemcheck_save_addr(unsigned long addr) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - - BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr)); - data->addr[data->n_addrs++] = addr; -} - -static unsigned int kmemcheck_show_all(void) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - unsigned int i; - unsigned int n; - - n = 0; - for (i = 0; i < data->n_addrs; ++i) - n += kmemcheck_show_addr(data->addr[i]); - - return n; -} - -static unsigned int kmemcheck_hide_all(void) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - unsigned int i; - unsigned int n; - - n = 0; - for (i = 0; i < data->n_addrs; ++i) - n += kmemcheck_hide_addr(data->addr[i]); - - return n; -} - -/* - * Called from the #PF handler. - */ -void kmemcheck_show(struct pt_regs *regs) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - - BUG_ON(!irqs_disabled()); - - if (unlikely(data->balance != 0)) { - kmemcheck_show_all(); - kmemcheck_error_save_bug(regs); - data->balance = 0; - return; - } - - /* - * None of the addresses actually belonged to kmemcheck. Note that - * this is not an error. - */ - if (kmemcheck_show_all() == 0) - return; - - ++data->balance; - - /* - * The IF needs to be cleared as well, so that the faulting - * instruction can run "uninterrupted". Otherwise, we might take - * an interrupt and start executing that before we've had a chance - * to hide the page again. - * - * NOTE: In the rare case of multiple faults, we must not override - * the original flags: - */ - if (!(regs->flags & X86_EFLAGS_TF)) - data->flags = regs->flags; - - regs->flags |= X86_EFLAGS_TF; - regs->flags &= ~X86_EFLAGS_IF; -} - -/* - * Called from the #DB handler. - */ -void kmemcheck_hide(struct pt_regs *regs) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - int n; - - BUG_ON(!irqs_disabled()); - - if (unlikely(data->balance != 1)) { - kmemcheck_show_all(); - kmemcheck_error_save_bug(regs); - data->n_addrs = 0; - data->balance = 0; - - if (!(data->flags & X86_EFLAGS_TF)) - regs->flags &= ~X86_EFLAGS_TF; - if (data->flags & X86_EFLAGS_IF) - regs->flags |= X86_EFLAGS_IF; - return; - } - - if (kmemcheck_enabled) - n = kmemcheck_hide_all(); - else - n = kmemcheck_show_all(); - - if (n == 0) - return; - - --data->balance; - - data->n_addrs = 0; - - if (!(data->flags & X86_EFLAGS_TF)) - regs->flags &= ~X86_EFLAGS_TF; - if (data->flags & X86_EFLAGS_IF) - regs->flags |= X86_EFLAGS_IF; -} - -void kmemcheck_show_pages(struct page *p, unsigned int n) -{ - unsigned int i; - - for (i = 0; i < n; ++i) { - unsigned long address; - pte_t *pte; - unsigned int level; - - address = (unsigned long) page_address(&p[i]); - pte = lookup_address(address, &level); - BUG_ON(!pte); - BUG_ON(level != PG_LEVEL_4K); - - set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN)); - __flush_tlb_one(address); - } -} - -bool kmemcheck_page_is_tracked(struct page *p) -{ - /* This will also check the "hidden" flag of the PTE. */ - return kmemcheck_pte_lookup((unsigned long) page_address(p)); -} - -void kmemcheck_hide_pages(struct page *p, unsigned int n) -{ - unsigned int i; - - for (i = 0; i < n; ++i) { - unsigned long address; - pte_t *pte; - unsigned int level; - - address = (unsigned long) page_address(&p[i]); - pte = lookup_address(address, &level); - BUG_ON(!pte); - BUG_ON(level != PG_LEVEL_4K); - - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); - set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN)); - __flush_tlb_one(address); - } -} - -/* Access may NOT cross page boundary */ -static void kmemcheck_read_strict(struct pt_regs *regs, - unsigned long addr, unsigned int size) -{ - void *shadow; - enum kmemcheck_shadow status; - - shadow = kmemcheck_shadow_lookup(addr); - if (!shadow) - return; - - kmemcheck_save_addr(addr); - status = kmemcheck_shadow_test(shadow, size); - if (status == KMEMCHECK_SHADOW_INITIALIZED) - return; - - if (kmemcheck_enabled) - kmemcheck_error_save(status, addr, size, regs); - - if (kmemcheck_enabled == 2) - kmemcheck_enabled = 0; - - /* Don't warn about it again. */ - kmemcheck_shadow_set(shadow, size); -} - -bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size) -{ - enum kmemcheck_shadow status; - void *shadow; - - shadow = kmemcheck_shadow_lookup(addr); - if (!shadow) - return true; - - status = kmemcheck_shadow_test_all(shadow, size); - - return status == KMEMCHECK_SHADOW_INITIALIZED; -} - -/* Access may cross page boundary */ -static void kmemcheck_read(struct pt_regs *regs, - unsigned long addr, unsigned int size) -{ - unsigned long page = addr & PAGE_MASK; - unsigned long next_addr = addr + size - 1; - unsigned long next_page = next_addr & PAGE_MASK; - - if (likely(page == next_page)) { - kmemcheck_read_strict(regs, addr, size); - return; - } - - /* - * What we do is basically to split the access across the - * two pages and handle each part separately. Yes, this means - * that we may now see reads that are 3 + 5 bytes, for - * example (and if both are uninitialized, there will be two - * reports), but it makes the code a lot simpler. - */ - kmemcheck_read_strict(regs, addr, next_page - addr); - kmemcheck_read_strict(regs, next_page, next_addr - next_page); -} - -static void kmemcheck_write_strict(struct pt_regs *regs, - unsigned long addr, unsigned int size) -{ - void *shadow; - - shadow = kmemcheck_shadow_lookup(addr); - if (!shadow) - return; - - kmemcheck_save_addr(addr); - kmemcheck_shadow_set(shadow, size); -} - -static void kmemcheck_write(struct pt_regs *regs, - unsigned long addr, unsigned int size) -{ - unsigned long page = addr & PAGE_MASK; - unsigned long next_addr = addr + size - 1; - unsigned long next_page = next_addr & PAGE_MASK; - - if (likely(page == next_page)) { - kmemcheck_write_strict(regs, addr, size); - return; - } - - /* See comment in kmemcheck_read(). */ - kmemcheck_write_strict(regs, addr, next_page - addr); - kmemcheck_write_strict(regs, next_page, next_addr - next_page); -} - -/* - * Copying is hard. We have two addresses, each of which may be split across - * a page (and each page will have different shadow addresses). - */ -static void kmemcheck_copy(struct pt_regs *regs, - unsigned long src_addr, unsigned long dst_addr, unsigned int size) -{ - uint8_t shadow[8]; - enum kmemcheck_shadow status; - - unsigned long page; - unsigned long next_addr; - unsigned long next_page; - - uint8_t *x; - unsigned int i; - unsigned int n; - - BUG_ON(size > sizeof(shadow)); - - page = src_addr & PAGE_MASK; - next_addr = src_addr + size - 1; - next_page = next_addr & PAGE_MASK; - - if (likely(page == next_page)) { - /* Same page */ - x = kmemcheck_shadow_lookup(src_addr); - if (x) { - kmemcheck_save_addr(src_addr); - for (i = 0; i < size; ++i) - shadow[i] = x[i]; - } else { - for (i = 0; i < size; ++i) - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - } else { - n = next_page - src_addr; - BUG_ON(n > sizeof(shadow)); - - /* First page */ - x = kmemcheck_shadow_lookup(src_addr); - if (x) { - kmemcheck_save_addr(src_addr); - for (i = 0; i < n; ++i) - shadow[i] = x[i]; - } else { - /* Not tracked */ - for (i = 0; i < n; ++i) - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - - /* Second page */ - x = kmemcheck_shadow_lookup(next_page); - if (x) { - kmemcheck_save_addr(next_page); - for (i = n; i < size; ++i) - shadow[i] = x[i - n]; - } else { - /* Not tracked */ - for (i = n; i < size; ++i) - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - } - - page = dst_addr & PAGE_MASK; - next_addr = dst_addr + size - 1; - next_page = next_addr & PAGE_MASK; - - if (likely(page == next_page)) { - /* Same page */ - x = kmemcheck_shadow_lookup(dst_addr); - if (x) { - kmemcheck_save_addr(dst_addr); - for (i = 0; i < size; ++i) { - x[i] = shadow[i]; - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - } - } else { - n = next_page - dst_addr; - BUG_ON(n > sizeof(shadow)); - - /* First page */ - x = kmemcheck_shadow_lookup(dst_addr); - if (x) { - kmemcheck_save_addr(dst_addr); - for (i = 0; i < n; ++i) { - x[i] = shadow[i]; - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - } - - /* Second page */ - x = kmemcheck_shadow_lookup(next_page); - if (x) { - kmemcheck_save_addr(next_page); - for (i = n; i < size; ++i) { - x[i - n] = shadow[i]; - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - } - } - - status = kmemcheck_shadow_test(shadow, size); - if (status == KMEMCHECK_SHADOW_INITIALIZED) - return; - - if (kmemcheck_enabled) - kmemcheck_error_save(status, src_addr, size, regs); - - if (kmemcheck_enabled == 2) - kmemcheck_enabled = 0; -} - -enum kmemcheck_method { - KMEMCHECK_READ, - KMEMCHECK_WRITE, -}; - -static void kmemcheck_access(struct pt_regs *regs, - unsigned long fallback_address, enum kmemcheck_method fallback_method) -{ - const uint8_t *insn; - const uint8_t *insn_primary; - unsigned int size; - - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - - /* Recursive fault -- ouch. */ - if (data->busy) { - kmemcheck_show_addr(fallback_address); - kmemcheck_error_save_bug(regs); - return; - } - - data->busy = true; - - insn = (const uint8_t *) regs->ip; - insn_primary = kmemcheck_opcode_get_primary(insn); - - kmemcheck_opcode_decode(insn, &size); - - switch (insn_primary[0]) { -#ifdef CONFIG_KMEMCHECK_BITOPS_OK - /* AND, OR, XOR */ - /* - * Unfortunately, these instructions have to be excluded from - * our regular checking since they access only some (and not - * all) bits. This clears out "bogus" bitfield-access warnings. - */ - case 0x80: - case 0x81: - case 0x82: - case 0x83: - switch ((insn_primary[1] >> 3) & 7) { - /* OR */ - case 1: - /* AND */ - case 4: - /* XOR */ - case 6: - kmemcheck_write(regs, fallback_address, size); - goto out; - - /* ADD */ - case 0: - /* ADC */ - case 2: - /* SBB */ - case 3: - /* SUB */ - case 5: - /* CMP */ - case 7: - break; - } - break; -#endif - - /* MOVS, MOVSB, MOVSW, MOVSD */ - case 0xa4: - case 0xa5: - /* - * These instructions are special because they take two - * addresses, but we only get one page fault. - */ - kmemcheck_copy(regs, regs->si, regs->di, size); - goto out; - - /* CMPS, CMPSB, CMPSW, CMPSD */ - case 0xa6: - case 0xa7: - kmemcheck_read(regs, regs->si, size); - kmemcheck_read(regs, regs->di, size); - goto out; - } - - /* - * If the opcode isn't special in any way, we use the data from the - * page fault handler to determine the address and type of memory - * access. - */ - switch (fallback_method) { - case KMEMCHECK_READ: - kmemcheck_read(regs, fallback_address, size); - goto out; - case KMEMCHECK_WRITE: - kmemcheck_write(regs, fallback_address, size); - goto out; - } - -out: - data->busy = false; -} - -bool kmemcheck_fault(struct pt_regs *regs, unsigned long address, - unsigned long error_code) -{ - pte_t *pte; - - /* - * XXX: Is it safe to assume that memory accesses from virtual 86 - * mode or non-kernel code segments will _never_ access kernel - * memory (e.g. tracked pages)? For now, we need this to avoid - * invoking kmemcheck for PnP BIOS calls. - */ - if (regs->flags & X86_VM_MASK) - return false; - if (regs->cs != __KERNEL_CS) - return false; - - pte = kmemcheck_pte_lookup(address); - if (!pte) - return false; - - WARN_ON_ONCE(in_nmi()); - - if (error_code & 2) - kmemcheck_access(regs, address, KMEMCHECK_WRITE); - else - kmemcheck_access(regs, address, KMEMCHECK_READ); - - kmemcheck_show(regs); - return true; -} - -bool kmemcheck_trap(struct pt_regs *regs) -{ - if (!kmemcheck_active(regs)) - return false; - - /* We're done. */ - kmemcheck_hide(regs); - return true; -} diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c deleted file mode 100644 index df8109ddf7fe..000000000000 --- a/arch/x86/mm/kmemcheck/opcode.c +++ /dev/null @@ -1,107 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/types.h> - -#include "opcode.h" - -static bool opcode_is_prefix(uint8_t b) -{ - return - /* Group 1 */ - b == 0xf0 || b == 0xf2 || b == 0xf3 - /* Group 2 */ - || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26 - || b == 0x64 || b == 0x65 - /* Group 3 */ - || b == 0x66 - /* Group 4 */ - || b == 0x67; -} - -#ifdef CONFIG_X86_64 -static bool opcode_is_rex_prefix(uint8_t b) -{ - return (b & 0xf0) == 0x40; -} -#else -static bool opcode_is_rex_prefix(uint8_t b) -{ - return false; -} -#endif - -#define REX_W (1 << 3) - -/* - * This is a VERY crude opcode decoder. We only need to find the size of the - * load/store that caused our #PF and this should work for all the opcodes - * that we care about. Moreover, the ones who invented this instruction set - * should be shot. - */ -void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size) -{ - /* Default operand size */ - int operand_size_override = 4; - - /* prefixes */ - for (; opcode_is_prefix(*op); ++op) { - if (*op == 0x66) - operand_size_override = 2; - } - - /* REX prefix */ - if (opcode_is_rex_prefix(*op)) { - uint8_t rex = *op; - - ++op; - if (rex & REX_W) { - switch (*op) { - case 0x63: - *size = 4; - return; - case 0x0f: - ++op; - - switch (*op) { - case 0xb6: - case 0xbe: - *size = 1; - return; - case 0xb7: - case 0xbf: - *size = 2; - return; - } - - break; - } - - *size = 8; - return; - } - } - - /* escape opcode */ - if (*op == 0x0f) { - ++op; - - /* - * This is move with zero-extend and sign-extend, respectively; - * we don't have to think about 0xb6/0xbe, because this is - * already handled in the conditional below. - */ - if (*op == 0xb7 || *op == 0xbf) - operand_size_override = 2; - } - - *size = (*op & 1) ? operand_size_override : 1; -} - -const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op) -{ - /* skip prefixes */ - while (opcode_is_prefix(*op)) - ++op; - if (opcode_is_rex_prefix(*op)) - ++op; - return op; -} diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h deleted file mode 100644 index 51a1ce94c24a..000000000000 --- a/arch/x86/mm/kmemcheck/opcode.h +++ /dev/null @@ -1,10 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H -#define ARCH__X86__MM__KMEMCHECK__OPCODE_H - -#include <linux/types.h> - -void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size); -const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op); - -#endif diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c deleted file mode 100644 index 8a03be90272a..000000000000 --- a/arch/x86/mm/kmemcheck/pte.c +++ /dev/null @@ -1,23 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/mm.h> - -#include <asm/pgtable.h> - -#include "pte.h" - -pte_t *kmemcheck_pte_lookup(unsigned long address) -{ - pte_t *pte; - unsigned int level; - - pte = lookup_address(address, &level); - if (!pte) - return NULL; - if (level != PG_LEVEL_4K) - return NULL; - if (!pte_hidden(*pte)) - return NULL; - - return pte; -} - diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h deleted file mode 100644 index b595612382c2..000000000000 --- a/arch/x86/mm/kmemcheck/pte.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H -#define ARCH__X86__MM__KMEMCHECK__PTE_H - -#include <linux/mm.h> - -#include <asm/pgtable.h> - -pte_t *kmemcheck_pte_lookup(unsigned long address); - -#endif diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c deleted file mode 100644 index 7ce0be1f99eb..000000000000 --- a/arch/x86/mm/kmemcheck/selftest.c +++ /dev/null @@ -1,71 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/bug.h> -#include <linux/kernel.h> - -#include "opcode.h" -#include "selftest.h" - -struct selftest_opcode { - unsigned int expected_size; - const uint8_t *insn; - const char *desc; -}; - -static const struct selftest_opcode selftest_opcodes[] = { - /* REP MOVS */ - {1, "\xf3\xa4", "rep movsb <mem8>, <mem8>"}, - {4, "\xf3\xa5", "rep movsl <mem32>, <mem32>"}, - - /* MOVZX / MOVZXD */ - {1, "\x66\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg16>"}, - {1, "\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg32>"}, - - /* MOVSX / MOVSXD */ - {1, "\x66\x0f\xbe\x51\xf8", "movswq <mem8>, <reg16>"}, - {1, "\x0f\xbe\x51\xf8", "movswq <mem8>, <reg32>"}, - -#ifdef CONFIG_X86_64 - /* MOVZX / MOVZXD */ - {1, "\x49\x0f\xb6\x51\xf8", "movzbq <mem8>, <reg64>"}, - {2, "\x49\x0f\xb7\x51\xf8", "movzbq <mem16>, <reg64>"}, - - /* MOVSX / MOVSXD */ - {1, "\x49\x0f\xbe\x51\xf8", "movsbq <mem8>, <reg64>"}, - {2, "\x49\x0f\xbf\x51\xf8", "movsbq <mem16>, <reg64>"}, - {4, "\x49\x63\x51\xf8", "movslq <mem32>, <reg64>"}, -#endif -}; - -static bool selftest_opcode_one(const struct selftest_opcode *op) -{ - unsigned size; - - kmemcheck_opcode_decode(op->insn, &size); - - if (size == op->expected_size) - return true; - - printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n", - op->desc, op->expected_size, size); - return false; -} - -static bool selftest_opcodes_all(void) -{ - bool pass = true; - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i) - pass = pass && selftest_opcode_one(&selftest_opcodes[i]); - - return pass; -} - -bool kmemcheck_selftest(void) -{ - bool pass = true; - - pass = pass && selftest_opcodes_all(); - - return pass; -} diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h deleted file mode 100644 index 8d759aae453d..000000000000 --- a/arch/x86/mm/kmemcheck/selftest.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H -#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H - -bool kmemcheck_selftest(void); - -#endif diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c deleted file mode 100644 index c2638a7d2c10..000000000000 --- a/arch/x86/mm/kmemcheck/shadow.c +++ /dev/null @@ -1,173 +0,0 @@ -#include <linux/kmemcheck.h> -#include <linux/export.h> -#include <linux/mm.h> - -#include <asm/page.h> -#include <asm/pgtable.h> - -#include "pte.h" -#include "shadow.h" - -/* - * Return the shadow address for the given address. Returns NULL if the - * address is not tracked. - * - * We need to be extremely careful not to follow any invalid pointers, - * because this function can be called for *any* possible address. - */ -void *kmemcheck_shadow_lookup(unsigned long address) -{ - pte_t *pte; - struct page *page; - - if (!virt_addr_valid(address)) - return NULL; - - pte = kmemcheck_pte_lookup(address); - if (!pte) - return NULL; - - page = virt_to_page(address); - if (!page->shadow) - return NULL; - return page->shadow + (address & (PAGE_SIZE - 1)); -} - -static void mark_shadow(void *address, unsigned int n, - enum kmemcheck_shadow status) -{ - unsigned long addr = (unsigned long) address; - unsigned long last_addr = addr + n - 1; - unsigned long page = addr & PAGE_MASK; - unsigned long last_page = last_addr & PAGE_MASK; - unsigned int first_n; - void *shadow; - - /* If the memory range crosses a page boundary, stop there. */ - if (page == last_page) - first_n = n; - else - first_n = page + PAGE_SIZE - addr; - - shadow = kmemcheck_shadow_lookup(addr); - if (shadow) - memset(shadow, status, first_n); - - addr += first_n; - n -= first_n; - - /* Do full-page memset()s. */ - while (n >= PAGE_SIZE) { - shadow = kmemcheck_shadow_lookup(addr); - if (shadow) - memset(shadow, status, PAGE_SIZE); - - addr += PAGE_SIZE; - n -= PAGE_SIZE; - } - - /* Do the remaining page, if any. */ - if (n > 0) { - shadow = kmemcheck_shadow_lookup(addr); - if (shadow) - memset(shadow, status, n); - } -} - -void kmemcheck_mark_unallocated(void *address, unsigned int n) -{ - mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED); -} - -void kmemcheck_mark_uninitialized(void *address, unsigned int n) -{ - mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED); -} - -/* - * Fill the shadow memory of the given address such that the memory at that - * address is marked as being initialized. - */ -void kmemcheck_mark_initialized(void *address, unsigned int n) -{ - mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED); -} -EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized); - -void kmemcheck_mark_freed(void *address, unsigned int n) -{ - mark_shadow(address, n, KMEMCHECK_SHADOW_FREED); -} - -void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n) -{ - unsigned int i; - - for (i = 0; i < n; ++i) - kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE); -} - -void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n) -{ - unsigned int i; - - for (i = 0; i < n; ++i) - kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE); -} - -void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n) -{ - unsigned int i; - - for (i = 0; i < n; ++i) - kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE); -} - -enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size) -{ -#ifdef CONFIG_KMEMCHECK_PARTIAL_OK - uint8_t *x; - unsigned int i; - - x = shadow; - - /* - * Make sure _some_ bytes are initialized. Gcc frequently generates - * code to access neighboring bytes. - */ - for (i = 0; i < size; ++i) { - if (x[i] == KMEMCHECK_SHADOW_INITIALIZED) - return x[i]; - } - - return x[0]; -#else - return kmemcheck_shadow_test_all(shadow, size); -#endif -} - -enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, unsigned int size) -{ - uint8_t *x; - unsigned int i; - - x = shadow; - - /* All bytes must be initialized. */ - for (i = 0; i < size; ++i) { - if (x[i] != KMEMCHECK_SHADOW_INITIALIZED) - return x[i]; - } - - return x[0]; -} - -void kmemcheck_shadow_set(void *shadow, unsigned int size) -{ - uint8_t *x; - unsigned int i; - - x = shadow; - for (i = 0; i < size; ++i) - x[i] = KMEMCHECK_SHADOW_INITIALIZED; -} diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h deleted file mode 100644 index 49768dc18664..000000000000 --- a/arch/x86/mm/kmemcheck/shadow.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H -#define ARCH__X86__MM__KMEMCHECK__SHADOW_H - -enum kmemcheck_shadow { - KMEMCHECK_SHADOW_UNALLOCATED, - KMEMCHECK_SHADOW_UNINITIALIZED, - KMEMCHECK_SHADOW_INITIALIZED, - KMEMCHECK_SHADOW_FREED, -}; - -void *kmemcheck_shadow_lookup(unsigned long address); - -enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size); -enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, - unsigned int size); -void kmemcheck_shadow_set(void *shadow, unsigned int size); - -#endif diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 3fe68483463c..85cf12219dea 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -753,7 +753,7 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte, if (!debug_pagealloc_enabled()) spin_unlock(&cpa_lock); - base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); + base = alloc_pages(GFP_KERNEL, 0); if (!debug_pagealloc_enabled()) spin_lock(&cpa_lock); if (!base) @@ -904,7 +904,7 @@ static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end) static int alloc_pte_page(pmd_t *pmd) { - pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL); if (!pte) return -1; @@ -914,7 +914,7 @@ static int alloc_pte_page(pmd_t *pmd) static int alloc_pmd_page(pud_t *pud) { - pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); if (!pmd) return -1; @@ -1120,7 +1120,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) pgd_entry = cpa->pgd + pgd_index(addr); if (pgd_none(*pgd_entry)) { - p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); if (!p4d) return -1; @@ -1132,7 +1132,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) */ p4d = p4d_offset(pgd_entry, addr); if (p4d_none(*p4d)) { - pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + pud = (pud_t *)get_zeroed_page(GFP_KERNEL); if (!pud) return -1; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 17ebc5a978cc..004abf9ebf12 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -7,7 +7,7 @@ #include <asm/fixmap.h> #include <asm/mtrr.h> -#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) #ifdef CONFIG_HIGHPTE #define PGALLOC_USER_GFP __GFP_HIGHMEM @@ -355,14 +355,15 @@ static inline void _pgd_free(pgd_t *pgd) kmem_cache_free(pgd_cache, pgd); } #else + static inline pgd_t *_pgd_alloc(void) { - return (pgd_t *)__get_free_page(PGALLOC_GFP); + return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); } static inline void _pgd_free(pgd_t *pgd) { - free_page((unsigned long)pgd); + free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); } #endif /* CONFIG_X86_PAE */ diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 6b9bf023a700..c3c5274410a9 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -10,6 +10,7 @@ #include <linux/pagemap.h> #include <linux/spinlock.h> +#include <asm/cpu_entry_area.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/fixmap.h> diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c new file mode 100644 index 000000000000..bce8aea65606 --- /dev/null +++ b/arch/x86/mm/pti.c @@ -0,0 +1,387 @@ +/* + * Copyright(c) 2017 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * This code is based in part on work published here: + * + * https://github.com/IAIK/KAISER + * + * The original work was written by and and signed off by for the Linux + * kernel by: + * + * Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at> + * Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at> + * Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at> + * Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at> + * + * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com> + * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and + * Andy Lutomirsky <luto@amacapital.net> + */ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/bug.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/mm.h> +#include <linux/uaccess.h> + +#include <asm/cpufeature.h> +#include <asm/hypervisor.h> +#include <asm/vsyscall.h> +#include <asm/cmdline.h> +#include <asm/pti.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> +#include <asm/desc.h> + +#undef pr_fmt +#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt + +/* Backporting helper */ +#ifndef __GFP_NOTRACK +#define __GFP_NOTRACK 0 +#endif + +static void __init pti_print_if_insecure(const char *reason) +{ + if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) + pr_info("%s\n", reason); +} + +static void __init pti_print_if_secure(const char *reason) +{ + if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) + pr_info("%s\n", reason); +} + +void __init pti_check_boottime_disable(void) +{ + char arg[5]; + int ret; + + if (hypervisor_is_type(X86_HYPER_XEN_PV)) { + pti_print_if_insecure("disabled on XEN PV."); + return; + } + + ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg)); + if (ret > 0) { + if (ret == 3 && !strncmp(arg, "off", 3)) { + pti_print_if_insecure("disabled on command line."); + return; + } + if (ret == 2 && !strncmp(arg, "on", 2)) { + pti_print_if_secure("force enabled on command line."); + goto enable; + } + if (ret == 4 && !strncmp(arg, "auto", 4)) + goto autosel; + } + + if (cmdline_find_option_bool(boot_command_line, "nopti")) { + pti_print_if_insecure("disabled on command line."); + return; + } + +autosel: + if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) + return; +enable: + setup_force_cpu_cap(X86_FEATURE_PTI); +} + +pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) +{ + /* + * Changes to the high (kernel) portion of the kernelmode page + * tables are not automatically propagated to the usermode tables. + * + * Users should keep in mind that, unlike the kernelmode tables, + * there is no vmalloc_fault equivalent for the usermode tables. + * Top-level entries added to init_mm's usermode pgd after boot + * will not be automatically propagated to other mms. + */ + if (!pgdp_maps_userspace(pgdp)) + return pgd; + + /* + * The user page tables get the full PGD, accessible from + * userspace: + */ + kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd; + + /* + * If this is normal user memory, make it NX in the kernel + * pagetables so that, if we somehow screw up and return to + * usermode with the kernel CR3 loaded, we'll get a page fault + * instead of allowing user code to execute with the wrong CR3. + * + * As exceptions, we don't set NX if: + * - _PAGE_USER is not set. This could be an executable + * EFI runtime mapping or something similar, and the kernel + * may execute from it + * - we don't have NX support + * - we're clearing the PGD (i.e. the new pgd is not present). + */ + if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) && + (__supported_pte_mask & _PAGE_NX)) + pgd.pgd |= _PAGE_NX; + + /* return the copy of the PGD we want the kernel to use: */ + return pgd; +} + +/* + * Walk the user copy of the page tables (optionally) trying to allocate + * page table pages on the way down. + * + * Returns a pointer to a P4D on success, or NULL on failure. + */ +static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) +{ + pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address)); + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + + if (address < PAGE_OFFSET) { + WARN_ONCE(1, "attempt to walk user address\n"); + return NULL; + } + + if (pgd_none(*pgd)) { + unsigned long new_p4d_page = __get_free_page(gfp); + if (!new_p4d_page) + return NULL; + + if (pgd_none(*pgd)) { + set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); + new_p4d_page = 0; + } + if (new_p4d_page) + free_page(new_p4d_page); + } + BUILD_BUG_ON(pgd_large(*pgd) != 0); + + return p4d_offset(pgd, address); +} + +/* + * Walk the user copy of the page tables (optionally) trying to allocate + * page table pages on the way down. + * + * Returns a pointer to a PMD on success, or NULL on failure. + */ +static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) +{ + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + p4d_t *p4d = pti_user_pagetable_walk_p4d(address); + pud_t *pud; + + BUILD_BUG_ON(p4d_large(*p4d) != 0); + if (p4d_none(*p4d)) { + unsigned long new_pud_page = __get_free_page(gfp); + if (!new_pud_page) + return NULL; + + if (p4d_none(*p4d)) { + set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); + new_pud_page = 0; + } + if (new_pud_page) + free_page(new_pud_page); + } + + pud = pud_offset(p4d, address); + /* The user page tables do not use large mappings: */ + if (pud_large(*pud)) { + WARN_ON(1); + return NULL; + } + if (pud_none(*pud)) { + unsigned long new_pmd_page = __get_free_page(gfp); + if (!new_pmd_page) + return NULL; + + if (pud_none(*pud)) { + set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); + new_pmd_page = 0; + } + if (new_pmd_page) + free_page(new_pmd_page); + } + + return pmd_offset(pud, address); +} + +#ifdef CONFIG_X86_VSYSCALL_EMULATION +/* + * Walk the shadow copy of the page tables (optionally) trying to allocate + * page table pages on the way down. Does not support large pages. + * + * Note: this is only used when mapping *new* kernel data into the + * user/shadow page tables. It is never used for userspace data. + * + * Returns a pointer to a PTE on success, or NULL on failure. + */ +static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address) +{ + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + pmd_t *pmd = pti_user_pagetable_walk_pmd(address); + pte_t *pte; + + /* We can't do anything sensible if we hit a large mapping. */ + if (pmd_large(*pmd)) { + WARN_ON(1); + return NULL; + } + + if (pmd_none(*pmd)) { + unsigned long new_pte_page = __get_free_page(gfp); + if (!new_pte_page) + return NULL; + + if (pmd_none(*pmd)) { + set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); + new_pte_page = 0; + } + if (new_pte_page) + free_page(new_pte_page); + } + + pte = pte_offset_kernel(pmd, address); + if (pte_flags(*pte) & _PAGE_USER) { + WARN_ONCE(1, "attempt to walk to user pte\n"); + return NULL; + } + return pte; +} + +static void __init pti_setup_vsyscall(void) +{ + pte_t *pte, *target_pte; + unsigned int level; + + pte = lookup_address(VSYSCALL_ADDR, &level); + if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte)) + return; + + target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR); + if (WARN_ON(!target_pte)) + return; + + *target_pte = *pte; + set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir)); +} +#else +static void __init pti_setup_vsyscall(void) { } +#endif + +static void __init +pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear) +{ + unsigned long addr; + + /* + * Clone the populated PMDs which cover start to end. These PMD areas + * can have holes. + */ + for (addr = start; addr < end; addr += PMD_SIZE) { + pmd_t *pmd, *target_pmd; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + + pgd = pgd_offset_k(addr); + if (WARN_ON(pgd_none(*pgd))) + return; + p4d = p4d_offset(pgd, addr); + if (WARN_ON(p4d_none(*p4d))) + return; + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) + continue; + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + continue; + + target_pmd = pti_user_pagetable_walk_pmd(addr); + if (WARN_ON(!target_pmd)) + return; + + /* + * Copy the PMD. That is, the kernelmode and usermode + * tables will share the last-level page tables of this + * address range + */ + *target_pmd = pmd_clear_flags(*pmd, clear); + } +} + +/* + * Clone a single p4d (i.e. a top-level entry on 4-level systems and a + * next-level entry on 5-level systems. + */ +static void __init pti_clone_p4d(unsigned long addr) +{ + p4d_t *kernel_p4d, *user_p4d; + pgd_t *kernel_pgd; + + user_p4d = pti_user_pagetable_walk_p4d(addr); + kernel_pgd = pgd_offset_k(addr); + kernel_p4d = p4d_offset(kernel_pgd, addr); + *user_p4d = *kernel_p4d; +} + +/* + * Clone the CPU_ENTRY_AREA into the user space visible page table. + */ +static void __init pti_clone_user_shared(void) +{ + pti_clone_p4d(CPU_ENTRY_AREA_BASE); +} + +/* + * Clone the ESPFIX P4D into the user space visinble page table + */ +static void __init pti_setup_espfix64(void) +{ +#ifdef CONFIG_X86_ESPFIX64 + pti_clone_p4d(ESPFIX_BASE_ADDR); +#endif +} + +/* + * Clone the populated PMDs of the entry and irqentry text and force it RO. + */ +static void __init pti_clone_entry_text(void) +{ + pti_clone_pmds((unsigned long) __entry_text_start, + (unsigned long) __irqentry_text_end, _PAGE_RW); +} + +/* + * Initialize kernel page table isolation + */ +void __init pti_init(void) +{ + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + pr_info("enabled\n"); + + pti_clone_user_shared(); + pti_clone_entry_text(); + pti_setup_espfix64(); + pti_setup_vsyscall(); +} diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 3118392cdf75..a1561957dccb 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -28,6 +28,38 @@ * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi */ +/* + * We get here when we do something requiring a TLB invalidation + * but could not go invalidate all of the contexts. We do the + * necessary invalidation by clearing out the 'ctx_id' which + * forces a TLB flush when the context is loaded. + */ +void clear_asid_other(void) +{ + u16 asid; + + /* + * This is only expected to be set if we have disabled + * kernel _PAGE_GLOBAL pages. + */ + if (!static_cpu_has(X86_FEATURE_PTI)) { + WARN_ON_ONCE(1); + return; + } + + for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { + /* Do not need to flush the current asid */ + if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid)) + continue; + /* + * Make sure the next time we go to switch to + * this asid, we do a flush: + */ + this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0); + } + this_cpu_write(cpu_tlbstate.invalidate_other, false); +} + atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); @@ -42,6 +74,9 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, return; } + if (this_cpu_read(cpu_tlbstate.invalidate_other)) + clear_asid_other(); + for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) != next->context.ctx_id) @@ -65,6 +100,25 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, *need_flush = true; } +static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush) +{ + unsigned long new_mm_cr3; + + if (need_flush) { + invalidate_user_asid(new_asid); + new_mm_cr3 = build_cr3(pgdir, new_asid); + } else { + new_mm_cr3 = build_cr3_noflush(pgdir, new_asid); + } + + /* + * Caution: many callers of this function expect + * that load_cr3() is serializing and orders TLB + * fills with respect to the mm_cpumask writes. + */ + write_cr3(new_mm_cr3); +} + void leave_mm(int cpu) { struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); @@ -128,7 +182,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * isn't free. */ #ifdef CONFIG_DEBUG_VM - if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) { + if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) { /* * If we were to BUG here, we'd be very likely to kill * the system so hard that we don't see the call trace. @@ -195,7 +249,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, if (need_flush) { this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); - write_cr3(build_cr3(next, new_asid)); + load_new_mm_cr3(next->pgd, new_asid, true); /* * NB: This gets called via leave_mm() in the idle path @@ -208,7 +262,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } else { /* The new ASID is already up to date. */ - write_cr3(build_cr3_noflush(next, new_asid)); + load_new_mm_cr3(next->pgd, new_asid, false); /* See above wrt _rcuidle. */ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); @@ -288,7 +342,7 @@ void initialize_tlbstate_and_flush(void) !(cr4_read_shadow() & X86_CR4_PCIDE)); /* Force ASID 0 and force a TLB flush. */ - write_cr3(build_cr3(mm, 0)); + write_cr3(build_cr3(mm->pgd, 0)); /* Reinitialize tlbstate. */ this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); @@ -551,7 +605,7 @@ static void do_kernel_range_flush(void *info) /* flush range by one by one 'invlpg' */ for (addr = f->start; addr < f->end; addr += PAGE_SIZE) - __flush_tlb_single(addr); + __flush_tlb_one(addr); } void flush_tlb_kernel_range(unsigned long start, unsigned long end) |