summaryrefslogtreecommitdiffstats
path: root/arch/x86/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/Makefile11
-rw-r--r--arch/x86/mm/cpu_entry_area.c166
-rw-r--r--arch/x86/mm/debug_pagetables.c80
-rw-r--r--arch/x86/mm/dump_pagetables.c141
-rw-r--r--arch/x86/mm/extable.c2
-rw-r--r--arch/x86/mm/fault.c10
-rw-r--r--arch/x86/mm/init.c88
-rw-r--r--arch/x86/mm/init_32.c6
-rw-r--r--arch/x86/mm/init_64.c13
-rw-r--r--arch/x86/mm/kasan_init_64.c23
-rw-r--r--arch/x86/mm/kmemcheck/Makefile1
-rw-r--r--arch/x86/mm/kmemcheck/error.c228
-rw-r--r--arch/x86/mm/kmemcheck/error.h16
-rw-r--r--arch/x86/mm/kmemcheck/kmemcheck.c658
-rw-r--r--arch/x86/mm/kmemcheck/opcode.c107
-rw-r--r--arch/x86/mm/kmemcheck/opcode.h10
-rw-r--r--arch/x86/mm/kmemcheck/pte.c23
-rw-r--r--arch/x86/mm/kmemcheck/pte.h11
-rw-r--r--arch/x86/mm/kmemcheck/selftest.c71
-rw-r--r--arch/x86/mm/kmemcheck/selftest.h7
-rw-r--r--arch/x86/mm/kmemcheck/shadow.c173
-rw-r--r--arch/x86/mm/kmemcheck/shadow.h19
-rw-r--r--arch/x86/mm/pageattr.c10
-rw-r--r--arch/x86/mm/pgtable.c7
-rw-r--r--arch/x86/mm/pgtable_32.c1
-rw-r--r--arch/x86/mm/pti.c387
-rw-r--r--arch/x86/mm/tlb.c64
27 files changed, 889 insertions, 1444 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 7ba7f3d7f477..27e9e90a8d35 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -10,7 +10,7 @@ CFLAGS_REMOVE_mem_encrypt.o = -pg
endif
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
- pat.o pgtable.o physaddr.o setup_nx.o tlb.o
+ pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o
# Make sure __phys_addr has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
@@ -29,8 +29,6 @@ obj-$(CONFIG_X86_PTDUMP) += debug_pagetables.o
obj-$(CONFIG_HIGHMEM) += highmem_32.o
-obj-$(CONFIG_KMEMCHECK) += kmemcheck/
-
KASAN_SANITIZE_kasan_init_$(BITS).o := n
obj-$(CONFIG_KASAN) += kasan_init_$(BITS).o
@@ -43,9 +41,10 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o
obj-$(CONFIG_ACPI_NUMA) += srat.o
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
-obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
-obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
-obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
+obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
+obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
+obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
+obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
new file mode 100644
index 000000000000..b9283cc27622
--- /dev/null
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+
+#include <asm/cpu_entry_area.h>
+#include <asm/pgtable.h>
+#include <asm/fixmap.h>
+#include <asm/desc.h>
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage);
+
+#ifdef CONFIG_X86_64
+static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+ [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
+#endif
+
+struct cpu_entry_area *get_cpu_entry_area(int cpu)
+{
+ unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
+ BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
+
+ return (struct cpu_entry_area *) va;
+}
+EXPORT_SYMBOL(get_cpu_entry_area);
+
+void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags)
+{
+ unsigned long va = (unsigned long) cea_vaddr;
+
+ set_pte_vaddr(va, pfn_pte(pa >> PAGE_SHIFT, flags));
+}
+
+static void __init
+cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
+{
+ for ( ; pages; pages--, cea_vaddr+= PAGE_SIZE, ptr += PAGE_SIZE)
+ cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
+}
+
+static void percpu_setup_debug_store(int cpu)
+{
+#ifdef CONFIG_CPU_SUP_INTEL
+ int npages;
+ void *cea;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return;
+
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_store;
+ npages = sizeof(struct debug_store) / PAGE_SIZE;
+ BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0);
+ cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages,
+ PAGE_KERNEL);
+
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers;
+ /*
+ * Force the population of PMDs for not yet allocated per cpu
+ * memory like debug store buffers.
+ */
+ npages = sizeof(struct debug_store_buffers) / PAGE_SIZE;
+ for (; npages; npages--, cea += PAGE_SIZE)
+ cea_set_pte(cea, 0, PAGE_NONE);
+#endif
+}
+
+/* Setup the fixmap mappings only once per-processor */
+static void __init setup_cpu_entry_area(int cpu)
+{
+#ifdef CONFIG_X86_64
+ extern char _entry_trampoline[];
+
+ /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
+ pgprot_t gdt_prot = PAGE_KERNEL_RO;
+ pgprot_t tss_prot = PAGE_KERNEL_RO;
+#else
+ /*
+ * On native 32-bit systems, the GDT cannot be read-only because
+ * our double fault handler uses a task gate, and entering through
+ * a task gate needs to change an available TSS to busy. If the
+ * GDT is read-only, that will triple fault. The TSS cannot be
+ * read-only because the CPU writes to it on task switches.
+ *
+ * On Xen PV, the GDT must be read-only because the hypervisor
+ * requires it.
+ */
+ pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
+ PAGE_KERNEL_RO : PAGE_KERNEL;
+ pgprot_t tss_prot = PAGE_KERNEL;
+#endif
+
+ cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu),
+ gdt_prot);
+
+ cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page,
+ per_cpu_ptr(&entry_stack_storage, cpu), 1,
+ PAGE_KERNEL);
+
+ /*
+ * The Intel SDM says (Volume 3, 7.2.1):
+ *
+ * Avoid placing a page boundary in the part of the TSS that the
+ * processor reads during a task switch (the first 104 bytes). The
+ * processor may not correctly perform address translations if a
+ * boundary occurs in this area. During a task switch, the processor
+ * reads and writes into the first 104 bytes of each TSS (using
+ * contiguous physical addresses beginning with the physical address
+ * of the first byte of the TSS). So, after TSS access begins, if
+ * part of the 104 bytes is not physically contiguous, the processor
+ * will access incorrect information without generating a page-fault
+ * exception.
+ *
+ * There are also a lot of errata involving the TSS spanning a page
+ * boundary. Assert that we're not doing that.
+ */
+ BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
+ offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
+ BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
+ cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss,
+ &per_cpu(cpu_tss_rw, cpu),
+ sizeof(struct tss_struct) / PAGE_SIZE, tss_prot);
+
+#ifdef CONFIG_X86_32
+ per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
+#endif
+
+#ifdef CONFIG_X86_64
+ BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
+ BUILD_BUG_ON(sizeof(exception_stacks) !=
+ sizeof(((struct cpu_entry_area *)0)->exception_stacks));
+ cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks,
+ &per_cpu(exception_stacks, cpu),
+ sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL);
+
+ cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
+ __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
+#endif
+ percpu_setup_debug_store(cpu);
+}
+
+static __init void setup_cpu_entry_area_ptes(void)
+{
+#ifdef CONFIG_X86_32
+ unsigned long start, end;
+
+ BUILD_BUG_ON(CPU_ENTRY_AREA_PAGES * PAGE_SIZE < CPU_ENTRY_AREA_MAP_SIZE);
+ BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK);
+
+ start = CPU_ENTRY_AREA_BASE;
+ end = start + CPU_ENTRY_AREA_MAP_SIZE;
+
+ /* Careful here: start + PMD_SIZE might wrap around */
+ for (; start < end && start >= CPU_ENTRY_AREA_BASE; start += PMD_SIZE)
+ populate_extra_pte(start);
+#endif
+}
+
+void __init setup_cpu_entry_areas(void)
+{
+ unsigned int cpu;
+
+ setup_cpu_entry_area_ptes();
+
+ for_each_possible_cpu(cpu)
+ setup_cpu_entry_area(cpu);
+}
diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c
index bfcffdf6c577..421f2664ffa0 100644
--- a/arch/x86/mm/debug_pagetables.c
+++ b/arch/x86/mm/debug_pagetables.c
@@ -5,7 +5,7 @@
static int ptdump_show(struct seq_file *m, void *v)
{
- ptdump_walk_pgd_level(m, NULL);
+ ptdump_walk_pgd_level_debugfs(m, NULL, false);
return 0;
}
@@ -22,21 +22,89 @@ static const struct file_operations ptdump_fops = {
.release = single_release,
};
-static struct dentry *pe;
+static int ptdump_show_curknl(struct seq_file *m, void *v)
+{
+ if (current->mm->pgd) {
+ down_read(&current->mm->mmap_sem);
+ ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false);
+ up_read(&current->mm->mmap_sem);
+ }
+ return 0;
+}
+
+static int ptdump_open_curknl(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, ptdump_show_curknl, NULL);
+}
+
+static const struct file_operations ptdump_curknl_fops = {
+ .owner = THIS_MODULE,
+ .open = ptdump_open_curknl,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+static struct dentry *pe_curusr;
+
+static int ptdump_show_curusr(struct seq_file *m, void *v)
+{
+ if (current->mm->pgd) {
+ down_read(&current->mm->mmap_sem);
+ ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true);
+ up_read(&current->mm->mmap_sem);
+ }
+ return 0;
+}
+
+static int ptdump_open_curusr(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, ptdump_show_curusr, NULL);
+}
+
+static const struct file_operations ptdump_curusr_fops = {
+ .owner = THIS_MODULE,
+ .open = ptdump_open_curusr,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif
+
+static struct dentry *dir, *pe_knl, *pe_curknl;
static int __init pt_dump_debug_init(void)
{
- pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL,
- &ptdump_fops);
- if (!pe)
+ dir = debugfs_create_dir("page_tables", NULL);
+ if (!dir)
return -ENOMEM;
+ pe_knl = debugfs_create_file("kernel", 0400, dir, NULL,
+ &ptdump_fops);
+ if (!pe_knl)
+ goto err;
+
+ pe_curknl = debugfs_create_file("current_kernel", 0400,
+ dir, NULL, &ptdump_curknl_fops);
+ if (!pe_curknl)
+ goto err;
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ pe_curusr = debugfs_create_file("current_user", 0400,
+ dir, NULL, &ptdump_curusr_fops);
+ if (!pe_curusr)
+ goto err;
+#endif
return 0;
+err:
+ debugfs_remove_recursive(dir);
+ return -ENOMEM;
}
static void __exit pt_dump_debug_exit(void)
{
- debugfs_remove_recursive(pe);
+ debugfs_remove_recursive(dir);
}
module_init(pt_dump_debug_init);
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 5e3ac6fe6c9e..f56902c1f04b 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -44,68 +44,97 @@ struct addr_marker {
unsigned long max_lines;
};
-/* indices for address_markers; keep sync'd w/ address_markers below */
+/* Address space markers hints */
+
+#ifdef CONFIG_X86_64
+
enum address_markers_idx {
USER_SPACE_NR = 0,
-#ifdef CONFIG_X86_64
KERNEL_SPACE_NR,
LOW_KERNEL_NR,
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
+ LDT_NR,
+#endif
VMALLOC_START_NR,
VMEMMAP_START_NR,
#ifdef CONFIG_KASAN
KASAN_SHADOW_START_NR,
KASAN_SHADOW_END_NR,
#endif
-# ifdef CONFIG_X86_ESPFIX64
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
+ LDT_NR,
+#endif
+ CPU_ENTRY_AREA_NR,
+#ifdef CONFIG_X86_ESPFIX64
ESPFIX_START_NR,
-# endif
+#endif
+#ifdef CONFIG_EFI
+ EFI_END_NR,
+#endif
HIGH_KERNEL_NR,
MODULES_VADDR_NR,
MODULES_END_NR,
-#else
+ FIXADDR_START_NR,
+ END_OF_SPACE_NR,
+};
+
+static struct addr_marker address_markers[] = {
+ [USER_SPACE_NR] = { 0, "User Space" },
+ [KERNEL_SPACE_NR] = { (1UL << 63), "Kernel Space" },
+ [LOW_KERNEL_NR] = { 0UL, "Low Kernel Mapping" },
+ [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" },
+ [VMEMMAP_START_NR] = { 0UL, "Vmemmap" },
+#ifdef CONFIG_KASAN
+ [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" },
+ [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" },
+#endif
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ [LDT_NR] = { LDT_BASE_ADDR, "LDT remap" },
+#endif
+ [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
+#ifdef CONFIG_X86_ESPFIX64
+ [ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
+#endif
+#ifdef CONFIG_EFI
+ [EFI_END_NR] = { EFI_VA_END, "EFI Runtime Services" },
+#endif
+ [HIGH_KERNEL_NR] = { __START_KERNEL_map, "High Kernel Mapping" },
+ [MODULES_VADDR_NR] = { MODULES_VADDR, "Modules" },
+ [MODULES_END_NR] = { MODULES_END, "End Modules" },
+ [FIXADDR_START_NR] = { FIXADDR_START, "Fixmap Area" },
+ [END_OF_SPACE_NR] = { -1, NULL }
+};
+
+#else /* CONFIG_X86_64 */
+
+enum address_markers_idx {
+ USER_SPACE_NR = 0,
KERNEL_SPACE_NR,
VMALLOC_START_NR,
VMALLOC_END_NR,
-# ifdef CONFIG_HIGHMEM
+#ifdef CONFIG_HIGHMEM
PKMAP_BASE_NR,
-# endif
- FIXADDR_START_NR,
#endif
+ CPU_ENTRY_AREA_NR,
+ FIXADDR_START_NR,
+ END_OF_SPACE_NR,
};
-/* Address space markers hints */
static struct addr_marker address_markers[] = {
- { 0, "User Space" },
-#ifdef CONFIG_X86_64
- { 0x8000000000000000UL, "Kernel Space" },
- { 0/* PAGE_OFFSET */, "Low Kernel Mapping" },
- { 0/* VMALLOC_START */, "vmalloc() Area" },
- { 0/* VMEMMAP_START */, "Vmemmap" },
-#ifdef CONFIG_KASAN
- { KASAN_SHADOW_START, "KASAN shadow" },
- { KASAN_SHADOW_END, "KASAN shadow end" },
+ [USER_SPACE_NR] = { 0, "User Space" },
+ [KERNEL_SPACE_NR] = { PAGE_OFFSET, "Kernel Mapping" },
+ [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" },
+ [VMALLOC_END_NR] = { 0UL, "vmalloc() End" },
+#ifdef CONFIG_HIGHMEM
+ [PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" },
#endif
-# ifdef CONFIG_X86_ESPFIX64
- { ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
-# endif
-# ifdef CONFIG_EFI
- { EFI_VA_END, "EFI Runtime Services" },
-# endif
- { __START_KERNEL_map, "High Kernel Mapping" },
- { MODULES_VADDR, "Modules" },
- { MODULES_END, "End Modules" },
-#else
- { PAGE_OFFSET, "Kernel Mapping" },
- { 0/* VMALLOC_START */, "vmalloc() Area" },
- { 0/*VMALLOC_END*/, "vmalloc() End" },
-# ifdef CONFIG_HIGHMEM
- { 0/*PKMAP_BASE*/, "Persistent kmap() Area" },
-# endif
- { 0/*FIXADDR_START*/, "Fixmap Area" },
-#endif
- { -1, NULL } /* End of list */
+ [CPU_ENTRY_AREA_NR] = { 0UL, "CPU entry area" },
+ [FIXADDR_START_NR] = { 0UL, "Fixmap area" },
+ [END_OF_SPACE_NR] = { -1, NULL }
};
+#endif /* !CONFIG_X86_64 */
+
/* Multipliers for offsets within the PTEs */
#define PTE_LEVEL_MULT (PAGE_SIZE)
#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
@@ -140,7 +169,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
static const char * const level_name[] =
{ "cr3", "pgd", "p4d", "pud", "pmd", "pte" };
- if (!pgprot_val(prot)) {
+ if (!(pr & _PAGE_PRESENT)) {
/* Not present */
pt_dump_cont_printf(m, dmsg, " ");
} else {
@@ -447,7 +476,7 @@ static inline bool is_hypervisor_range(int idx)
}
static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
- bool checkwx)
+ bool checkwx, bool dmesg)
{
#ifdef CONFIG_X86_64
pgd_t *start = (pgd_t *) &init_top_pgt;
@@ -460,7 +489,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
if (pgd) {
start = pgd;
- st.to_dmesg = true;
+ st.to_dmesg = dmesg;
}
st.check_wx = checkwx;
@@ -498,13 +527,37 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
{
- ptdump_walk_pgd_level_core(m, pgd, false);
+ ptdump_walk_pgd_level_core(m, pgd, false, true);
+}
+
+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ if (user && static_cpu_has(X86_FEATURE_PTI))
+ pgd = kernel_to_user_pgdp(pgd);
+#endif
+ ptdump_walk_pgd_level_core(m, pgd, false, false);
+}
+EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
+
+static void ptdump_walk_user_pgd_level_checkwx(void)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ pgd_t *pgd = (pgd_t *) &init_top_pgt;
+
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ pr_info("x86/mm: Checking user space page tables\n");
+ pgd = kernel_to_user_pgdp(pgd);
+ ptdump_walk_pgd_level_core(NULL, pgd, true, false);
+#endif
}
-EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level);
void ptdump_walk_pgd_level_checkwx(void)
{
- ptdump_walk_pgd_level_core(NULL, NULL, true);
+ ptdump_walk_pgd_level_core(NULL, NULL, true, false);
+ ptdump_walk_user_pgd_level_checkwx();
}
static int __init pt_dump_init(void)
@@ -525,8 +578,8 @@ static int __init pt_dump_init(void)
address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
# endif
address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
+ address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE;
#endif
-
return 0;
}
__initcall(pt_dump_init);
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 88754bfd425f..9fe656c42aa5 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -83,7 +83,7 @@ bool ex_handler_refcount(const struct exception_table_entry *fixup,
return true;
}
-EXPORT_SYMBOL_GPL(ex_handler_refcount);
+EXPORT_SYMBOL(ex_handler_refcount);
/*
* Handler for when we fail to restore a task's FPU state. We should never get
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 3109ba6c6ede..06fe3d51d385 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -20,7 +20,6 @@
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
-#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
#include <asm/fixmap.h> /* VSYSCALL_ADDR */
#include <asm/vsyscall.h> /* emulate_vsyscall */
#include <asm/vm86.h> /* struct vm86 */
@@ -702,7 +701,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
else
printk(KERN_CONT "paging request");
- printk(KERN_CONT " at %p\n", (void *) address);
+ printk(KERN_CONT " at %px\n", (void *) address);
printk(KERN_ALERT "IP: %pS\n", (void *)regs->ip);
dump_pagetable(address);
@@ -861,7 +860,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
if (!printk_ratelimit())
return;
- printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
+ printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
tsk->comm, task_pid_nr(tsk), address,
(void *)regs->ip, (void *)regs->sp, error_code);
@@ -1256,8 +1255,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* Detect and handle instructions that would cause a page fault for
* both a tracked kernel page and a userspace page.
*/
- if (kmemcheck_active(regs))
- kmemcheck_hide(regs);
prefetchw(&mm->mmap_sem);
if (unlikely(kmmio_fault(regs, address)))
@@ -1280,9 +1277,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
if (vmalloc_fault(address) >= 0)
return;
-
- if (kmemcheck_fault(regs, address, error_code))
- return;
}
/* Can handle a stale RO->RW TLB: */
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index a22c2b95e513..8ca324d07282 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -20,6 +20,7 @@
#include <asm/kaslr.h>
#include <asm/hypervisor.h>
#include <asm/cpufeature.h>
+#include <asm/pti.h>
/*
* We need to define the tracepoints somewhere, and tlb.c
@@ -92,8 +93,7 @@ __ref void *alloc_low_pages(unsigned int num)
unsigned int order;
order = get_order((unsigned long)num << PAGE_SHIFT);
- return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK |
- __GFP_ZERO, order);
+ return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order);
}
if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
@@ -161,15 +161,20 @@ struct map_range {
static int page_size_mask;
+static void enable_global_pages(void)
+{
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ __supported_pte_mask |= _PAGE_GLOBAL;
+}
+
static void __init probe_page_size_mask(void)
{
/*
- * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will
- * use small pages.
+ * For pagealloc debugging, identity mapping will use small pages.
* This will simplify cpa(), which otherwise needs to support splitting
* large pages into small in interrupt context, etc.
*/
- if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled() && !IS_ENABLED(CONFIG_KMEMCHECK))
+ if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled())
page_size_mask |= 1 << PG_LEVEL_2M;
else
direct_gbpages = 0;
@@ -179,11 +184,11 @@ static void __init probe_page_size_mask(void)
cr4_set_bits_and_update_boot(X86_CR4_PSE);
/* Enable PGE if available */
+ __supported_pte_mask &= ~_PAGE_GLOBAL;
if (boot_cpu_has(X86_FEATURE_PGE)) {
cr4_set_bits_and_update_boot(X86_CR4_PGE);
- __supported_pte_mask |= _PAGE_GLOBAL;
- } else
- __supported_pte_mask &= ~_PAGE_GLOBAL;
+ enable_global_pages();
+ }
/* Enable 1 GB linear kernel mappings if available: */
if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) {
@@ -196,34 +201,44 @@ static void __init probe_page_size_mask(void)
static void setup_pcid(void)
{
-#ifdef CONFIG_X86_64
- if (boot_cpu_has(X86_FEATURE_PCID)) {
- if (boot_cpu_has(X86_FEATURE_PGE)) {
- /*
- * This can't be cr4_set_bits_and_update_boot() --
- * the trampoline code can't handle CR4.PCIDE and
- * it wouldn't do any good anyway. Despite the name,
- * cr4_set_bits_and_update_boot() doesn't actually
- * cause the bits in question to remain set all the
- * way through the secondary boot asm.
- *
- * Instead, we brute-force it and set CR4.PCIDE
- * manually in start_secondary().
- */
- cr4_set_bits(X86_CR4_PCIDE);
- } else {
- /*
- * flush_tlb_all(), as currently implemented, won't
- * work if PCID is on but PGE is not. Since that
- * combination doesn't exist on real hardware, there's
- * no reason to try to fully support it, but it's
- * polite to avoid corrupting data if we're on
- * an improperly configured VM.
- */
- setup_clear_cpu_cap(X86_FEATURE_PCID);
- }
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
+
+ if (!boot_cpu_has(X86_FEATURE_PCID))
+ return;
+
+ if (boot_cpu_has(X86_FEATURE_PGE)) {
+ /*
+ * This can't be cr4_set_bits_and_update_boot() -- the
+ * trampoline code can't handle CR4.PCIDE and it wouldn't
+ * do any good anyway. Despite the name,
+ * cr4_set_bits_and_update_boot() doesn't actually cause
+ * the bits in question to remain set all the way through
+ * the secondary boot asm.
+ *
+ * Instead, we brute-force it and set CR4.PCIDE manually in
+ * start_secondary().
+ */
+ cr4_set_bits(X86_CR4_PCIDE);
+
+ /*
+ * INVPCID's single-context modes (2/3) only work if we set
+ * X86_CR4_PCIDE, *and* we INVPCID support. It's unusable
+ * on systems that have X86_CR4_PCIDE clear, or that have
+ * no INVPCID support at all.
+ */
+ if (boot_cpu_has(X86_FEATURE_INVPCID))
+ setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE);
+ } else {
+ /*
+ * flush_tlb_all(), as currently implemented, won't work if
+ * PCID is on but PGE is not. Since that combination
+ * doesn't exist on real hardware, there's no reason to try
+ * to fully support it, but it's polite to avoid corrupting
+ * data if we're on an improperly configured VM.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
}
-#endif
}
#ifdef CONFIG_X86_32
@@ -624,6 +639,7 @@ void __init init_mem_mapping(void)
{
unsigned long end;
+ pti_check_boottime_disable();
probe_page_size_mask();
setup_pcid();
@@ -847,7 +863,7 @@ void __init zone_sizes_init(void)
free_area_init_nodes(max_zone_pfns);
}
-DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
.loaded_mm = &init_mm,
.next_asid = 1,
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 8a64a6f2848d..135c9a7898c7 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -50,6 +50,7 @@
#include <asm/setup.h>
#include <asm/set_memory.h>
#include <asm/page_types.h>
+#include <asm/cpu_entry_area.h>
#include <asm/init.h>
#include "mm_internal.h"
@@ -766,6 +767,7 @@ void __init mem_init(void)
mem_init_print_info(NULL);
printk(KERN_INFO "virtual kernel memory layout:\n"
" fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
+ " cpu_entry : 0x%08lx - 0x%08lx (%4ld kB)\n"
#ifdef CONFIG_HIGHMEM
" pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
#endif
@@ -777,6 +779,10 @@ void __init mem_init(void)
FIXADDR_START, FIXADDR_TOP,
(FIXADDR_TOP - FIXADDR_START) >> 10,
+ CPU_ENTRY_AREA_BASE,
+ CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE,
+ CPU_ENTRY_AREA_MAP_SIZE >> 10,
+
#ifdef CONFIG_HIGHMEM
PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
(LAST_PKMAP*PAGE_SIZE) >> 10,
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index adcea90a2046..4a837289f2ad 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -184,7 +184,7 @@ static __ref void *spp_getpage(void)
void *ptr;
if (after_bootmem)
- ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
+ ptr = (void *) get_zeroed_page(GFP_ATOMIC);
else
ptr = alloc_bootmem_pages(PAGE_SIZE);
@@ -1173,12 +1173,18 @@ void __init mem_init(void)
/* clear_bss() already clear the empty_zero_page */
- register_page_bootmem_info();
-
/* this will put all memory onto the freelists */
free_all_bootmem();
after_bootmem = 1;
+ /*
+ * Must be done after boot memory is put on freelist, because here we
+ * might set fields in deferred struct pages that have not yet been
+ * initialized, and free_all_bootmem() initializes all the reserved
+ * deferred pages for us.
+ */
+ register_page_bootmem_info();
+
/* Register memory areas for /proc/kcore */
kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR,
PAGE_SIZE, KCORE_OTHER);
@@ -1399,7 +1405,6 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
vmemmap_verify((pte_t *)pmd, node, addr, next);
continue;
}
- pr_warn_once("vmemmap: falling back to regular page backing\n");
if (vmemmap_populate_basepages(addr, next, node))
return -ENOMEM;
}
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 99dfed6dfef8..47388f0c0e59 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -15,6 +15,7 @@
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/pgtable.h>
+#include <asm/cpu_entry_area.h>
extern struct range pfn_mapped[E820_MAX_ENTRIES];
@@ -277,6 +278,7 @@ void __init kasan_early_init(void)
void __init kasan_init(void)
{
int i;
+ void *shadow_cpu_entry_begin, *shadow_cpu_entry_end;
#ifdef CONFIG_KASAN_INLINE
register_die_notifier(&kasan_die_notifier);
@@ -321,16 +323,33 @@ void __init kasan_init(void)
map_range(&pfn_mapped[i]);
}
+ shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE;
+ shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin);
+ shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin,
+ PAGE_SIZE);
+
+ shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE +
+ CPU_ENTRY_AREA_MAP_SIZE);
+ shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end);
+ shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end,
+ PAGE_SIZE);
+
kasan_populate_zero_shadow(
kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
- kasan_mem_to_shadow((void *)__START_KERNEL_map));
+ shadow_cpu_entry_begin);
+
+ kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin,
+ (unsigned long)shadow_cpu_entry_end, 0);
+
+ kasan_populate_zero_shadow(shadow_cpu_entry_end,
+ kasan_mem_to_shadow((void *)__START_KERNEL_map));
kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext),
(unsigned long)kasan_mem_to_shadow(_end),
early_pfn_to_nid(__pa(_stext)));
kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
- (void *)KASAN_SHADOW_END);
+ (void *)KASAN_SHADOW_END);
load_cr3(init_top_pgt);
__flush_tlb_all();
diff --git a/arch/x86/mm/kmemcheck/Makefile b/arch/x86/mm/kmemcheck/Makefile
deleted file mode 100644
index 520b3bce4095..000000000000
--- a/arch/x86/mm/kmemcheck/Makefile
+++ /dev/null
@@ -1 +0,0 @@
-obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
deleted file mode 100644
index 872ec4159a68..000000000000
--- a/arch/x86/mm/kmemcheck/error.c
+++ /dev/null
@@ -1,228 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/interrupt.h>
-#include <linux/kdebug.h>
-#include <linux/kmemcheck.h>
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/stacktrace.h>
-#include <linux/string.h>
-
-#include "error.h"
-#include "shadow.h"
-
-enum kmemcheck_error_type {
- KMEMCHECK_ERROR_INVALID_ACCESS,
- KMEMCHECK_ERROR_BUG,
-};
-
-#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT)
-
-struct kmemcheck_error {
- enum kmemcheck_error_type type;
-
- union {
- /* KMEMCHECK_ERROR_INVALID_ACCESS */
- struct {
- /* Kind of access that caused the error */
- enum kmemcheck_shadow state;
- /* Address and size of the erroneous read */
- unsigned long address;
- unsigned int size;
- };
- };
-
- struct pt_regs regs;
- struct stack_trace trace;
- unsigned long trace_entries[32];
-
- /* We compress it to a char. */
- unsigned char shadow_copy[SHADOW_COPY_SIZE];
- unsigned char memory_copy[SHADOW_COPY_SIZE];
-};
-
-/*
- * Create a ring queue of errors to output. We can't call printk() directly
- * from the kmemcheck traps, since this may call the console drivers and
- * result in a recursive fault.
- */
-static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE];
-static unsigned int error_count;
-static unsigned int error_rd;
-static unsigned int error_wr;
-static unsigned int error_missed_count;
-
-static struct kmemcheck_error *error_next_wr(void)
-{
- struct kmemcheck_error *e;
-
- if (error_count == ARRAY_SIZE(error_fifo)) {
- ++error_missed_count;
- return NULL;
- }
-
- e = &error_fifo[error_wr];
- if (++error_wr == ARRAY_SIZE(error_fifo))
- error_wr = 0;
- ++error_count;
- return e;
-}
-
-static struct kmemcheck_error *error_next_rd(void)
-{
- struct kmemcheck_error *e;
-
- if (error_count == 0)
- return NULL;
-
- e = &error_fifo[error_rd];
- if (++error_rd == ARRAY_SIZE(error_fifo))
- error_rd = 0;
- --error_count;
- return e;
-}
-
-void kmemcheck_error_recall(void)
-{
- static const char *desc[] = {
- [KMEMCHECK_SHADOW_UNALLOCATED] = "unallocated",
- [KMEMCHECK_SHADOW_UNINITIALIZED] = "uninitialized",
- [KMEMCHECK_SHADOW_INITIALIZED] = "initialized",
- [KMEMCHECK_SHADOW_FREED] = "freed",
- };
-
- static const char short_desc[] = {
- [KMEMCHECK_SHADOW_UNALLOCATED] = 'a',
- [KMEMCHECK_SHADOW_UNINITIALIZED] = 'u',
- [KMEMCHECK_SHADOW_INITIALIZED] = 'i',
- [KMEMCHECK_SHADOW_FREED] = 'f',
- };
-
- struct kmemcheck_error *e;
- unsigned int i;
-
- e = error_next_rd();
- if (!e)
- return;
-
- switch (e->type) {
- case KMEMCHECK_ERROR_INVALID_ACCESS:
- printk(KERN_WARNING "WARNING: kmemcheck: Caught %d-bit read from %s memory (%p)\n",
- 8 * e->size, e->state < ARRAY_SIZE(desc) ?
- desc[e->state] : "(invalid shadow state)",
- (void *) e->address);
-
- printk(KERN_WARNING);
- for (i = 0; i < SHADOW_COPY_SIZE; ++i)
- printk(KERN_CONT "%02x", e->memory_copy[i]);
- printk(KERN_CONT "\n");
-
- printk(KERN_WARNING);
- for (i = 0; i < SHADOW_COPY_SIZE; ++i) {
- if (e->shadow_copy[i] < ARRAY_SIZE(short_desc))
- printk(KERN_CONT " %c", short_desc[e->shadow_copy[i]]);
- else
- printk(KERN_CONT " ?");
- }
- printk(KERN_CONT "\n");
- printk(KERN_WARNING "%*c\n", 2 + 2
- * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^');
- break;
- case KMEMCHECK_ERROR_BUG:
- printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n");
- break;
- }
-
- __show_regs(&e->regs, 1);
- print_stack_trace(&e->trace, 0);
-}
-
-static void do_wakeup(unsigned long data)
-{
- while (error_count > 0)
- kmemcheck_error_recall();
-
- if (error_missed_count > 0) {
- printk(KERN_WARNING "kmemcheck: Lost %d error reports because "
- "the queue was too small\n", error_missed_count);
- error_missed_count = 0;
- }
-}
-
-static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0);
-
-/*
- * Save the context of an error report.
- */
-void kmemcheck_error_save(enum kmemcheck_shadow state,
- unsigned long address, unsigned int size, struct pt_regs *regs)
-{
- static unsigned long prev_ip;
-
- struct kmemcheck_error *e;
- void *shadow_copy;
- void *memory_copy;
-
- /* Don't report several adjacent errors from the same EIP. */
- if (regs->ip == prev_ip)
- return;
- prev_ip = regs->ip;
-
- e = error_next_wr();
- if (!e)
- return;
-
- e->type = KMEMCHECK_ERROR_INVALID_ACCESS;
-
- e->state = state;
- e->address = address;
- e->size = size;
-
- /* Save regs */
- memcpy(&e->regs, regs, sizeof(*regs));
-
- /* Save stack trace */
- e->trace.nr_entries = 0;
- e->trace.entries = e->trace_entries;
- e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
- e->trace.skip = 0;
- save_stack_trace_regs(regs, &e->trace);
-
- /* Round address down to nearest 16 bytes */
- shadow_copy = kmemcheck_shadow_lookup(address
- & ~(SHADOW_COPY_SIZE - 1));
- BUG_ON(!shadow_copy);
-
- memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE);
-
- kmemcheck_show_addr(address);
- memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1));
- memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE);
- kmemcheck_hide_addr(address);
-
- tasklet_hi_schedule_first(&kmemcheck_tasklet);
-}
-
-/*
- * Save the context of a kmemcheck bug.
- */
-void kmemcheck_error_save_bug(struct pt_regs *regs)
-{
- struct kmemcheck_error *e;
-
- e = error_next_wr();
- if (!e)
- return;
-
- e->type = KMEMCHECK_ERROR_BUG;
-
- memcpy(&e->regs, regs, sizeof(*regs));
-
- e->trace.nr_entries = 0;
- e->trace.entries = e->trace_entries;
- e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
- e->trace.skip = 1;
- save_stack_trace(&e->trace);
-
- tasklet_hi_schedule_first(&kmemcheck_tasklet);
-}
diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h
deleted file mode 100644
index 39f80d7a874d..000000000000
--- a/arch/x86/mm/kmemcheck/error.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H
-#define ARCH__X86__MM__KMEMCHECK__ERROR_H
-
-#include <linux/ptrace.h>
-
-#include "shadow.h"
-
-void kmemcheck_error_save(enum kmemcheck_shadow state,
- unsigned long address, unsigned int size, struct pt_regs *regs);
-
-void kmemcheck_error_save_bug(struct pt_regs *regs);
-
-void kmemcheck_error_recall(void);
-
-#endif
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
deleted file mode 100644
index 4515bae36bbe..000000000000
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ /dev/null
@@ -1,658 +0,0 @@
-/**
- * kmemcheck - a heavyweight memory checker for the linux kernel
- * Copyright (C) 2007, 2008 Vegard Nossum <vegardno@ifi.uio.no>
- * (With a lot of help from Ingo Molnar and Pekka Enberg.)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License (version 2) as
- * published by the Free Software Foundation.
- */
-
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/kallsyms.h>
-#include <linux/kernel.h>
-#include <linux/kmemcheck.h>
-#include <linux/mm.h>
-#include <linux/page-flags.h>
-#include <linux/percpu.h>
-#include <linux/ptrace.h>
-#include <linux/string.h>
-#include <linux/types.h>
-
-#include <asm/cacheflush.h>
-#include <asm/kmemcheck.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-
-#include "error.h"
-#include "opcode.h"
-#include "pte.h"
-#include "selftest.h"
-#include "shadow.h"
-
-
-#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT
-# define KMEMCHECK_ENABLED 0
-#endif
-
-#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT
-# define KMEMCHECK_ENABLED 1
-#endif
-
-#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT
-# define KMEMCHECK_ENABLED 2
-#endif
-
-int kmemcheck_enabled = KMEMCHECK_ENABLED;
-
-int __init kmemcheck_init(void)
-{
-#ifdef CONFIG_SMP
- /*
- * Limit SMP to use a single CPU. We rely on the fact that this code
- * runs before SMP is set up.
- */
- if (setup_max_cpus > 1) {
- printk(KERN_INFO
- "kmemcheck: Limiting number of CPUs to 1.\n");
- setup_max_cpus = 1;
- }
-#endif
-
- if (!kmemcheck_selftest()) {
- printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n");
- kmemcheck_enabled = 0;
- return -EINVAL;
- }
-
- printk(KERN_INFO "kmemcheck: Initialized\n");
- return 0;
-}
-
-early_initcall(kmemcheck_init);
-
-/*
- * We need to parse the kmemcheck= option before any memory is allocated.
- */
-static int __init param_kmemcheck(char *str)
-{
- int val;
- int ret;
-
- if (!str)
- return -EINVAL;
-
- ret = kstrtoint(str, 0, &val);
- if (ret)
- return ret;
- kmemcheck_enabled = val;
- return 0;
-}
-
-early_param("kmemcheck", param_kmemcheck);
-
-int kmemcheck_show_addr(unsigned long address)
-{
- pte_t *pte;
-
- pte = kmemcheck_pte_lookup(address);
- if (!pte)
- return 0;
-
- set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
- __flush_tlb_one(address);
- return 1;
-}
-
-int kmemcheck_hide_addr(unsigned long address)
-{
- pte_t *pte;
-
- pte = kmemcheck_pte_lookup(address);
- if (!pte)
- return 0;
-
- set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
- __flush_tlb_one(address);
- return 1;
-}
-
-struct kmemcheck_context {
- bool busy;
- int balance;
-
- /*
- * There can be at most two memory operands to an instruction, but
- * each address can cross a page boundary -- so we may need up to
- * four addresses that must be hidden/revealed for each fault.
- */
- unsigned long addr[4];
- unsigned long n_addrs;
- unsigned long flags;
-
- /* Data size of the instruction that caused a fault. */
- unsigned int size;
-};
-
-static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context);
-
-bool kmemcheck_active(struct pt_regs *regs)
-{
- struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-
- return data->balance > 0;
-}
-
-/* Save an address that needs to be shown/hidden */
-static void kmemcheck_save_addr(unsigned long addr)
-{
- struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-
- BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr));
- data->addr[data->n_addrs++] = addr;
-}
-
-static unsigned int kmemcheck_show_all(void)
-{
- struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
- unsigned int i;
- unsigned int n;
-
- n = 0;
- for (i = 0; i < data->n_addrs; ++i)
- n += kmemcheck_show_addr(data->addr[i]);
-
- return n;
-}
-
-static unsigned int kmemcheck_hide_all(void)
-{
- struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
- unsigned int i;
- unsigned int n;
-
- n = 0;
- for (i = 0; i < data->n_addrs; ++i)
- n += kmemcheck_hide_addr(data->addr[i]);
-
- return n;
-}
-
-/*
- * Called from the #PF handler.
- */
-void kmemcheck_show(struct pt_regs *regs)
-{
- struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-
- BUG_ON(!irqs_disabled());
-
- if (unlikely(data->balance != 0)) {
- kmemcheck_show_all();
- kmemcheck_error_save_bug(regs);
- data->balance = 0;
- return;
- }
-
- /*
- * None of the addresses actually belonged to kmemcheck. Note that
- * this is not an error.
- */
- if (kmemcheck_show_all() == 0)
- return;
-
- ++data->balance;
-
- /*
- * The IF needs to be cleared as well, so that the faulting
- * instruction can run "uninterrupted". Otherwise, we might take
- * an interrupt and start executing that before we've had a chance
- * to hide the page again.
- *
- * NOTE: In the rare case of multiple faults, we must not override
- * the original flags:
- */
- if (!(regs->flags & X86_EFLAGS_TF))
- data->flags = regs->flags;
-
- regs->flags |= X86_EFLAGS_TF;
- regs->flags &= ~X86_EFLAGS_IF;
-}
-
-/*
- * Called from the #DB handler.
- */
-void kmemcheck_hide(struct pt_regs *regs)
-{
- struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
- int n;
-
- BUG_ON(!irqs_disabled());
-
- if (unlikely(data->balance != 1)) {
- kmemcheck_show_all();
- kmemcheck_error_save_bug(regs);
- data->n_addrs = 0;
- data->balance = 0;
-
- if (!(data->flags & X86_EFLAGS_TF))
- regs->flags &= ~X86_EFLAGS_TF;
- if (data->flags & X86_EFLAGS_IF)
- regs->flags |= X86_EFLAGS_IF;
- return;
- }
-
- if (kmemcheck_enabled)
- n = kmemcheck_hide_all();
- else
- n = kmemcheck_show_all();
-
- if (n == 0)
- return;
-
- --data->balance;
-
- data->n_addrs = 0;
-
- if (!(data->flags & X86_EFLAGS_TF))
- regs->flags &= ~X86_EFLAGS_TF;
- if (data->flags & X86_EFLAGS_IF)
- regs->flags |= X86_EFLAGS_IF;
-}
-
-void kmemcheck_show_pages(struct page *p, unsigned int n)
-{
- unsigned int i;
-
- for (i = 0; i < n; ++i) {
- unsigned long address;
- pte_t *pte;
- unsigned int level;
-
- address = (unsigned long) page_address(&p[i]);
- pte = lookup_address(address, &level);
- BUG_ON(!pte);
- BUG_ON(level != PG_LEVEL_4K);
-
- set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
- set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN));
- __flush_tlb_one(address);
- }
-}
-
-bool kmemcheck_page_is_tracked(struct page *p)
-{
- /* This will also check the "hidden" flag of the PTE. */
- return kmemcheck_pte_lookup((unsigned long) page_address(p));
-}
-
-void kmemcheck_hide_pages(struct page *p, unsigned int n)
-{
- unsigned int i;
-
- for (i = 0; i < n; ++i) {
- unsigned long address;
- pte_t *pte;
- unsigned int level;
-
- address = (unsigned long) page_address(&p[i]);
- pte = lookup_address(address, &level);
- BUG_ON(!pte);
- BUG_ON(level != PG_LEVEL_4K);
-
- set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
- set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN));
- __flush_tlb_one(address);
- }
-}
-
-/* Access may NOT cross page boundary */
-static void kmemcheck_read_strict(struct pt_regs *regs,
- unsigned long addr, unsigned int size)
-{
- void *shadow;
- enum kmemcheck_shadow status;
-
- shadow = kmemcheck_shadow_lookup(addr);
- if (!shadow)
- return;
-
- kmemcheck_save_addr(addr);
- status = kmemcheck_shadow_test(shadow, size);
- if (status == KMEMCHECK_SHADOW_INITIALIZED)
- return;
-
- if (kmemcheck_enabled)
- kmemcheck_error_save(status, addr, size, regs);
-
- if (kmemcheck_enabled == 2)
- kmemcheck_enabled = 0;
-
- /* Don't warn about it again. */
- kmemcheck_shadow_set(shadow, size);
-}
-
-bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size)
-{
- enum kmemcheck_shadow status;
- void *shadow;
-
- shadow = kmemcheck_shadow_lookup(addr);
- if (!shadow)
- return true;
-
- status = kmemcheck_shadow_test_all(shadow, size);
-
- return status == KMEMCHECK_SHADOW_INITIALIZED;
-}
-
-/* Access may cross page boundary */
-static void kmemcheck_read(struct pt_regs *regs,
- unsigned long addr, unsigned int size)
-{
- unsigned long page = addr & PAGE_MASK;
- unsigned long next_addr = addr + size - 1;
- unsigned long next_page = next_addr & PAGE_MASK;
-
- if (likely(page == next_page)) {
- kmemcheck_read_strict(regs, addr, size);
- return;
- }
-
- /*
- * What we do is basically to split the access across the
- * two pages and handle each part separately. Yes, this means
- * that we may now see reads that are 3 + 5 bytes, for
- * example (and if both are uninitialized, there will be two
- * reports), but it makes the code a lot simpler.
- */
- kmemcheck_read_strict(regs, addr, next_page - addr);
- kmemcheck_read_strict(regs, next_page, next_addr - next_page);
-}
-
-static void kmemcheck_write_strict(struct pt_regs *regs,
- unsigned long addr, unsigned int size)
-{
- void *shadow;
-
- shadow = kmemcheck_shadow_lookup(addr);
- if (!shadow)
- return;
-
- kmemcheck_save_addr(addr);
- kmemcheck_shadow_set(shadow, size);
-}
-
-static void kmemcheck_write(struct pt_regs *regs,
- unsigned long addr, unsigned int size)
-{
- unsigned long page = addr & PAGE_MASK;
- unsigned long next_addr = addr + size - 1;
- unsigned long next_page = next_addr & PAGE_MASK;
-
- if (likely(page == next_page)) {
- kmemcheck_write_strict(regs, addr, size);
- return;
- }
-
- /* See comment in kmemcheck_read(). */
- kmemcheck_write_strict(regs, addr, next_page - addr);
- kmemcheck_write_strict(regs, next_page, next_addr - next_page);
-}
-
-/*
- * Copying is hard. We have two addresses, each of which may be split across
- * a page (and each page will have different shadow addresses).
- */
-static void kmemcheck_copy(struct pt_regs *regs,
- unsigned long src_addr, unsigned long dst_addr, unsigned int size)
-{
- uint8_t shadow[8];
- enum kmemcheck_shadow status;
-
- unsigned long page;
- unsigned long next_addr;
- unsigned long next_page;
-
- uint8_t *x;
- unsigned int i;
- unsigned int n;
-
- BUG_ON(size > sizeof(shadow));
-
- page = src_addr & PAGE_MASK;
- next_addr = src_addr + size - 1;
- next_page = next_addr & PAGE_MASK;
-
- if (likely(page == next_page)) {
- /* Same page */
- x = kmemcheck_shadow_lookup(src_addr);
- if (x) {
- kmemcheck_save_addr(src_addr);
- for (i = 0; i < size; ++i)
- shadow[i] = x[i];
- } else {
- for (i = 0; i < size; ++i)
- shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
- }
- } else {
- n = next_page - src_addr;
- BUG_ON(n > sizeof(shadow));
-
- /* First page */
- x = kmemcheck_shadow_lookup(src_addr);
- if (x) {
- kmemcheck_save_addr(src_addr);
- for (i = 0; i < n; ++i)
- shadow[i] = x[i];
- } else {
- /* Not tracked */
- for (i = 0; i < n; ++i)
- shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
- }
-
- /* Second page */
- x = kmemcheck_shadow_lookup(next_page);
- if (x) {
- kmemcheck_save_addr(next_page);
- for (i = n; i < size; ++i)
- shadow[i] = x[i - n];
- } else {
- /* Not tracked */
- for (i = n; i < size; ++i)
- shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
- }
- }
-
- page = dst_addr & PAGE_MASK;
- next_addr = dst_addr + size - 1;
- next_page = next_addr & PAGE_MASK;
-
- if (likely(page == next_page)) {
- /* Same page */
- x = kmemcheck_shadow_lookup(dst_addr);
- if (x) {
- kmemcheck_save_addr(dst_addr);
- for (i = 0; i < size; ++i) {
- x[i] = shadow[i];
- shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
- }
- }
- } else {
- n = next_page - dst_addr;
- BUG_ON(n > sizeof(shadow));
-
- /* First page */
- x = kmemcheck_shadow_lookup(dst_addr);
- if (x) {
- kmemcheck_save_addr(dst_addr);
- for (i = 0; i < n; ++i) {
- x[i] = shadow[i];
- shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
- }
- }
-
- /* Second page */
- x = kmemcheck_shadow_lookup(next_page);
- if (x) {
- kmemcheck_save_addr(next_page);
- for (i = n; i < size; ++i) {
- x[i - n] = shadow[i];
- shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
- }
- }
- }
-
- status = kmemcheck_shadow_test(shadow, size);
- if (status == KMEMCHECK_SHADOW_INITIALIZED)
- return;
-
- if (kmemcheck_enabled)
- kmemcheck_error_save(status, src_addr, size, regs);
-
- if (kmemcheck_enabled == 2)
- kmemcheck_enabled = 0;
-}
-
-enum kmemcheck_method {
- KMEMCHECK_READ,
- KMEMCHECK_WRITE,
-};
-
-static void kmemcheck_access(struct pt_regs *regs,
- unsigned long fallback_address, enum kmemcheck_method fallback_method)
-{
- const uint8_t *insn;
- const uint8_t *insn_primary;
- unsigned int size;
-
- struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-
- /* Recursive fault -- ouch. */
- if (data->busy) {
- kmemcheck_show_addr(fallback_address);
- kmemcheck_error_save_bug(regs);
- return;
- }
-
- data->busy = true;
-
- insn = (const uint8_t *) regs->ip;
- insn_primary = kmemcheck_opcode_get_primary(insn);
-
- kmemcheck_opcode_decode(insn, &size);
-
- switch (insn_primary[0]) {
-#ifdef CONFIG_KMEMCHECK_BITOPS_OK
- /* AND, OR, XOR */
- /*
- * Unfortunately, these instructions have to be excluded from
- * our regular checking since they access only some (and not
- * all) bits. This clears out "bogus" bitfield-access warnings.
- */
- case 0x80:
- case 0x81:
- case 0x82:
- case 0x83:
- switch ((insn_primary[1] >> 3) & 7) {
- /* OR */
- case 1:
- /* AND */
- case 4:
- /* XOR */
- case 6:
- kmemcheck_write(regs, fallback_address, size);
- goto out;
-
- /* ADD */
- case 0:
- /* ADC */
- case 2:
- /* SBB */
- case 3:
- /* SUB */
- case 5:
- /* CMP */
- case 7:
- break;
- }
- break;
-#endif
-
- /* MOVS, MOVSB, MOVSW, MOVSD */
- case 0xa4:
- case 0xa5:
- /*
- * These instructions are special because they take two
- * addresses, but we only get one page fault.
- */
- kmemcheck_copy(regs, regs->si, regs->di, size);
- goto out;
-
- /* CMPS, CMPSB, CMPSW, CMPSD */
- case 0xa6:
- case 0xa7:
- kmemcheck_read(regs, regs->si, size);
- kmemcheck_read(regs, regs->di, size);
- goto out;
- }
-
- /*
- * If the opcode isn't special in any way, we use the data from the
- * page fault handler to determine the address and type of memory
- * access.
- */
- switch (fallback_method) {
- case KMEMCHECK_READ:
- kmemcheck_read(regs, fallback_address, size);
- goto out;
- case KMEMCHECK_WRITE:
- kmemcheck_write(regs, fallback_address, size);
- goto out;
- }
-
-out:
- data->busy = false;
-}
-
-bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
- unsigned long error_code)
-{
- pte_t *pte;
-
- /*
- * XXX: Is it safe to assume that memory accesses from virtual 86
- * mode or non-kernel code segments will _never_ access kernel
- * memory (e.g. tracked pages)? For now, we need this to avoid
- * invoking kmemcheck for PnP BIOS calls.
- */
- if (regs->flags & X86_VM_MASK)
- return false;
- if (regs->cs != __KERNEL_CS)
- return false;
-
- pte = kmemcheck_pte_lookup(address);
- if (!pte)
- return false;
-
- WARN_ON_ONCE(in_nmi());
-
- if (error_code & 2)
- kmemcheck_access(regs, address, KMEMCHECK_WRITE);
- else
- kmemcheck_access(regs, address, KMEMCHECK_READ);
-
- kmemcheck_show(regs);
- return true;
-}
-
-bool kmemcheck_trap(struct pt_regs *regs)
-{
- if (!kmemcheck_active(regs))
- return false;
-
- /* We're done. */
- kmemcheck_hide(regs);
- return true;
-}
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
deleted file mode 100644
index df8109ddf7fe..000000000000
--- a/arch/x86/mm/kmemcheck/opcode.c
+++ /dev/null
@@ -1,107 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/types.h>
-
-#include "opcode.h"
-
-static bool opcode_is_prefix(uint8_t b)
-{
- return
- /* Group 1 */
- b == 0xf0 || b == 0xf2 || b == 0xf3
- /* Group 2 */
- || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
- || b == 0x64 || b == 0x65
- /* Group 3 */
- || b == 0x66
- /* Group 4 */
- || b == 0x67;
-}
-
-#ifdef CONFIG_X86_64
-static bool opcode_is_rex_prefix(uint8_t b)
-{
- return (b & 0xf0) == 0x40;
-}
-#else
-static bool opcode_is_rex_prefix(uint8_t b)
-{
- return false;
-}
-#endif
-
-#define REX_W (1 << 3)
-
-/*
- * This is a VERY crude opcode decoder. We only need to find the size of the
- * load/store that caused our #PF and this should work for all the opcodes
- * that we care about. Moreover, the ones who invented this instruction set
- * should be shot.
- */
-void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size)
-{
- /* Default operand size */
- int operand_size_override = 4;
-
- /* prefixes */
- for (; opcode_is_prefix(*op); ++op) {
- if (*op == 0x66)
- operand_size_override = 2;
- }
-
- /* REX prefix */
- if (opcode_is_rex_prefix(*op)) {
- uint8_t rex = *op;
-
- ++op;
- if (rex & REX_W) {
- switch (*op) {
- case 0x63:
- *size = 4;
- return;
- case 0x0f:
- ++op;
-
- switch (*op) {
- case 0xb6:
- case 0xbe:
- *size = 1;
- return;
- case 0xb7:
- case 0xbf:
- *size = 2;
- return;
- }
-
- break;
- }
-
- *size = 8;
- return;
- }
- }
-
- /* escape opcode */
- if (*op == 0x0f) {
- ++op;
-
- /*
- * This is move with zero-extend and sign-extend, respectively;
- * we don't have to think about 0xb6/0xbe, because this is
- * already handled in the conditional below.
- */
- if (*op == 0xb7 || *op == 0xbf)
- operand_size_override = 2;
- }
-
- *size = (*op & 1) ? operand_size_override : 1;
-}
-
-const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op)
-{
- /* skip prefixes */
- while (opcode_is_prefix(*op))
- ++op;
- if (opcode_is_rex_prefix(*op))
- ++op;
- return op;
-}
diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h
deleted file mode 100644
index 51a1ce94c24a..000000000000
--- a/arch/x86/mm/kmemcheck/opcode.h
+++ /dev/null
@@ -1,10 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H
-#define ARCH__X86__MM__KMEMCHECK__OPCODE_H
-
-#include <linux/types.h>
-
-void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size);
-const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op);
-
-#endif
diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c
deleted file mode 100644
index 8a03be90272a..000000000000
--- a/arch/x86/mm/kmemcheck/pte.c
+++ /dev/null
@@ -1,23 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/mm.h>
-
-#include <asm/pgtable.h>
-
-#include "pte.h"
-
-pte_t *kmemcheck_pte_lookup(unsigned long address)
-{
- pte_t *pte;
- unsigned int level;
-
- pte = lookup_address(address, &level);
- if (!pte)
- return NULL;
- if (level != PG_LEVEL_4K)
- return NULL;
- if (!pte_hidden(*pte))
- return NULL;
-
- return pte;
-}
-
diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h
deleted file mode 100644
index b595612382c2..000000000000
--- a/arch/x86/mm/kmemcheck/pte.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H
-#define ARCH__X86__MM__KMEMCHECK__PTE_H
-
-#include <linux/mm.h>
-
-#include <asm/pgtable.h>
-
-pte_t *kmemcheck_pte_lookup(unsigned long address);
-
-#endif
diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c
deleted file mode 100644
index 7ce0be1f99eb..000000000000
--- a/arch/x86/mm/kmemcheck/selftest.c
+++ /dev/null
@@ -1,71 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/bug.h>
-#include <linux/kernel.h>
-
-#include "opcode.h"
-#include "selftest.h"
-
-struct selftest_opcode {
- unsigned int expected_size;
- const uint8_t *insn;
- const char *desc;
-};
-
-static const struct selftest_opcode selftest_opcodes[] = {
- /* REP MOVS */
- {1, "\xf3\xa4", "rep movsb <mem8>, <mem8>"},
- {4, "\xf3\xa5", "rep movsl <mem32>, <mem32>"},
-
- /* MOVZX / MOVZXD */
- {1, "\x66\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg16>"},
- {1, "\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg32>"},
-
- /* MOVSX / MOVSXD */
- {1, "\x66\x0f\xbe\x51\xf8", "movswq <mem8>, <reg16>"},
- {1, "\x0f\xbe\x51\xf8", "movswq <mem8>, <reg32>"},
-
-#ifdef CONFIG_X86_64
- /* MOVZX / MOVZXD */
- {1, "\x49\x0f\xb6\x51\xf8", "movzbq <mem8>, <reg64>"},
- {2, "\x49\x0f\xb7\x51\xf8", "movzbq <mem16>, <reg64>"},
-
- /* MOVSX / MOVSXD */
- {1, "\x49\x0f\xbe\x51\xf8", "movsbq <mem8>, <reg64>"},
- {2, "\x49\x0f\xbf\x51\xf8", "movsbq <mem16>, <reg64>"},
- {4, "\x49\x63\x51\xf8", "movslq <mem32>, <reg64>"},
-#endif
-};
-
-static bool selftest_opcode_one(const struct selftest_opcode *op)
-{
- unsigned size;
-
- kmemcheck_opcode_decode(op->insn, &size);
-
- if (size == op->expected_size)
- return true;
-
- printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n",
- op->desc, op->expected_size, size);
- return false;
-}
-
-static bool selftest_opcodes_all(void)
-{
- bool pass = true;
- unsigned int i;
-
- for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i)
- pass = pass && selftest_opcode_one(&selftest_opcodes[i]);
-
- return pass;
-}
-
-bool kmemcheck_selftest(void)
-{
- bool pass = true;
-
- pass = pass && selftest_opcodes_all();
-
- return pass;
-}
diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h
deleted file mode 100644
index 8d759aae453d..000000000000
--- a/arch/x86/mm/kmemcheck/selftest.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H
-#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H
-
-bool kmemcheck_selftest(void);
-
-#endif
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
deleted file mode 100644
index c2638a7d2c10..000000000000
--- a/arch/x86/mm/kmemcheck/shadow.c
+++ /dev/null
@@ -1,173 +0,0 @@
-#include <linux/kmemcheck.h>
-#include <linux/export.h>
-#include <linux/mm.h>
-
-#include <asm/page.h>
-#include <asm/pgtable.h>
-
-#include "pte.h"
-#include "shadow.h"
-
-/*
- * Return the shadow address for the given address. Returns NULL if the
- * address is not tracked.
- *
- * We need to be extremely careful not to follow any invalid pointers,
- * because this function can be called for *any* possible address.
- */
-void *kmemcheck_shadow_lookup(unsigned long address)
-{
- pte_t *pte;
- struct page *page;
-
- if (!virt_addr_valid(address))
- return NULL;
-
- pte = kmemcheck_pte_lookup(address);
- if (!pte)
- return NULL;
-
- page = virt_to_page(address);
- if (!page->shadow)
- return NULL;
- return page->shadow + (address & (PAGE_SIZE - 1));
-}
-
-static void mark_shadow(void *address, unsigned int n,
- enum kmemcheck_shadow status)
-{
- unsigned long addr = (unsigned long) address;
- unsigned long last_addr = addr + n - 1;
- unsigned long page = addr & PAGE_MASK;
- unsigned long last_page = last_addr & PAGE_MASK;
- unsigned int first_n;
- void *shadow;
-
- /* If the memory range crosses a page boundary, stop there. */
- if (page == last_page)
- first_n = n;
- else
- first_n = page + PAGE_SIZE - addr;
-
- shadow = kmemcheck_shadow_lookup(addr);
- if (shadow)
- memset(shadow, status, first_n);
-
- addr += first_n;
- n -= first_n;
-
- /* Do full-page memset()s. */
- while (n >= PAGE_SIZE) {
- shadow = kmemcheck_shadow_lookup(addr);
- if (shadow)
- memset(shadow, status, PAGE_SIZE);
-
- addr += PAGE_SIZE;
- n -= PAGE_SIZE;
- }
-
- /* Do the remaining page, if any. */
- if (n > 0) {
- shadow = kmemcheck_shadow_lookup(addr);
- if (shadow)
- memset(shadow, status, n);
- }
-}
-
-void kmemcheck_mark_unallocated(void *address, unsigned int n)
-{
- mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED);
-}
-
-void kmemcheck_mark_uninitialized(void *address, unsigned int n)
-{
- mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED);
-}
-
-/*
- * Fill the shadow memory of the given address such that the memory at that
- * address is marked as being initialized.
- */
-void kmemcheck_mark_initialized(void *address, unsigned int n)
-{
- mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED);
-}
-EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized);
-
-void kmemcheck_mark_freed(void *address, unsigned int n)
-{
- mark_shadow(address, n, KMEMCHECK_SHADOW_FREED);
-}
-
-void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n)
-{
- unsigned int i;
-
- for (i = 0; i < n; ++i)
- kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE);
-}
-
-void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n)
-{
- unsigned int i;
-
- for (i = 0; i < n; ++i)
- kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE);
-}
-
-void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n)
-{
- unsigned int i;
-
- for (i = 0; i < n; ++i)
- kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE);
-}
-
-enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
-{
-#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
- uint8_t *x;
- unsigned int i;
-
- x = shadow;
-
- /*
- * Make sure _some_ bytes are initialized. Gcc frequently generates
- * code to access neighboring bytes.
- */
- for (i = 0; i < size; ++i) {
- if (x[i] == KMEMCHECK_SHADOW_INITIALIZED)
- return x[i];
- }
-
- return x[0];
-#else
- return kmemcheck_shadow_test_all(shadow, size);
-#endif
-}
-
-enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, unsigned int size)
-{
- uint8_t *x;
- unsigned int i;
-
- x = shadow;
-
- /* All bytes must be initialized. */
- for (i = 0; i < size; ++i) {
- if (x[i] != KMEMCHECK_SHADOW_INITIALIZED)
- return x[i];
- }
-
- return x[0];
-}
-
-void kmemcheck_shadow_set(void *shadow, unsigned int size)
-{
- uint8_t *x;
- unsigned int i;
-
- x = shadow;
- for (i = 0; i < size; ++i)
- x[i] = KMEMCHECK_SHADOW_INITIALIZED;
-}
diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h
deleted file mode 100644
index 49768dc18664..000000000000
--- a/arch/x86/mm/kmemcheck/shadow.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H
-#define ARCH__X86__MM__KMEMCHECK__SHADOW_H
-
-enum kmemcheck_shadow {
- KMEMCHECK_SHADOW_UNALLOCATED,
- KMEMCHECK_SHADOW_UNINITIALIZED,
- KMEMCHECK_SHADOW_INITIALIZED,
- KMEMCHECK_SHADOW_FREED,
-};
-
-void *kmemcheck_shadow_lookup(unsigned long address);
-
-enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size);
-enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow,
- unsigned int size);
-void kmemcheck_shadow_set(void *shadow, unsigned int size);
-
-#endif
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 3fe68483463c..85cf12219dea 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -753,7 +753,7 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
if (!debug_pagealloc_enabled())
spin_unlock(&cpa_lock);
- base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
+ base = alloc_pages(GFP_KERNEL, 0);
if (!debug_pagealloc_enabled())
spin_lock(&cpa_lock);
if (!base)
@@ -904,7 +904,7 @@ static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
static int alloc_pte_page(pmd_t *pmd)
{
- pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
if (!pte)
return -1;
@@ -914,7 +914,7 @@ static int alloc_pte_page(pmd_t *pmd)
static int alloc_pmd_page(pud_t *pud)
{
- pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
if (!pmd)
return -1;
@@ -1120,7 +1120,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
pgd_entry = cpa->pgd + pgd_index(addr);
if (pgd_none(*pgd_entry)) {
- p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
if (!p4d)
return -1;
@@ -1132,7 +1132,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
*/
p4d = p4d_offset(pgd_entry, addr);
if (p4d_none(*p4d)) {
- pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
if (!pud)
return -1;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 17ebc5a978cc..004abf9ebf12 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -7,7 +7,7 @@
#include <asm/fixmap.h>
#include <asm/mtrr.h>
-#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | __GFP_ZERO)
+#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO)
#ifdef CONFIG_HIGHPTE
#define PGALLOC_USER_GFP __GFP_HIGHMEM
@@ -355,14 +355,15 @@ static inline void _pgd_free(pgd_t *pgd)
kmem_cache_free(pgd_cache, pgd);
}
#else
+
static inline pgd_t *_pgd_alloc(void)
{
- return (pgd_t *)__get_free_page(PGALLOC_GFP);
+ return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
}
static inline void _pgd_free(pgd_t *pgd)
{
- free_page((unsigned long)pgd);
+ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
}
#endif /* CONFIG_X86_PAE */
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 6b9bf023a700..c3c5274410a9 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -10,6 +10,7 @@
#include <linux/pagemap.h>
#include <linux/spinlock.h>
+#include <asm/cpu_entry_area.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
new file mode 100644
index 000000000000..bce8aea65606
--- /dev/null
+++ b/arch/x86/mm/pti.c
@@ -0,0 +1,387 @@
+/*
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * This code is based in part on work published here:
+ *
+ * https://github.com/IAIK/KAISER
+ *
+ * The original work was written by and and signed off by for the Linux
+ * kernel by:
+ *
+ * Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
+ * Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at>
+ * Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
+ * Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
+ *
+ * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com>
+ * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and
+ * Andy Lutomirsky <luto@amacapital.net>
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/bug.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+
+#include <asm/cpufeature.h>
+#include <asm/hypervisor.h>
+#include <asm/vsyscall.h>
+#include <asm/cmdline.h>
+#include <asm/pti.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/desc.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
+
+/* Backporting helper */
+#ifndef __GFP_NOTRACK
+#define __GFP_NOTRACK 0
+#endif
+
+static void __init pti_print_if_insecure(const char *reason)
+{
+ if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+ pr_info("%s\n", reason);
+}
+
+static void __init pti_print_if_secure(const char *reason)
+{
+ if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+ pr_info("%s\n", reason);
+}
+
+void __init pti_check_boottime_disable(void)
+{
+ char arg[5];
+ int ret;
+
+ if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
+ pti_print_if_insecure("disabled on XEN PV.");
+ return;
+ }
+
+ ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
+ if (ret > 0) {
+ if (ret == 3 && !strncmp(arg, "off", 3)) {
+ pti_print_if_insecure("disabled on command line.");
+ return;
+ }
+ if (ret == 2 && !strncmp(arg, "on", 2)) {
+ pti_print_if_secure("force enabled on command line.");
+ goto enable;
+ }
+ if (ret == 4 && !strncmp(arg, "auto", 4))
+ goto autosel;
+ }
+
+ if (cmdline_find_option_bool(boot_command_line, "nopti")) {
+ pti_print_if_insecure("disabled on command line.");
+ return;
+ }
+
+autosel:
+ if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+ return;
+enable:
+ setup_force_cpu_cap(X86_FEATURE_PTI);
+}
+
+pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ /*
+ * Changes to the high (kernel) portion of the kernelmode page
+ * tables are not automatically propagated to the usermode tables.
+ *
+ * Users should keep in mind that, unlike the kernelmode tables,
+ * there is no vmalloc_fault equivalent for the usermode tables.
+ * Top-level entries added to init_mm's usermode pgd after boot
+ * will not be automatically propagated to other mms.
+ */
+ if (!pgdp_maps_userspace(pgdp))
+ return pgd;
+
+ /*
+ * The user page tables get the full PGD, accessible from
+ * userspace:
+ */
+ kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd;
+
+ /*
+ * If this is normal user memory, make it NX in the kernel
+ * pagetables so that, if we somehow screw up and return to
+ * usermode with the kernel CR3 loaded, we'll get a page fault
+ * instead of allowing user code to execute with the wrong CR3.
+ *
+ * As exceptions, we don't set NX if:
+ * - _PAGE_USER is not set. This could be an executable
+ * EFI runtime mapping or something similar, and the kernel
+ * may execute from it
+ * - we don't have NX support
+ * - we're clearing the PGD (i.e. the new pgd is not present).
+ */
+ if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) &&
+ (__supported_pte_mask & _PAGE_NX))
+ pgd.pgd |= _PAGE_NX;
+
+ /* return the copy of the PGD we want the kernel to use: */
+ return pgd;
+}
+
+/*
+ * Walk the user copy of the page tables (optionally) trying to allocate
+ * page table pages on the way down.
+ *
+ * Returns a pointer to a P4D on success, or NULL on failure.
+ */
+static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
+{
+ pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
+ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+
+ if (address < PAGE_OFFSET) {
+ WARN_ONCE(1, "attempt to walk user address\n");
+ return NULL;
+ }
+
+ if (pgd_none(*pgd)) {
+ unsigned long new_p4d_page = __get_free_page(gfp);
+ if (!new_p4d_page)
+ return NULL;
+
+ if (pgd_none(*pgd)) {
+ set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
+ new_p4d_page = 0;
+ }
+ if (new_p4d_page)
+ free_page(new_p4d_page);
+ }
+ BUILD_BUG_ON(pgd_large(*pgd) != 0);
+
+ return p4d_offset(pgd, address);
+}
+
+/*
+ * Walk the user copy of the page tables (optionally) trying to allocate
+ * page table pages on the way down.
+ *
+ * Returns a pointer to a PMD on success, or NULL on failure.
+ */
+static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
+{
+ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+ p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
+ pud_t *pud;
+
+ BUILD_BUG_ON(p4d_large(*p4d) != 0);
+ if (p4d_none(*p4d)) {
+ unsigned long new_pud_page = __get_free_page(gfp);
+ if (!new_pud_page)
+ return NULL;
+
+ if (p4d_none(*p4d)) {
+ set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
+ new_pud_page = 0;
+ }
+ if (new_pud_page)
+ free_page(new_pud_page);
+ }
+
+ pud = pud_offset(p4d, address);
+ /* The user page tables do not use large mappings: */
+ if (pud_large(*pud)) {
+ WARN_ON(1);
+ return NULL;
+ }
+ if (pud_none(*pud)) {
+ unsigned long new_pmd_page = __get_free_page(gfp);
+ if (!new_pmd_page)
+ return NULL;
+
+ if (pud_none(*pud)) {
+ set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
+ new_pmd_page = 0;
+ }
+ if (new_pmd_page)
+ free_page(new_pmd_page);
+ }
+
+ return pmd_offset(pud, address);
+}
+
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
+/*
+ * Walk the shadow copy of the page tables (optionally) trying to allocate
+ * page table pages on the way down. Does not support large pages.
+ *
+ * Note: this is only used when mapping *new* kernel data into the
+ * user/shadow page tables. It is never used for userspace data.
+ *
+ * Returns a pointer to a PTE on success, or NULL on failure.
+ */
+static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
+{
+ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+ pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
+ pte_t *pte;
+
+ /* We can't do anything sensible if we hit a large mapping. */
+ if (pmd_large(*pmd)) {
+ WARN_ON(1);
+ return NULL;
+ }
+
+ if (pmd_none(*pmd)) {
+ unsigned long new_pte_page = __get_free_page(gfp);
+ if (!new_pte_page)
+ return NULL;
+
+ if (pmd_none(*pmd)) {
+ set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
+ new_pte_page = 0;
+ }
+ if (new_pte_page)
+ free_page(new_pte_page);
+ }
+
+ pte = pte_offset_kernel(pmd, address);
+ if (pte_flags(*pte) & _PAGE_USER) {
+ WARN_ONCE(1, "attempt to walk to user pte\n");
+ return NULL;
+ }
+ return pte;
+}
+
+static void __init pti_setup_vsyscall(void)
+{
+ pte_t *pte, *target_pte;
+ unsigned int level;
+
+ pte = lookup_address(VSYSCALL_ADDR, &level);
+ if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
+ return;
+
+ target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
+ if (WARN_ON(!target_pte))
+ return;
+
+ *target_pte = *pte;
+ set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir));
+}
+#else
+static void __init pti_setup_vsyscall(void) { }
+#endif
+
+static void __init
+pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
+{
+ unsigned long addr;
+
+ /*
+ * Clone the populated PMDs which cover start to end. These PMD areas
+ * can have holes.
+ */
+ for (addr = start; addr < end; addr += PMD_SIZE) {
+ pmd_t *pmd, *target_pmd;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+
+ pgd = pgd_offset_k(addr);
+ if (WARN_ON(pgd_none(*pgd)))
+ return;
+ p4d = p4d_offset(pgd, addr);
+ if (WARN_ON(p4d_none(*p4d)))
+ return;
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud))
+ continue;
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ continue;
+
+ target_pmd = pti_user_pagetable_walk_pmd(addr);
+ if (WARN_ON(!target_pmd))
+ return;
+
+ /*
+ * Copy the PMD. That is, the kernelmode and usermode
+ * tables will share the last-level page tables of this
+ * address range
+ */
+ *target_pmd = pmd_clear_flags(*pmd, clear);
+ }
+}
+
+/*
+ * Clone a single p4d (i.e. a top-level entry on 4-level systems and a
+ * next-level entry on 5-level systems.
+ */
+static void __init pti_clone_p4d(unsigned long addr)
+{
+ p4d_t *kernel_p4d, *user_p4d;
+ pgd_t *kernel_pgd;
+
+ user_p4d = pti_user_pagetable_walk_p4d(addr);
+ kernel_pgd = pgd_offset_k(addr);
+ kernel_p4d = p4d_offset(kernel_pgd, addr);
+ *user_p4d = *kernel_p4d;
+}
+
+/*
+ * Clone the CPU_ENTRY_AREA into the user space visible page table.
+ */
+static void __init pti_clone_user_shared(void)
+{
+ pti_clone_p4d(CPU_ENTRY_AREA_BASE);
+}
+
+/*
+ * Clone the ESPFIX P4D into the user space visinble page table
+ */
+static void __init pti_setup_espfix64(void)
+{
+#ifdef CONFIG_X86_ESPFIX64
+ pti_clone_p4d(ESPFIX_BASE_ADDR);
+#endif
+}
+
+/*
+ * Clone the populated PMDs of the entry and irqentry text and force it RO.
+ */
+static void __init pti_clone_entry_text(void)
+{
+ pti_clone_pmds((unsigned long) __entry_text_start,
+ (unsigned long) __irqentry_text_end, _PAGE_RW);
+}
+
+/*
+ * Initialize kernel page table isolation
+ */
+void __init pti_init(void)
+{
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ pr_info("enabled\n");
+
+ pti_clone_user_shared();
+ pti_clone_entry_text();
+ pti_setup_espfix64();
+ pti_setup_vsyscall();
+}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 3118392cdf75..a1561957dccb 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -28,6 +28,38 @@
* Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
*/
+/*
+ * We get here when we do something requiring a TLB invalidation
+ * but could not go invalidate all of the contexts. We do the
+ * necessary invalidation by clearing out the 'ctx_id' which
+ * forces a TLB flush when the context is loaded.
+ */
+void clear_asid_other(void)
+{
+ u16 asid;
+
+ /*
+ * This is only expected to be set if we have disabled
+ * kernel _PAGE_GLOBAL pages.
+ */
+ if (!static_cpu_has(X86_FEATURE_PTI)) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
+ /* Do not need to flush the current asid */
+ if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
+ continue;
+ /*
+ * Make sure the next time we go to switch to
+ * this asid, we do a flush:
+ */
+ this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
+ }
+ this_cpu_write(cpu_tlbstate.invalidate_other, false);
+}
+
atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
@@ -42,6 +74,9 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
return;
}
+ if (this_cpu_read(cpu_tlbstate.invalidate_other))
+ clear_asid_other();
+
for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
next->context.ctx_id)
@@ -65,6 +100,25 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
*need_flush = true;
}
+static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
+{
+ unsigned long new_mm_cr3;
+
+ if (need_flush) {
+ invalidate_user_asid(new_asid);
+ new_mm_cr3 = build_cr3(pgdir, new_asid);
+ } else {
+ new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
+ }
+
+ /*
+ * Caution: many callers of this function expect
+ * that load_cr3() is serializing and orders TLB
+ * fills with respect to the mm_cpumask writes.
+ */
+ write_cr3(new_mm_cr3);
+}
+
void leave_mm(int cpu)
{
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -128,7 +182,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* isn't free.
*/
#ifdef CONFIG_DEBUG_VM
- if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) {
+ if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
/*
* If we were to BUG here, we'd be very likely to kill
* the system so hard that we don't see the call trace.
@@ -195,7 +249,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
- write_cr3(build_cr3(next, new_asid));
+ load_new_mm_cr3(next->pgd, new_asid, true);
/*
* NB: This gets called via leave_mm() in the idle path
@@ -208,7 +262,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
- write_cr3(build_cr3_noflush(next, new_asid));
+ load_new_mm_cr3(next->pgd, new_asid, false);
/* See above wrt _rcuidle. */
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
@@ -288,7 +342,7 @@ void initialize_tlbstate_and_flush(void)
!(cr4_read_shadow() & X86_CR4_PCIDE));
/* Force ASID 0 and force a TLB flush. */
- write_cr3(build_cr3(mm, 0));
+ write_cr3(build_cr3(mm->pgd, 0));
/* Reinitialize tlbstate. */
this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
@@ -551,7 +605,7 @@ static void do_kernel_range_flush(void *info)
/* flush range by one by one 'invlpg' */
for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
- __flush_tlb_single(addr);
+ __flush_tlb_one(addr);
}
void flush_tlb_kernel_range(unsigned long start, unsigned long end)