diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-05-07 01:13:31 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-05-07 01:13:31 +0200 |
commit | 0bc40e549aeea2de20fc571749de9bbfc099fb34 (patch) | |
tree | d18f3339bd383a17431fca23b6c5f3e54c93cf2f /arch/x86/mm | |
parent | Merge branch 'x86-kdump-for-linus' of git://git.kernel.org/pub/scm/linux/kern... (diff) | |
parent | x86/mm: Initialize PGD cache during mm initialization (diff) | |
download | linux-0bc40e549aeea2de20fc571749de9bbfc099fb34.tar.xz linux-0bc40e549aeea2de20fc571749de9bbfc099fb34.zip |
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Ingo Molnar:
"The changes in here are:
- text_poke() fixes and an extensive set of executability lockdowns,
to (hopefully) eliminate the last residual circumstances under
which we are using W|X mappings even temporarily on x86 kernels.
This required a broad range of surgery in text patching facilities,
module loading, trampoline handling and other bits.
- tweak page fault messages to be more informative and more
structured.
- remove DISCONTIGMEM support on x86-32 and make SPARSEMEM the
default.
- reduce KASLR granularity on 5-level paging kernels from 512 GB to
1 GB.
- misc other changes and updates"
* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (36 commits)
x86/mm: Initialize PGD cache during mm initialization
x86/alternatives: Add comment about module removal races
x86/kprobes: Use vmalloc special flag
x86/ftrace: Use vmalloc special flag
bpf: Use vmalloc special flag
modules: Use vmalloc special flag
mm/vmalloc: Add flag for freeing of special permsissions
mm/hibernation: Make hibernation handle unmapped pages
x86/mm/cpa: Add set_direct_map_*() functions
x86/alternatives: Remove the return value of text_poke_*()
x86/jump-label: Remove support for custom text poker
x86/modules: Avoid breaking W^X while loading modules
x86/kprobes: Set instruction page as executable
x86/ftrace: Set trampoline pages as executable
x86/kgdb: Avoid redundant comparison of patched code
x86/alternatives: Use temporary mm for text poking
x86/alternatives: Initialize temporary mm for patching
fork: Provide a function for copying init_mm
uprobes: Initialize uprobes earlier
x86/mm: Save debug registers when loading a temporary mm
...
Diffstat (limited to 'arch/x86/mm')
-rw-r--r-- | arch/x86/mm/fault.c | 55 | ||||
-rw-r--r-- | arch/x86/mm/init.c | 37 | ||||
-rw-r--r-- | arch/x86/mm/kaslr.c | 94 | ||||
-rw-r--r-- | arch/x86/mm/pageattr.c | 16 | ||||
-rw-r--r-- | arch/x86/mm/pgtable.c | 10 | ||||
-rw-r--r-- | arch/x86/mm/tlb.c | 116 |
6 files changed, 192 insertions, 136 deletions
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 06c089513d39..46df4c6aae46 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -360,8 +360,6 @@ static noinline int vmalloc_fault(unsigned long address) if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; - WARN_ON_ONCE(in_nmi()); - /* * Copy kernel mappings over when needed. This can also * happen within a race in page table update. In the later @@ -604,24 +602,9 @@ static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index) name, index, addr, (desc.limit0 | (desc.limit1 << 16))); } -/* - * This helper function transforms the #PF error_code bits into - * "[PROT] [USER]" type of descriptive, almost human-readable error strings: - */ -static void err_str_append(unsigned long error_code, char *buf, unsigned long mask, const char *txt) -{ - if (error_code & mask) { - if (buf[0]) - strcat(buf, " "); - strcat(buf, txt); - } -} - static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) { - char err_txt[64]; - if (!oops_may_print()) return; @@ -645,31 +628,29 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad from_kuid(&init_user_ns, current_uid())); } - pr_alert("BUG: unable to handle kernel %s at %px\n", - address < PAGE_SIZE ? "NULL pointer dereference" : "paging request", - (void *)address); - - err_txt[0] = 0; - - /* - * Note: length of these appended strings including the separation space and the - * zero delimiter must fit into err_txt[]. - */ - err_str_append(error_code, err_txt, X86_PF_PROT, "[PROT]" ); - err_str_append(error_code, err_txt, X86_PF_WRITE, "[WRITE]"); - err_str_append(error_code, err_txt, X86_PF_USER, "[USER]" ); - err_str_append(error_code, err_txt, X86_PF_RSVD, "[RSVD]" ); - err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]"); - err_str_append(error_code, err_txt, X86_PF_PK, "[PK]" ); - - pr_alert("#PF error: %s\n", error_code ? err_txt : "[normal kernel read fault]"); + if (address < PAGE_SIZE && !user_mode(regs)) + pr_alert("BUG: kernel NULL pointer dereference, address: %px\n", + (void *)address); + else + pr_alert("BUG: unable to handle page fault for address: %px\n", + (void *)address); + + pr_alert("#PF: %s %s in %s mode\n", + (error_code & X86_PF_USER) ? "user" : "supervisor", + (error_code & X86_PF_INSTR) ? "instruction fetch" : + (error_code & X86_PF_WRITE) ? "write access" : + "read access", + user_mode(regs) ? "user" : "kernel"); + pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code, + !(error_code & X86_PF_PROT) ? "not-present page" : + (error_code & X86_PF_RSVD) ? "reserved bit violation" : + (error_code & X86_PF_PK) ? "protection keys violation" : + "permissions violation"); if (!(error_code & X86_PF_USER) && user_mode(regs)) { struct desc_ptr idt, gdt; u16 ldtr, tr; - pr_alert("This was a system access from user code\n"); - /* * This can happen for quite a few reasons. The more obvious * ones are faults accessing the GDT, or LDT. Perhaps diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 8dacdb96899e..fd10d91a6115 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -6,6 +6,7 @@ #include <linux/swapfile.h> #include <linux/swapops.h> #include <linux/kmemleak.h> +#include <linux/sched/task.h> #include <asm/set_memory.h> #include <asm/e820/api.h> @@ -23,6 +24,7 @@ #include <asm/hypervisor.h> #include <asm/cpufeature.h> #include <asm/pti.h> +#include <asm/text-patching.h> /* * We need to define the tracepoints somewhere, and tlb.c @@ -702,6 +704,41 @@ void __init init_mem_mapping(void) } /* + * Initialize an mm_struct to be used during poking and a pointer to be used + * during patching. + */ +void __init poking_init(void) +{ + spinlock_t *ptl; + pte_t *ptep; + + poking_mm = copy_init_mm(); + BUG_ON(!poking_mm); + + /* + * Randomize the poking address, but make sure that the following page + * will be mapped at the same PMD. We need 2 pages, so find space for 3, + * and adjust the address if the PMD ends after the first one. + */ + poking_addr = TASK_UNMAPPED_BASE; + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) + poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) % + (TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE); + + if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0) + poking_addr += PAGE_SIZE; + + /* + * We need to trigger the allocation of the page-tables that will be + * needed for poking now. Later, poking may be performed in an atomic + * section, which might cause allocation to fail. + */ + ptep = get_locked_pte(poking_mm, poking_addr, &ptl); + BUG_ON(!ptep); + pte_unmap_unlock(ptep, ptl); +} + +/* * devmem_is_allowed() checks to see if /dev/mem access to a certain address * is valid. The argument is a physical page number. * diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index d669c5e797e0..dc3f058bdf9b 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -125,10 +125,7 @@ void __init kernel_randomize_memory(void) */ entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i); prandom_bytes_state(&rand_state, &rand, sizeof(rand)); - if (pgtable_l5_enabled()) - entropy = (rand % (entropy + 1)) & P4D_MASK; - else - entropy = (rand % (entropy + 1)) & PUD_MASK; + entropy = (rand % (entropy + 1)) & PUD_MASK; vaddr += entropy; *kaslr_regions[i].base = vaddr; @@ -137,84 +134,71 @@ void __init kernel_randomize_memory(void) * randomization alignment. */ vaddr += get_padding(&kaslr_regions[i]); - if (pgtable_l5_enabled()) - vaddr = round_up(vaddr + 1, P4D_SIZE); - else - vaddr = round_up(vaddr + 1, PUD_SIZE); + vaddr = round_up(vaddr + 1, PUD_SIZE); remain_entropy -= entropy; } } static void __meminit init_trampoline_pud(void) { - unsigned long paddr, paddr_next; + pud_t *pud_page_tramp, *pud, *pud_tramp; + p4d_t *p4d_page_tramp, *p4d, *p4d_tramp; + unsigned long paddr, vaddr; pgd_t *pgd; - pud_t *pud_page, *pud_page_tramp; - int i; pud_page_tramp = alloc_low_page(); + /* + * There are two mappings for the low 1MB area, the direct mapping + * and the 1:1 mapping for the real mode trampoline: + * + * Direct mapping: virt_addr = phys_addr + PAGE_OFFSET + * 1:1 mapping: virt_addr = phys_addr + */ paddr = 0; - pgd = pgd_offset_k((unsigned long)__va(paddr)); - pud_page = (pud_t *) pgd_page_vaddr(*pgd); - - for (i = pud_index(paddr); i < PTRS_PER_PUD; i++, paddr = paddr_next) { - pud_t *pud, *pud_tramp; - unsigned long vaddr = (unsigned long)__va(paddr); + vaddr = (unsigned long)__va(paddr); + pgd = pgd_offset_k(vaddr); - pud_tramp = pud_page_tramp + pud_index(paddr); - pud = pud_page + pud_index(vaddr); - paddr_next = (paddr & PUD_MASK) + PUD_SIZE; - - *pud_tramp = *pud; - } + p4d = p4d_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); - set_pgd(&trampoline_pgd_entry, - __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); -} - -static void __meminit init_trampoline_p4d(void) -{ - unsigned long paddr, paddr_next; - pgd_t *pgd; - p4d_t *p4d_page, *p4d_page_tramp; - int i; + pud_tramp = pud_page_tramp + pud_index(paddr); + *pud_tramp = *pud; - p4d_page_tramp = alloc_low_page(); - - paddr = 0; - pgd = pgd_offset_k((unsigned long)__va(paddr)); - p4d_page = (p4d_t *) pgd_page_vaddr(*pgd); - - for (i = p4d_index(paddr); i < PTRS_PER_P4D; i++, paddr = paddr_next) { - p4d_t *p4d, *p4d_tramp; - unsigned long vaddr = (unsigned long)__va(paddr); + if (pgtable_l5_enabled()) { + p4d_page_tramp = alloc_low_page(); p4d_tramp = p4d_page_tramp + p4d_index(paddr); - p4d = p4d_page + p4d_index(vaddr); - paddr_next = (paddr & P4D_MASK) + P4D_SIZE; - *p4d_tramp = *p4d; - } + set_p4d(p4d_tramp, + __p4d(_KERNPG_TABLE | __pa(pud_page_tramp))); - set_pgd(&trampoline_pgd_entry, - __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp))); + set_pgd(&trampoline_pgd_entry, + __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp))); + } else { + set_pgd(&trampoline_pgd_entry, + __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); + } } /* - * Create PGD aligned trampoline table to allow real mode initialization - * of additional CPUs. Consume only 1 low memory page. + * The real mode trampoline, which is required for bootstrapping CPUs + * occupies only a small area under the low 1MB. See reserve_real_mode() + * for details. + * + * If KASLR is disabled the first PGD entry of the direct mapping is copied + * to map the real mode trampoline. + * + * If KASLR is enabled, copy only the PUD which covers the low 1MB + * area. This limits the randomization granularity to 1GB for both 4-level + * and 5-level paging. */ void __meminit init_trampoline(void) { - if (!kaslr_memory_enabled()) { init_trampoline_default(); return; } - if (pgtable_l5_enabled()) - init_trampoline_p4d(); - else - init_trampoline_pud(); + init_trampoline_pud(); } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 4c570612e24e..daf4d645e537 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -2209,8 +2209,6 @@ int set_pages_rw(struct page *page, int numpages) return set_memory_rw(addr, numpages); } -#ifdef CONFIG_DEBUG_PAGEALLOC - static int __set_pages_p(struct page *page, int numpages) { unsigned long tempaddr = (unsigned long) page_address(page); @@ -2249,6 +2247,16 @@ static int __set_pages_np(struct page *page, int numpages) return __change_page_attr_set_clr(&cpa, 0); } +int set_direct_map_invalid_noflush(struct page *page) +{ + return __set_pages_np(page, 1); +} + +int set_direct_map_default_noflush(struct page *page) +{ + return __set_pages_p(page, 1); +} + void __kernel_map_pages(struct page *page, int numpages, int enable) { if (PageHighMem(page)) @@ -2282,7 +2290,6 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) } #ifdef CONFIG_HIBERNATION - bool kernel_page_present(struct page *page) { unsigned int level; @@ -2294,11 +2301,8 @@ bool kernel_page_present(struct page *page) pte = lookup_address((unsigned long)page_address(page), &level); return (pte_val(*pte) & _PAGE_PRESENT); } - #endif /* CONFIG_HIBERNATION */ -#endif /* CONFIG_DEBUG_PAGEALLOC */ - int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, unsigned numpages, unsigned long page_flags) { diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 3dbf440d4114..1f67b1e15bf6 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -373,14 +373,14 @@ static void pgd_prepopulate_user_pmd(struct mm_struct *mm, static struct kmem_cache *pgd_cache; -static int __init pgd_cache_init(void) +void __init pgd_cache_init(void) { /* * When PAE kernel is running as a Xen domain, it does not use * shared kernel pmd. And this requires a whole page for pgd. */ if (!SHARED_KERNEL_PMD) - return 0; + return; /* * when PAE kernel is not running as a Xen domain, it uses @@ -390,9 +390,7 @@ static int __init pgd_cache_init(void) */ pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN, SLAB_PANIC, NULL); - return 0; } -core_initcall(pgd_cache_init); static inline pgd_t *_pgd_alloc(void) { @@ -420,6 +418,10 @@ static inline void _pgd_free(pgd_t *pgd) } #else +void __init pgd_cache_init(void) +{ +} + static inline pgd_t *_pgd_alloc(void) { return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 487b8474c01c..7f61431c75fb 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -634,7 +634,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen); } -static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason) +static void flush_tlb_func_local(const void *info, enum tlb_flush_reason reason) { const struct flush_tlb_info *f = info; @@ -722,43 +722,81 @@ void native_flush_tlb_others(const struct cpumask *cpumask, */ unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info); + +#ifdef CONFIG_DEBUG_VM +static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx); +#endif + +static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm, + unsigned long start, unsigned long end, + unsigned int stride_shift, bool freed_tables, + u64 new_tlb_gen) +{ + struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info); + +#ifdef CONFIG_DEBUG_VM + /* + * Ensure that the following code is non-reentrant and flush_tlb_info + * is not overwritten. This means no TLB flushing is initiated by + * interrupt handlers and machine-check exception handlers. + */ + BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1); +#endif + + info->start = start; + info->end = end; + info->mm = mm; + info->stride_shift = stride_shift; + info->freed_tables = freed_tables; + info->new_tlb_gen = new_tlb_gen; + + return info; +} + +static inline void put_flush_tlb_info(void) +{ +#ifdef CONFIG_DEBUG_VM + /* Complete reentrency prevention checks */ + barrier(); + this_cpu_dec(flush_tlb_info_idx); +#endif +} + void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned int stride_shift, bool freed_tables) { + struct flush_tlb_info *info; + u64 new_tlb_gen; int cpu; - struct flush_tlb_info info = { - .mm = mm, - .stride_shift = stride_shift, - .freed_tables = freed_tables, - }; - cpu = get_cpu(); - /* This is also a barrier that synchronizes with switch_mm(). */ - info.new_tlb_gen = inc_mm_tlb_gen(mm); - /* Should we flush just the requested range? */ - if ((end != TLB_FLUSH_ALL) && - ((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) { - info.start = start; - info.end = end; - } else { - info.start = 0UL; - info.end = TLB_FLUSH_ALL; + if ((end == TLB_FLUSH_ALL) || + ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) { + start = 0; + end = TLB_FLUSH_ALL; } + /* This is also a barrier that synchronizes with switch_mm(). */ + new_tlb_gen = inc_mm_tlb_gen(mm); + + info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables, + new_tlb_gen); + if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { - VM_WARN_ON(irqs_disabled()); + lockdep_assert_irqs_enabled(); local_irq_disable(); - flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN); + flush_tlb_func_local(info, TLB_LOCAL_MM_SHOOTDOWN); local_irq_enable(); } if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), &info); + flush_tlb_others(mm_cpumask(mm), info); + put_flush_tlb_info(); put_cpu(); } @@ -787,38 +825,48 @@ static void do_kernel_range_flush(void *info) void flush_tlb_kernel_range(unsigned long start, unsigned long end) { - /* Balance as user space task's flush, a bit conservative */ if (end == TLB_FLUSH_ALL || (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) { on_each_cpu(do_flush_tlb_all, NULL, 1); } else { - struct flush_tlb_info info; - info.start = start; - info.end = end; - on_each_cpu(do_kernel_range_flush, &info, 1); + struct flush_tlb_info *info; + + preempt_disable(); + info = get_flush_tlb_info(NULL, start, end, 0, false, 0); + + on_each_cpu(do_kernel_range_flush, info, 1); + + put_flush_tlb_info(); + preempt_enable(); } } +/* + * arch_tlbbatch_flush() performs a full TLB flush regardless of the active mm. + * This means that the 'struct flush_tlb_info' that describes which mappings to + * flush is actually fixed. We therefore set a single fixed struct and use it in + * arch_tlbbatch_flush(). + */ +static const struct flush_tlb_info full_flush_tlb_info = { + .mm = NULL, + .start = 0, + .end = TLB_FLUSH_ALL, +}; + void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) { - struct flush_tlb_info info = { - .mm = NULL, - .start = 0UL, - .end = TLB_FLUSH_ALL, - }; - int cpu = get_cpu(); if (cpumask_test_cpu(cpu, &batch->cpumask)) { - VM_WARN_ON(irqs_disabled()); + lockdep_assert_irqs_enabled(); local_irq_disable(); - flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN); + flush_tlb_func_local(&full_flush_tlb_info, TLB_LOCAL_SHOOTDOWN); local_irq_enable(); } if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) - flush_tlb_others(&batch->cpumask, &info); + flush_tlb_others(&batch->cpumask, &full_flush_tlb_info); cpumask_clear(&batch->cpumask); |