diff options
author | Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> | 2016-03-29 15:33:47 +0200 |
---|---|---|
committer | Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> | 2016-03-29 15:33:47 +0200 |
commit | 8041dcc881c928134c546ae85e6e59e65804357c (patch) | |
tree | be5d1c21af8cf38ac32ed8708396881aabd44d4e /arch/x86/mm | |
parent | xen/apic: Provide Xen-specific version of cpu_present_to_apicid APIC op (diff) | |
parent | Linux 4.6-rc1 (diff) | |
download | linux-8041dcc881c928134c546ae85e6e59e65804357c.tar.xz linux-8041dcc881c928134c546ae85e6e59e65804357c.zip |
Merge tag 'v4.6-rc1' into for-linus-4.6
Linux 4.6-rc1
* tag 'v4.6-rc1': (12823 commits)
Linux 4.6-rc1
f2fs/crypto: fix xts_tweak initialization
NTB: Remove _addr functions from ntb_hw_amd
orangefs: fix orangefs_superblock locking
orangefs: fix do_readv_writev() handling of error halfway through
orangefs: have ->kill_sb() evict the VFS side of things first
orangefs: sanitize ->llseek()
orangefs-bufmap.h: trim unused junk
orangefs: saner calling conventions for getting a slot
orangefs_copy_{to,from}_bufmap(): don't pass bufmap pointer
orangefs: get rid of readdir_handle_s
thp: fix typo in khugepaged_scan_pmd()
MAINTAINERS: fill entries for KASAN
mm/filemap: generic_file_read_iter(): check for zero reads unconditionally
kasan: test fix: warn if the UAF could not be detected in kmalloc_uaf2
mm, kasan: stackdepot implementation. Enable stackdepot for SLAB
arch, ftrace: for KASAN put hard/soft IRQ entries into separate sections
mm, kasan: add GFP flags to KASAN API
mm, kasan: SLAB support
kasan: modify kmalloc_large_oob_right(), add kmalloc_pagealloc_oob_right()
...
Diffstat (limited to 'arch/x86/mm')
-rw-r--r-- | arch/x86/mm/Makefile | 5 | ||||
-rw-r--r-- | arch/x86/mm/dump_pagetables.c | 11 | ||||
-rw-r--r-- | arch/x86/mm/extable.c | 200 | ||||
-rw-r--r-- | arch/x86/mm/fault.c | 152 | ||||
-rw-r--r-- | arch/x86/mm/gup.c | 47 | ||||
-rw-r--r-- | arch/x86/mm/init.c | 36 | ||||
-rw-r--r-- | arch/x86/mm/init_32.c | 6 | ||||
-rw-r--r-- | arch/x86/mm/init_64.c | 27 | ||||
-rw-r--r-- | arch/x86/mm/kasan_init_64.c | 17 | ||||
-rw-r--r-- | arch/x86/mm/kmmio.c | 88 | ||||
-rw-r--r-- | arch/x86/mm/mmap.c | 20 | ||||
-rw-r--r-- | arch/x86/mm/mpx.c | 10 | ||||
-rw-r--r-- | arch/x86/mm/numa.c | 67 | ||||
-rw-r--r-- | arch/x86/mm/pageattr.c | 68 | ||||
-rw-r--r-- | arch/x86/mm/pat.c | 6 | ||||
-rw-r--r-- | arch/x86/mm/pkeys.c | 101 | ||||
-rw-r--r-- | arch/x86/mm/setup_nx.c | 6 |
17 files changed, 535 insertions, 332 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index f9d38a48e3c8..f98913258c63 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,3 +1,6 @@ +# Kernel does not boot with instrumentation of tlb.c. +KCOV_INSTRUMENT_tlb.o := n + obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ pat.o pgtable.o physaddr.o gup.o setup_nx.o @@ -34,3 +37,5 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_X86_INTEL_MPX) += mpx.o +obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o + diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 4a6f1d9b5106..99bfb192803f 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -358,20 +358,19 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, #define pgd_none(a) pud_none(__pud(pgd_val(a))) #endif -#ifdef CONFIG_X86_64 static inline bool is_hypervisor_range(int idx) { +#ifdef CONFIG_X86_64 /* * ffff800000000000 - ffff87ffffffffff is reserved for * the hypervisor. */ - return paravirt_enabled() && - (idx >= pgd_index(__PAGE_OFFSET) - 16) && - (idx < pgd_index(__PAGE_OFFSET)); -} + return (idx >= pgd_index(__PAGE_OFFSET) - 16) && + (idx < pgd_index(__PAGE_OFFSET)); #else -static inline bool is_hypervisor_range(int idx) { return false; } + return false; #endif +} static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, bool checkwx) diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 903ec1e9c326..82447b3fba38 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -1,23 +1,64 @@ #include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/sort.h> #include <asm/uaccess.h> -static inline unsigned long -ex_insn_addr(const struct exception_table_entry *x) -{ - return (unsigned long)&x->insn + x->insn; -} +typedef bool (*ex_handler_t)(const struct exception_table_entry *, + struct pt_regs *, int); + static inline unsigned long ex_fixup_addr(const struct exception_table_entry *x) { return (unsigned long)&x->fixup + x->fixup; } +static inline ex_handler_t +ex_fixup_handler(const struct exception_table_entry *x) +{ + return (ex_handler_t)((unsigned long)&x->handler + x->handler); +} -int fixup_exception(struct pt_regs *regs) +bool ex_handler_default(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) { - const struct exception_table_entry *fixup; - unsigned long new_ip; + regs->ip = ex_fixup_addr(fixup); + return true; +} +EXPORT_SYMBOL(ex_handler_default); + +bool ex_handler_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + regs->ip = ex_fixup_addr(fixup); + regs->ax = trapnr; + return true; +} +EXPORT_SYMBOL_GPL(ex_handler_fault); + +bool ex_handler_ext(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + /* Special hack for uaccess_err */ + current_thread_info()->uaccess_err = 1; + regs->ip = ex_fixup_addr(fixup); + return true; +} +EXPORT_SYMBOL(ex_handler_ext); + +bool ex_has_fault_handler(unsigned long ip) +{ + const struct exception_table_entry *e; + ex_handler_t handler; + + e = search_exception_tables(ip); + if (!e) + return false; + handler = ex_fixup_handler(e); + + return handler == ex_handler_fault; +} + +int fixup_exception(struct pt_regs *regs, int trapnr) +{ + const struct exception_table_entry *e; + ex_handler_t handler; #ifdef CONFIG_PNPBIOS if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) { @@ -33,137 +74,32 @@ int fixup_exception(struct pt_regs *regs) } #endif - fixup = search_exception_tables(regs->ip); - if (fixup) { - new_ip = ex_fixup_addr(fixup); - - if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) { - /* Special hack for uaccess_err */ - current_thread_info()->uaccess_err = 1; - new_ip -= 0x7ffffff0; - } - regs->ip = new_ip; - return 1; - } + e = search_exception_tables(regs->ip); + if (!e) + return 0; - return 0; + handler = ex_fixup_handler(e); + return handler(e, regs, trapnr); } /* Restricted version used during very early boot */ int __init early_fixup_exception(unsigned long *ip) { - const struct exception_table_entry *fixup; + const struct exception_table_entry *e; unsigned long new_ip; + ex_handler_t handler; - fixup = search_exception_tables(*ip); - if (fixup) { - new_ip = ex_fixup_addr(fixup); - - if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) { - /* uaccess handling not supported during early boot */ - return 0; - } - - *ip = new_ip; - return 1; - } - - return 0; -} - -/* - * Search one exception table for an entry corresponding to the - * given instruction address, and return the address of the entry, - * or NULL if none is found. - * We use a binary search, and thus we assume that the table is - * already sorted. - */ -const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, - unsigned long value) -{ - while (first <= last) { - const struct exception_table_entry *mid; - unsigned long addr; - - mid = ((last - first) >> 1) + first; - addr = ex_insn_addr(mid); - if (addr < value) - first = mid + 1; - else if (addr > value) - last = mid - 1; - else - return mid; - } - return NULL; -} - -/* - * The exception table needs to be sorted so that the binary - * search that we use to find entries in it works properly. - * This is used both for the kernel exception table and for - * the exception tables of modules that get loaded. - * - */ -static int cmp_ex(const void *a, const void *b) -{ - const struct exception_table_entry *x = a, *y = b; - - /* - * This value will always end up fittin in an int, because on - * both i386 and x86-64 the kernel symbol-reachable address - * space is < 2 GiB. - * - * This compare is only valid after normalization. - */ - return x->insn - y->insn; -} - -void sort_extable(struct exception_table_entry *start, - struct exception_table_entry *finish) -{ - struct exception_table_entry *p; - int i; - - /* Convert all entries to being relative to the start of the section */ - i = 0; - for (p = start; p < finish; p++) { - p->insn += i; - i += 4; - p->fixup += i; - i += 4; - } + e = search_exception_tables(*ip); + if (!e) + return 0; - sort(start, finish - start, sizeof(struct exception_table_entry), - cmp_ex, NULL); + new_ip = ex_fixup_addr(e); + handler = ex_fixup_handler(e); - /* Denormalize all entries */ - i = 0; - for (p = start; p < finish; p++) { - p->insn -= i; - i += 4; - p->fixup -= i; - i += 4; - } -} + /* special handling not supported during early boot */ + if (handler != ex_handler_default) + return 0; -#ifdef CONFIG_MODULES -/* - * If the exception table is sorted, any referring to the module init - * will be at the beginning or the end. - */ -void trim_init_extable(struct module *m) -{ - /*trim the beginning*/ - while (m->num_exentries && - within_module_init(ex_insn_addr(&m->extable[0]), m)) { - m->extable++; - m->num_exentries--; - } - /*trim the end*/ - while (m->num_exentries && - within_module_init(ex_insn_addr(&m->extable[m->num_exentries-1]), m)) - m->num_exentries--; + *ip = new_ip; + return 1; } -#endif /* CONFIG_MODULES */ diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e830c71a1323..5ce1ed02f7e8 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -15,12 +15,14 @@ #include <linux/context_tracking.h> /* exception_enter(), ... */ #include <linux/uaccess.h> /* faulthandler_disabled() */ +#include <asm/cpufeature.h> /* boot_cpu_has, ... */ #include <asm/traps.h> /* dotraplinkage, ... */ #include <asm/pgalloc.h> /* pgd_*(), ... */ #include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ #include <asm/fixmap.h> /* VSYSCALL_ADDR */ #include <asm/vsyscall.h> /* emulate_vsyscall */ #include <asm/vm86.h> /* struct vm86 */ +#include <asm/mmu_context.h> /* vma_pkey() */ #define CREATE_TRACE_POINTS #include <asm/trace/exceptions.h> @@ -33,6 +35,7 @@ * bit 2 == 0: kernel-mode access 1: user-mode access * bit 3 == 1: use of reserved bit detected * bit 4 == 1: fault was an instruction fetch + * bit 5 == 1: protection keys block access */ enum x86_pf_error_code { @@ -41,6 +44,7 @@ enum x86_pf_error_code { PF_USER = 1 << 2, PF_RSVD = 1 << 3, PF_INSTR = 1 << 4, + PF_PK = 1 << 5, }; /* @@ -167,9 +171,60 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) return prefetch; } +/* + * A protection key fault means that the PKRU value did not allow + * access to some PTE. Userspace can figure out what PKRU was + * from the XSAVE state, and this function fills out a field in + * siginfo so userspace can discover which protection key was set + * on the PTE. + * + * If we get here, we know that the hardware signaled a PF_PK + * fault and that there was a VMA once we got in the fault + * handler. It does *not* guarantee that the VMA we find here + * was the one that we faulted on. + * + * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4); + * 2. T1 : set PKRU to deny access to pkey=4, touches page + * 3. T1 : faults... + * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5); + * 5. T1 : enters fault handler, takes mmap_sem, etc... + * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really + * faulted on a pte with its pkey=4. + */ +static void fill_sig_info_pkey(int si_code, siginfo_t *info, + struct vm_area_struct *vma) +{ + /* This is effectively an #ifdef */ + if (!boot_cpu_has(X86_FEATURE_OSPKE)) + return; + + /* Fault not from Protection Keys: nothing to do */ + if (si_code != SEGV_PKUERR) + return; + /* + * force_sig_info_fault() is called from a number of + * contexts, some of which have a VMA and some of which + * do not. The PF_PK handing happens after we have a + * valid VMA, so we should never reach this without a + * valid VMA. + */ + if (!vma) { + WARN_ONCE(1, "PKU fault with no VMA passed in"); + info->si_pkey = 0; + return; + } + /* + * si_pkey should be thought of as a strong hint, but not + * absolutely guranteed to be 100% accurate because of + * the race explained above. + */ + info->si_pkey = vma_pkey(vma); +} + static void force_sig_info_fault(int si_signo, int si_code, unsigned long address, - struct task_struct *tsk, int fault) + struct task_struct *tsk, struct vm_area_struct *vma, + int fault) { unsigned lsb = 0; siginfo_t info; @@ -184,6 +239,8 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address, lsb = PAGE_SHIFT; info.si_addr_lsb = lsb; + fill_sig_info_pkey(si_code, &info, vma); + force_sig_info(si_signo, &info, tsk); } @@ -661,9 +718,11 @@ no_context(struct pt_regs *regs, unsigned long error_code, struct task_struct *tsk = current; unsigned long flags; int sig; + /* No context means no VMA to pass down */ + struct vm_area_struct *vma = NULL; /* Are we prepared to handle this kernel fault? */ - if (fixup_exception(regs)) { + if (fixup_exception(regs, X86_TRAP_PF)) { /* * Any interrupt that takes a fault gets the fixup. This makes * the below recursive fault logic only apply to a faults from @@ -684,7 +743,8 @@ no_context(struct pt_regs *regs, unsigned long error_code, tsk->thread.cr2 = address; /* XXX: hwpoison faults will set the wrong code. */ - force_sig_info_fault(signal, si_code, address, tsk, 0); + force_sig_info_fault(signal, si_code, address, + tsk, vma, 0); } /* @@ -761,7 +821,8 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, static void __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - unsigned long address, int si_code) + unsigned long address, struct vm_area_struct *vma, + int si_code) { struct task_struct *tsk = current; @@ -804,7 +865,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_PF; - force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); + force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0); return; } @@ -817,14 +878,14 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, static noinline void bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - unsigned long address) + unsigned long address, struct vm_area_struct *vma) { - __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR); + __bad_area_nosemaphore(regs, error_code, address, vma, SEGV_MAPERR); } static void __bad_area(struct pt_regs *regs, unsigned long error_code, - unsigned long address, int si_code) + unsigned long address, struct vm_area_struct *vma, int si_code) { struct mm_struct *mm = current->mm; @@ -834,25 +895,50 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, */ up_read(&mm->mmap_sem); - __bad_area_nosemaphore(regs, error_code, address, si_code); + __bad_area_nosemaphore(regs, error_code, address, vma, si_code); } static noinline void bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) { - __bad_area(regs, error_code, address, SEGV_MAPERR); + __bad_area(regs, error_code, address, NULL, SEGV_MAPERR); +} + +static inline bool bad_area_access_from_pkeys(unsigned long error_code, + struct vm_area_struct *vma) +{ + /* This code is always called on the current mm */ + bool foreign = false; + + if (!boot_cpu_has(X86_FEATURE_OSPKE)) + return false; + if (error_code & PF_PK) + return true; + /* this checks permission keys on the VMA: */ + if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE), + (error_code & PF_INSTR), foreign)) + return true; + return false; } static noinline void bad_area_access_error(struct pt_regs *regs, unsigned long error_code, - unsigned long address) + unsigned long address, struct vm_area_struct *vma) { - __bad_area(regs, error_code, address, SEGV_ACCERR); + /* + * This OSPKE check is not strictly necessary at runtime. + * But, doing it this way allows compiler optimizations + * if pkeys are compiled out. + */ + if (bad_area_access_from_pkeys(error_code, vma)) + __bad_area(regs, error_code, address, vma, SEGV_PKUERR); + else + __bad_area(regs, error_code, address, vma, SEGV_ACCERR); } static void do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, - unsigned int fault) + struct vm_area_struct *vma, unsigned int fault) { struct task_struct *tsk = current; int code = BUS_ADRERR; @@ -879,12 +965,13 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, code = BUS_MCEERR_AR; } #endif - force_sig_info_fault(SIGBUS, code, address, tsk, fault); + force_sig_info_fault(SIGBUS, code, address, tsk, vma, fault); } static noinline void mm_fault_error(struct pt_regs *regs, unsigned long error_code, - unsigned long address, unsigned int fault) + unsigned long address, struct vm_area_struct *vma, + unsigned int fault) { if (fatal_signal_pending(current) && !(error_code & PF_USER)) { no_context(regs, error_code, address, 0, 0); @@ -908,9 +995,9 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, } else { if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| VM_FAULT_HWPOISON_LARGE)) - do_sigbus(regs, error_code, address, fault); + do_sigbus(regs, error_code, address, vma, fault); else if (fault & VM_FAULT_SIGSEGV) - bad_area_nosemaphore(regs, error_code, address); + bad_area_nosemaphore(regs, error_code, address, vma); else BUG(); } @@ -923,6 +1010,12 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte) if ((error_code & PF_INSTR) && !pte_exec(*pte)) return 0; + /* + * Note: We do not do lazy flushing on protection key + * changes, so no spurious fault will ever set PF_PK. + */ + if ((error_code & PF_PK)) + return 1; return 1; } @@ -1012,6 +1105,17 @@ int show_unhandled_signals = 1; static inline int access_error(unsigned long error_code, struct vm_area_struct *vma) { + /* This is only called for the current mm, so: */ + bool foreign = false; + /* + * Make sure to check the VMA so that we do not perform + * faults just to hit a PF_PK as soon as we fill in a + * page. + */ + if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE), + (error_code & PF_INSTR), foreign)) + return 1; + if (error_code & PF_WRITE) { /* write, present and write, not present: */ if (unlikely(!(vma->vm_flags & VM_WRITE))) @@ -1118,7 +1222,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * Don't take the mm semaphore here. If we fixup a prefetch * fault we could otherwise deadlock: */ - bad_area_nosemaphore(regs, error_code, address); + bad_area_nosemaphore(regs, error_code, address, NULL); return; } @@ -1131,7 +1235,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, pgtable_bad(regs, error_code, address); if (unlikely(smap_violation(error_code, regs))) { - bad_area_nosemaphore(regs, error_code, address); + bad_area_nosemaphore(regs, error_code, address, NULL); return; } @@ -1140,7 +1244,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * in a region with pagefaults disabled then we must not take the fault */ if (unlikely(faulthandler_disabled() || !mm)) { - bad_area_nosemaphore(regs, error_code, address); + bad_area_nosemaphore(regs, error_code, address, NULL); return; } @@ -1164,6 +1268,8 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, if (error_code & PF_WRITE) flags |= FAULT_FLAG_WRITE; + if (error_code & PF_INSTR) + flags |= FAULT_FLAG_INSTRUCTION; /* * When running in the kernel we expect faults to occur only to @@ -1184,7 +1290,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, if (unlikely(!down_read_trylock(&mm->mmap_sem))) { if ((error_code & PF_USER) == 0 && !search_exception_tables(regs->ip)) { - bad_area_nosemaphore(regs, error_code, address); + bad_area_nosemaphore(regs, error_code, address, NULL); return; } retry: @@ -1232,7 +1338,7 @@ retry: */ good_area: if (unlikely(access_error(error_code, vma))) { - bad_area_access_error(regs, error_code, address); + bad_area_access_error(regs, error_code, address, vma); return; } @@ -1270,7 +1376,7 @@ good_area: up_read(&mm->mmap_sem); if (unlikely(fault & VM_FAULT_ERROR)) { - mm_fault_error(regs, error_code, address, fault); + mm_fault_error(regs, error_code, address, vma, fault); return; } diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index d8a798d8bf50..b8b6a60b32cf 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -11,6 +11,7 @@ #include <linux/swap.h> #include <linux/memremap.h> +#include <asm/mmu_context.h> #include <asm/pgtable.h> static inline pte_t gup_get_pte(pte_t *ptep) @@ -75,6 +76,28 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) } /* + * 'pteval' can come from a pte, pmd or pud. We only check + * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the + * same value on all 3 types. + */ +static inline int pte_allows_gup(unsigned long pteval, int write) +{ + unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER; + + if (write) + need_pte_bits |= _PAGE_RW; + + if ((pteval & need_pte_bits) != need_pte_bits) + return 0; + + /* Check memory protection keys permissions. */ + if (!__pkru_allows_pkey(pte_flags_pkey(pteval), write)) + return 0; + + return 1; +} + +/* * The performance critical leaf functions are made noinline otherwise gcc * inlines everything into a single function which results in too much * register pressure. @@ -83,14 +106,9 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { struct dev_pagemap *pgmap = NULL; - unsigned long mask; int nr_start = *nr; pte_t *ptep; - mask = _PAGE_PRESENT|_PAGE_USER; - if (write) - mask |= _PAGE_RW; - ptep = pte_offset_map(&pmd, addr); do { pte_t pte = gup_get_pte(ptep); @@ -109,7 +127,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, pte_unmap(ptep); return 0; } - } else if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { + } else if (!pte_allows_gup(pte_val(pte), write) || + pte_special(pte)) { pte_unmap(ptep); return 0; } @@ -131,7 +150,7 @@ static inline void get_head_page_multiple(struct page *page, int nr) { VM_BUG_ON_PAGE(page != compound_head(page), page); VM_BUG_ON_PAGE(page_count(page) == 0, page); - atomic_add(nr, &page->_count); + page_ref_add(page, nr); SetPageReferenced(page); } @@ -164,14 +183,10 @@ static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { - unsigned long mask; struct page *head, *page; int refs; - mask = _PAGE_PRESENT|_PAGE_USER; - if (write) - mask |= _PAGE_RW; - if ((pmd_flags(pmd) & mask) != mask) + if (!pte_allows_gup(pmd_val(pmd), write)) return 0; VM_BUG_ON(!pfn_valid(pmd_pfn(pmd))); @@ -231,14 +246,10 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, static noinline int gup_huge_pud(pud_t pud, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { - unsigned long mask; struct page *head, *page; int refs; - mask = _PAGE_PRESENT|_PAGE_USER; - if (write) - mask |= _PAGE_RW; - if ((pud_flags(pud) & mask) != mask) + if (!pte_allows_gup(pud_val(pud), write)) return 0; /* hugepages are never "special" */ VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL); @@ -422,7 +433,7 @@ slow_irqon: start += nr << PAGE_SHIFT; pages += nr; - ret = get_user_pages_unlocked(current, mm, start, + ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT, write, 0, pages); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 493f54172b4a..9d56f271d519 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -150,13 +150,14 @@ static int page_size_mask; static void __init probe_page_size_mask(void) { -#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK) +#if !defined(CONFIG_KMEMCHECK) /* - * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. + * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will + * use small pages. * This will simplify cpa(), which otherwise needs to support splitting * large pages into small in interrupt context, etc. */ - if (cpu_has_pse) + if (cpu_has_pse && !debug_pagealloc_enabled()) page_size_mask |= 1 << PG_LEVEL_2M; #endif @@ -666,21 +667,22 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) * mark them not present - any buggy init-section access will * create a kernel page fault: */ -#ifdef CONFIG_DEBUG_PAGEALLOC - printk(KERN_INFO "debug: unmapping init [mem %#010lx-%#010lx]\n", - begin, end - 1); - set_memory_np(begin, (end - begin) >> PAGE_SHIFT); -#else - /* - * We just marked the kernel text read only above, now that - * we are going to free part of that, we need to make that - * writeable and non-executable first. - */ - set_memory_nx(begin, (end - begin) >> PAGE_SHIFT); - set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); + if (debug_pagealloc_enabled()) { + pr_info("debug: unmapping init [mem %#010lx-%#010lx]\n", + begin, end - 1); + set_memory_np(begin, (end - begin) >> PAGE_SHIFT); + } else { + /* + * We just marked the kernel text read only above, now that + * we are going to free part of that, we need to make that + * writeable and non-executable first. + */ + set_memory_nx(begin, (end - begin) >> PAGE_SHIFT); + set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); - free_reserved_area((void *)begin, (void *)end, POISON_FREE_INITMEM, what); -#endif + free_reserved_area((void *)begin, (void *)end, + POISON_FREE_INITMEM, what); + } } void free_initmem(void) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index cb4ef3de61f9..bd7a9b9e2e14 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -388,7 +388,6 @@ repeat: } pte_t *kmap_pte; -pgprot_t kmap_prot; static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr) { @@ -405,8 +404,6 @@ static void __init kmap_init(void) */ kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); kmap_pte = kmap_get_fixmap_pte(kmap_vstart); - - kmap_prot = PAGE_KERNEL; } #ifdef CONFIG_HIGHMEM @@ -871,7 +868,6 @@ static noinline int do_test_wp_bit(void) return flag; } -#ifdef CONFIG_DEBUG_RODATA const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); @@ -960,5 +956,3 @@ void mark_rodata_ro(void) if (__supported_pte_mask & _PAGE_NX) debug_checkwx(); } -#endif - diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5488d21123bd..214afda97911 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -53,6 +53,7 @@ #include <asm/numa.h> #include <asm/cacheflush.h> #include <asm/init.h> +#include <asm/uv/uv.h> #include <asm/setup.h> #include "mm_internal.h" @@ -1074,7 +1075,6 @@ void __init mem_init(void) mem_init_print_info(NULL); } -#ifdef CONFIG_DEBUG_RODATA const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); @@ -1166,8 +1166,6 @@ void mark_rodata_ro(void) debug_checkwx(); } -#endif - int kern_addr_valid(unsigned long addr) { unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; @@ -1206,26 +1204,13 @@ int kern_addr_valid(unsigned long addr) static unsigned long probe_memory_block_size(void) { - /* start from 2g */ - unsigned long bz = 1UL<<31; - - if (totalram_pages >= (64ULL << (30 - PAGE_SHIFT))) { - pr_info("Using 2GB memory block size for large-memory system\n"); - return 2UL * 1024 * 1024 * 1024; - } - - /* less than 64g installed */ - if ((max_pfn << PAGE_SHIFT) < (16UL << 32)) - return MIN_MEMORY_BLOCK_SIZE; + unsigned long bz = MIN_MEMORY_BLOCK_SIZE; - /* get the tail size */ - while (bz > MIN_MEMORY_BLOCK_SIZE) { - if (!((max_pfn << PAGE_SHIFT) & (bz - 1))) - break; - bz >>= 1; - } + /* if system is UV or has 64GB of RAM or more, use large blocks */ + if (is_uv_system() || ((max_pfn << PAGE_SHIFT) >= (64UL << 30))) + bz = 2UL << 30; /* 2GB */ - printk(KERN_DEBUG "memory block size : %ldMB\n", bz >> 20); + pr_info("x86/mm: Memory block size: %ldMB\n", bz >> 20); return bz; } diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index d470cf219a2d..1b1110fa0057 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -120,11 +120,22 @@ void __init kasan_init(void) kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), (void *)KASAN_SHADOW_END); - memset(kasan_zero_page, 0, PAGE_SIZE); - load_cr3(init_level4_pgt); __flush_tlb_all(); - init_task.kasan_depth = 0; + /* + * kasan_zero_page has been used as early shadow memory, thus it may + * contain some garbage. Now we can clear and write protect it, since + * after the TLB flush no one should write to it. + */ + memset(kasan_zero_page, 0, PAGE_SIZE); + for (i = 0; i < PTRS_PER_PTE; i++) { + pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO); + set_pte(&kasan_zero_pte[i], pte); + } + /* Flush TLBs again to be sure that write protection applied. */ + __flush_tlb_all(); + + init_task.kasan_depth = 0; pr_info("KernelAddressSanitizer initialized\n"); } diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 637ab34ed632..ddb2244b06a1 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -33,7 +33,7 @@ struct kmmio_fault_page { struct list_head list; struct kmmio_fault_page *release_next; - unsigned long page; /* location of the fault page */ + unsigned long addr; /* the requested address */ pteval_t old_presence; /* page presence prior to arming */ bool armed; @@ -70,9 +70,16 @@ unsigned int kmmio_count; static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE]; static LIST_HEAD(kmmio_probes); -static struct list_head *kmmio_page_list(unsigned long page) +static struct list_head *kmmio_page_list(unsigned long addr) { - return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)]; + unsigned int l; + pte_t *pte = lookup_address(addr, &l); + + if (!pte) + return NULL; + addr &= page_level_mask(l); + + return &kmmio_page_table[hash_long(addr, KMMIO_PAGE_HASH_BITS)]; } /* Accessed per-cpu */ @@ -98,15 +105,19 @@ static struct kmmio_probe *get_kmmio_probe(unsigned long addr) } /* You must be holding RCU read lock. */ -static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) +static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr) { struct list_head *head; struct kmmio_fault_page *f; + unsigned int l; + pte_t *pte = lookup_address(addr, &l); - page &= PAGE_MASK; - head = kmmio_page_list(page); + if (!pte) + return NULL; + addr &= page_level_mask(l); + head = kmmio_page_list(addr); list_for_each_entry_rcu(f, head, list) { - if (f->page == page) + if (f->addr == addr) return f; } return NULL; @@ -137,10 +148,10 @@ static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old) static int clear_page_presence(struct kmmio_fault_page *f, bool clear) { unsigned int level; - pte_t *pte = lookup_address(f->page, &level); + pte_t *pte = lookup_address(f->addr, &level); if (!pte) { - pr_err("no pte for page 0x%08lx\n", f->page); + pr_err("no pte for addr 0x%08lx\n", f->addr); return -1; } @@ -156,7 +167,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear) return -1; } - __flush_tlb_one(f->page); + __flush_tlb_one(f->addr); return 0; } @@ -176,12 +187,12 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f) int ret; WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n")); if (f->armed) { - pr_warning("double-arm: page 0x%08lx, ref %d, old %d\n", - f->page, f->count, !!f->old_presence); + pr_warning("double-arm: addr 0x%08lx, ref %d, old %d\n", + f->addr, f->count, !!f->old_presence); } ret = clear_page_presence(f, true); - WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming 0x%08lx failed.\n"), - f->page); + WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming at 0x%08lx failed.\n"), + f->addr); f->armed = true; return ret; } @@ -191,7 +202,7 @@ static void disarm_kmmio_fault_page(struct kmmio_fault_page *f) { int ret = clear_page_presence(f, false); WARN_ONCE(ret < 0, - KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page); + KERN_ERR "kmmio disarming at 0x%08lx failed.\n", f->addr); f->armed = false; } @@ -215,6 +226,12 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) struct kmmio_context *ctx; struct kmmio_fault_page *faultpage; int ret = 0; /* default to fault not handled */ + unsigned long page_base = addr; + unsigned int l; + pte_t *pte = lookup_address(addr, &l); + if (!pte) + return -EINVAL; + page_base &= page_level_mask(l); /* * Preemption is now disabled to prevent process switch during @@ -227,7 +244,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) preempt_disable(); rcu_read_lock(); - faultpage = get_kmmio_fault_page(addr); + faultpage = get_kmmio_fault_page(page_base); if (!faultpage) { /* * Either this page fault is not caused by kmmio, or @@ -239,7 +256,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) ctx = &get_cpu_var(kmmio_ctx); if (ctx->active) { - if (addr == ctx->addr) { + if (page_base == ctx->addr) { /* * A second fault on the same page means some other * condition needs handling by do_page_fault(), the @@ -267,9 +284,9 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) ctx->active++; ctx->fpage = faultpage; - ctx->probe = get_kmmio_probe(addr); + ctx->probe = get_kmmio_probe(page_base); ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); - ctx->addr = addr; + ctx->addr = page_base; if (ctx->probe && ctx->probe->pre_handler) ctx->probe->pre_handler(ctx->probe, regs, addr); @@ -354,12 +371,11 @@ out: } /* You must be holding kmmio_lock. */ -static int add_kmmio_fault_page(unsigned long page) +static int add_kmmio_fault_page(unsigned long addr) { struct kmmio_fault_page *f; - page &= PAGE_MASK; - f = get_kmmio_fault_page(page); + f = get_kmmio_fault_page(addr); if (f) { if (!f->count) arm_kmmio_fault_page(f); @@ -372,26 +388,25 @@ static int add_kmmio_fault_page(unsigned long page) return -1; f->count = 1; - f->page = page; + f->addr = addr; if (arm_kmmio_fault_page(f)) { kfree(f); return -1; } - list_add_rcu(&f->list, kmmio_page_list(f->page)); + list_add_rcu(&f->list, kmmio_page_list(f->addr)); return 0; } /* You must be holding kmmio_lock. */ -static void release_kmmio_fault_page(unsigned long page, +static void release_kmmio_fault_page(unsigned long addr, struct kmmio_fault_page **release_list) { struct kmmio_fault_page *f; - page &= PAGE_MASK; - f = get_kmmio_fault_page(page); + f = get_kmmio_fault_page(addr); if (!f) return; @@ -420,18 +435,27 @@ int register_kmmio_probe(struct kmmio_probe *p) int ret = 0; unsigned long size = 0; const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); + unsigned int l; + pte_t *pte; spin_lock_irqsave(&kmmio_lock, flags); if (get_kmmio_probe(p->addr)) { ret = -EEXIST; goto out; } + + pte = lookup_address(p->addr, &l); + if (!pte) { + ret = -EINVAL; + goto out; + } + kmmio_count++; list_add_rcu(&p->list, &kmmio_probes); while (size < size_lim) { if (add_kmmio_fault_page(p->addr + size)) pr_err("Unable to set page fault.\n"); - size += PAGE_SIZE; + size += page_level_size(l); } out: spin_unlock_irqrestore(&kmmio_lock, flags); @@ -506,11 +530,17 @@ void unregister_kmmio_probe(struct kmmio_probe *p) const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); struct kmmio_fault_page *release_list = NULL; struct kmmio_delayed_release *drelease; + unsigned int l; + pte_t *pte; + + pte = lookup_address(p->addr, &l); + if (!pte) + return; spin_lock_irqsave(&kmmio_lock, flags); while (size < size_lim) { release_kmmio_fault_page(p->addr + size, &release_list); - size += PAGE_SIZE; + size += page_level_size(l); } list_del_rcu(&p->list); kmmio_count--; diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 96bd1e2bffaf..d2dc0438d654 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -71,12 +71,12 @@ unsigned long arch_mmap_rnd(void) if (mmap_is_ia32()) #ifdef CONFIG_COMPAT - rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_compat_bits) - 1); + rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); #else - rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1); + rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); #endif else - rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1); + rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); return rnd << PAGE_SHIFT; } @@ -94,18 +94,6 @@ static unsigned long mmap_base(unsigned long rnd) } /* - * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64 - * does, but not when emulating X86_32 - */ -static unsigned long mmap_legacy_base(unsigned long rnd) -{ - if (mmap_is_ia32()) - return TASK_UNMAPPED_BASE; - else - return TASK_UNMAPPED_BASE + rnd; -} - -/* * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: */ @@ -116,7 +104,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (current->flags & PF_RANDOMIZE) random_factor = arch_mmap_rnd(); - mm->mmap_legacy_base = mmap_legacy_base(random_factor); + mm->mmap_legacy_base = TASK_UNMAPPED_BASE + random_factor; if (mmap_is_legacy()) { mm->mmap_base = mm->mmap_legacy_base; diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index b2fd67da1701..80476878eb4c 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -123,7 +123,7 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs, break; } - if (regno > nr_registers) { + if (regno >= nr_registers) { WARN_ONCE(1, "decoded an instruction with an invalid register"); return -EINVAL; } @@ -546,8 +546,8 @@ static int mpx_resolve_fault(long __user *addr, int write) int nr_pages = 1; int force = 0; - gup_ret = get_user_pages(current, current->mm, (unsigned long)addr, - nr_pages, write, force, NULL, NULL); + gup_ret = get_user_pages((unsigned long)addr, nr_pages, write, + force, NULL, NULL); /* * get_user_pages() returns number of pages gotten. * 0 means we failed to fault in and get anything, @@ -728,14 +728,14 @@ static inline unsigned long bd_entry_virt_space(struct mm_struct *mm) /* * This covers 32-bit emulation as well as 32-bit kernels - * running on 64-bit harware. + * running on 64-bit hardware. */ if (!is_64bit_mm(mm)) return (4ULL * GB) / MPX_BD_NR_ENTRIES_32; /* * 'x86_virt_bits' returns what the hardware is capable - * of, and returns the full >32-bit adddress space when + * of, and returns the full >32-bit address space when * running 32-bit kernels on 64-bit hardware. */ virt_space = (1ULL << boot_cpu_data.x86_virt_bits); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index d04f8094bc23..f70c1ff46125 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -465,46 +465,67 @@ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) return true; } +/* + * Mark all currently memblock-reserved physical memory (which covers the + * kernel's own memory ranges) as hot-unswappable. + */ static void __init numa_clear_kernel_node_hotplug(void) { - int i, nid; - nodemask_t numa_kernel_nodes = NODE_MASK_NONE; - phys_addr_t start, end; - struct memblock_region *r; + nodemask_t reserved_nodemask = NODE_MASK_NONE; + struct memblock_region *mb_region; + int i; /* + * We have to do some preprocessing of memblock regions, to + * make them suitable for reservation. + * * At this time, all memory regions reserved by memblock are - * used by the kernel. Set the nid in memblock.reserved will - * mark out all the nodes the kernel resides in. + * used by the kernel, but those regions are not split up + * along node boundaries yet, and don't necessarily have their + * node ID set yet either. + * + * So iterate over all memory known to the x86 architecture, + * and use those ranges to set the nid in memblock.reserved. + * This will split up the memblock regions along node + * boundaries and will set the node IDs as well. */ for (i = 0; i < numa_meminfo.nr_blks; i++) { - struct numa_memblk *mb = &numa_meminfo.blk[i]; + struct numa_memblk *mb = numa_meminfo.blk + i; + int ret; - memblock_set_node(mb->start, mb->end - mb->start, - &memblock.reserved, mb->nid); + ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid); + WARN_ON_ONCE(ret); } /* - * Mark all kernel nodes. + * Now go over all reserved memblock regions, to construct a + * node mask of all kernel reserved memory areas. * - * When booting with mem=nn[kMG] or in a kdump kernel, numa_meminfo - * may not include all the memblock.reserved memory ranges because - * trim_snb_memory() reserves specific pages for Sandy Bridge graphics. + * [ Note, when booting with mem=nn[kMG] or in a kdump kernel, + * numa_meminfo might not include all memblock.reserved + * memory ranges, because quirks such as trim_snb_memory() + * reserve specific pages for Sandy Bridge graphics. ] */ - for_each_memblock(reserved, r) - if (r->nid != MAX_NUMNODES) - node_set(r->nid, numa_kernel_nodes); + for_each_memblock(reserved, mb_region) { + if (mb_region->nid != MAX_NUMNODES) + node_set(mb_region->nid, reserved_nodemask); + } - /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */ + /* + * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory + * belonging to the reserved node mask. + * + * Note that this will include memory regions that reside + * on nodes that contain kernel memory - entire nodes + * become hot-unpluggable: + */ for (i = 0; i < numa_meminfo.nr_blks; i++) { - nid = numa_meminfo.blk[i].nid; - if (!node_isset(nid, numa_kernel_nodes)) - continue; + struct numa_memblk *mb = numa_meminfo.blk + i; - start = numa_meminfo.blk[i].start; - end = numa_meminfo.blk[i].end; + if (!node_isset(mb->nid, reserved_nodemask)) + continue; - memblock_clear_hotplug(start, end - start); + memblock_clear_hotplug(mb->start, mb->end - mb->start); } } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 2440814b0069..01be9ec3bf79 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -106,12 +106,6 @@ static inline unsigned long highmap_end_pfn(void) #endif -#ifdef CONFIG_DEBUG_PAGEALLOC -# define debug_pagealloc 1 -#else -# define debug_pagealloc 0 -#endif - static inline int within(unsigned long addr, unsigned long start, unsigned long end) { @@ -283,7 +277,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, __pa_symbol(__end_rodata) >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; -#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +#if defined(CONFIG_X86_64) /* * Once the kernel maps the text as RO (kernel_set_to_readonly is set), * kernel text mappings for the large page aligned text, rodata sections @@ -419,24 +413,30 @@ pmd_t *lookup_pmd_address(unsigned long address) phys_addr_t slow_virt_to_phys(void *__virt_addr) { unsigned long virt_addr = (unsigned long)__virt_addr; - unsigned long phys_addr, offset; + phys_addr_t phys_addr; + unsigned long offset; enum pg_level level; pte_t *pte; pte = lookup_address(virt_addr, &level); BUG_ON(!pte); + /* + * pXX_pfn() returns unsigned long, which must be cast to phys_addr_t + * before being left-shifted PAGE_SHIFT bits -- this trick is to + * make 32-PAE kernel work correctly. + */ switch (level) { case PG_LEVEL_1G: - phys_addr = pud_pfn(*(pud_t *)pte) << PAGE_SHIFT; + phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT; offset = virt_addr & ~PUD_PAGE_MASK; break; case PG_LEVEL_2M: - phys_addr = pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT; + phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT; offset = virt_addr & ~PMD_PAGE_MASK; break; default: - phys_addr = pte_pfn(*pte) << PAGE_SHIFT; + phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; offset = virt_addr & ~PAGE_MASK; } @@ -708,10 +708,10 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte, { struct page *base; - if (!debug_pagealloc) + if (!debug_pagealloc_enabled()) spin_unlock(&cpa_lock); base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); - if (!debug_pagealloc) + if (!debug_pagealloc_enabled()) spin_lock(&cpa_lock); if (!base) return -ENOMEM; @@ -909,16 +909,25 @@ static void populate_pte(struct cpa_data *cpa, pte = pte_offset_kernel(pmd, start); - while (num_pages-- && start < end) { + /* + * Set the GLOBAL flags only if the PRESENT flag is + * set otherwise pte_present will return true even on + * a non present pte. The canon_pgprot will clear + * _PAGE_GLOBAL for the ancient hardware that doesn't + * support it. + */ + if (pgprot_val(pgprot) & _PAGE_PRESENT) + pgprot_val(pgprot) |= _PAGE_GLOBAL; + else + pgprot_val(pgprot) &= ~_PAGE_GLOBAL; - /* deal with the NX bit */ - if (!(pgprot_val(pgprot) & _PAGE_NX)) - cpa->pfn &= ~_PAGE_NX; + pgprot = canon_pgprot(pgprot); - set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot)); + while (num_pages-- && start < end) { + set_pte(pte, pfn_pte(cpa->pfn, pgprot)); start += PAGE_SIZE; - cpa->pfn += PAGE_SIZE; + cpa->pfn++; pte++; } } @@ -974,11 +983,11 @@ static int populate_pmd(struct cpa_data *cpa, pmd = pmd_offset(pud, start); - set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | + set_pmd(pmd, __pmd(cpa->pfn << PAGE_SHIFT | _PAGE_PSE | massage_pgprot(pmd_pgprot))); start += PMD_SIZE; - cpa->pfn += PMD_SIZE; + cpa->pfn += PMD_SIZE >> PAGE_SHIFT; cur_pages += PMD_SIZE >> PAGE_SHIFT; } @@ -1046,12 +1055,12 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, /* * Map everything starting from the Gb boundary, possibly with 1G pages */ - while (end - start >= PUD_SIZE) { - set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | + while (cpu_has_gbpages && end - start >= PUD_SIZE) { + set_pud(pud, __pud(cpa->pfn << PAGE_SHIFT | _PAGE_PSE | massage_pgprot(pud_pgprot))); start += PUD_SIZE; - cpa->pfn += PUD_SIZE; + cpa->pfn += PUD_SIZE >> PAGE_SHIFT; cur_pages += PUD_SIZE >> PAGE_SHIFT; pud++; } @@ -1122,8 +1131,10 @@ static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, /* * Ignore all non primary paths. */ - if (!primary) + if (!primary) { + cpa->numpages = 1; return 0; + } /* * Ignore the NULL PTE for kernel identity mapping, as it is expected @@ -1331,10 +1342,10 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY)) cpa->numpages = 1; - if (!debug_pagealloc) + if (!debug_pagealloc_enabled()) spin_lock(&cpa_lock); ret = __change_page_attr(cpa, checkalias); - if (!debug_pagealloc) + if (!debug_pagealloc_enabled()) spin_unlock(&cpa_lock); if (ret) return ret; @@ -1962,6 +1973,9 @@ int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, if (!(page_flags & _PAGE_NX)) cpa.mask_clr = __pgprot(_PAGE_NX); + if (!(page_flags & _PAGE_RW)) + cpa.mask_clr = __pgprot(_PAGE_RW); + cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); retval = __change_page_attr_set_clr(&cpa, 0); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index f4ae536b0914..faec01e7a17d 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -149,7 +149,7 @@ enum { PAT_WT = 4, /* Write Through */ PAT_WP = 5, /* Write Protected */ PAT_WB = 6, /* Write Back (default) */ - PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */ + PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */ }; #define CM(c) (_PAGE_CACHE_MODE_ ## c) @@ -943,7 +943,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, return -EINVAL; } - *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) | cachemode2protval(pcm)); return 0; @@ -959,7 +959,7 @@ int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, /* Set prot based on lookup */ pcm = lookup_memtype(pfn_t_to_phys(pfn)); - *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) | cachemode2protval(pcm)); return 0; diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c new file mode 100644 index 000000000000..e8c474451928 --- /dev/null +++ b/arch/x86/mm/pkeys.c @@ -0,0 +1,101 @@ +/* + * Intel Memory Protection Keys management + * Copyright (c) 2015, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include <linux/mm_types.h> /* mm_struct, vma, etc... */ +#include <linux/pkeys.h> /* PKEY_* */ +#include <uapi/asm-generic/mman-common.h> + +#include <asm/cpufeature.h> /* boot_cpu_has, ... */ +#include <asm/mmu_context.h> /* vma_pkey() */ +#include <asm/fpu/internal.h> /* fpregs_active() */ + +int __execute_only_pkey(struct mm_struct *mm) +{ + int ret; + + /* + * We do not want to go through the relatively costly + * dance to set PKRU if we do not need to. Check it + * first and assume that if the execute-only pkey is + * write-disabled that we do not have to set it + * ourselves. We need preempt off so that nobody + * can make fpregs inactive. + */ + preempt_disable(); + if (fpregs_active() && + !__pkru_allows_read(read_pkru(), PKEY_DEDICATED_EXECUTE_ONLY)) { + preempt_enable(); + return PKEY_DEDICATED_EXECUTE_ONLY; + } + preempt_enable(); + ret = arch_set_user_pkey_access(current, PKEY_DEDICATED_EXECUTE_ONLY, + PKEY_DISABLE_ACCESS); + /* + * If the PKRU-set operation failed somehow, just return + * 0 and effectively disable execute-only support. + */ + if (ret) + return 0; + + return PKEY_DEDICATED_EXECUTE_ONLY; +} + +static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma) +{ + /* Do this check first since the vm_flags should be hot */ + if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC) + return false; + if (vma_pkey(vma) != PKEY_DEDICATED_EXECUTE_ONLY) + return false; + + return true; +} + +/* + * This is only called for *plain* mprotect calls. + */ +int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey) +{ + /* + * Is this an mprotect_pkey() call? If so, never + * override the value that came from the user. + */ + if (pkey != -1) + return pkey; + /* + * Look for a protection-key-drive execute-only mapping + * which is now being given permissions that are not + * execute-only. Move it back to the default pkey. + */ + if (vma_is_pkey_exec_only(vma) && + (prot & (PROT_READ|PROT_WRITE))) { + return 0; + } + /* + * The mapping is execute-only. Go try to get the + * execute-only protection key. If we fail to do that, + * fall through as if we do not have execute-only + * support. + */ + if (prot == PROT_EXEC) { + pkey = execute_only_pkey(vma->vm_mm); + if (pkey > 0) + return pkey; + } + /* + * This is a vanilla, non-pkey mprotect (or we failed to + * setup execute-only), inherit the pkey from the VMA we + * are working on. + */ + return vma_pkey(vma); +} diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index 92e2eacb3321..8bea84724a7d 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -4,6 +4,7 @@ #include <asm/pgtable.h> #include <asm/proto.h> +#include <asm/cpufeature.h> static int disable_nx; @@ -31,9 +32,8 @@ early_param("noexec", noexec_setup); void x86_configure_nx(void) { - if (boot_cpu_has(X86_FEATURE_NX) && !disable_nx) - __supported_pte_mask |= _PAGE_NX; - else + /* If disable_nx is set, clear NX on all new mappings going forward. */ + if (disable_nx) __supported_pte_mask &= ~_PAGE_NX; } |