author | Mike Marshall <hubcap@omnibond.com> | 2016-01-15 20:22:14 +0100
committer | Mike Marshall <hubcap@omnibond.com> | 2016-01-15 20:22:14 +0100
commit | 5e1f3938f9e4ef539031d639879cb1cea870fab5 (patch)
tree | b262a7c002b05b7bc800a106d6ca6777dabe15bf /arch/x86
parent | Orangefs: add verification to decode_dirents (diff)
parent | Linux 4.4 (diff)
Orangefs: merge with V4.4
Merge tag 'v4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux into current
Linux 4.4
Diffstat (limited to 'arch/x86')
48 files changed, 327 insertions, 178 deletions
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 0033e96c3f09..9011a88353de 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -23,7 +23,6 @@ #include <stdarg.h> #include <linux/types.h> #include <linux/edd.h> -#include <asm/boot.h> #include <asm/setup.h> #include "bitops.h" #include "ctype.h" diff --git a/arch/x86/boot/video-mode.c b/arch/x86/boot/video-mode.c index aa8a96b052e3..95c7a818c0ed 100644 --- a/arch/x86/boot/video-mode.c +++ b/arch/x86/boot/video-mode.c @@ -19,6 +19,8 @@ #include "video.h" #include "vesa.h" +#include <uapi/asm/boot.h> + /* * Common variables */ diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c index 05111bb8d018..77780e386e9b 100644 --- a/arch/x86/boot/video.c +++ b/arch/x86/boot/video.c @@ -13,6 +13,8 @@ * Select video mode */ +#include <uapi/asm/boot.h> + #include "boot.h" #include "video.h" #include "vesa.h" diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index a89fdbc1f0be..03663740c866 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -421,7 +421,7 @@ __visible long do_fast_syscall_32(struct pt_regs *regs) regs->ip = landing_pad; /* - * Fetch ECX from where the vDSO stashed it. + * Fetch EBP from where the vDSO stashed it. * * WARNING: We are in CONTEXT_USER and RCU isn't paying attention! */ @@ -432,10 +432,10 @@ __visible long do_fast_syscall_32(struct pt_regs *regs) * Micro-optimization: the pointer we're following is explicitly * 32 bits, so it can't be out of range. */ - __get_user(*(u32 *)®s->cx, + __get_user(*(u32 *)®s->bp, (u32 __user __force *)(unsigned long)(u32)regs->sp) #else - get_user(*(u32 *)®s->cx, + get_user(*(u32 *)®s->bp, (u32 __user __force *)(unsigned long)(u32)regs->sp) #endif ) { diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 3eb572ed3d7a..f3b6d54e0042 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -292,7 +292,7 @@ ENTRY(entry_SYSENTER_32) movl TSS_sysenter_sp0(%esp), %esp sysenter_past_esp: pushl $__USER_DS /* pt_regs->ss */ - pushl %ecx /* pt_regs->cx */ + pushl %ebp /* pt_regs->sp (stashed in bp) */ pushfl /* pt_regs->flags (except IF = 0) */ orl $X86_EFLAGS_IF, (%esp) /* Fix IF */ pushl $__USER_CS /* pt_regs->cs */ @@ -308,8 +308,9 @@ sysenter_past_esp: movl %esp, %eax call do_fast_syscall_32 - testl %eax, %eax - jz .Lsyscall_32_done + /* XEN PV guests always use IRET path */ + ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ + "jmp .Lsyscall_32_done", X86_FEATURE_XENPV /* Opportunistic SYSEXIT */ TRACE_IRQS_ON /* User mode traces as IRQs on. */ diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 53616ca03244..a55697d19824 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -509,6 +509,17 @@ END(irq_entries_start) * tracking that we're in kernel mode. */ SWAPGS + + /* + * We need to tell lockdep that IRQs are off. We can't do this until + * we fix gsbase, and we should do it before enter_from_user_mode + * (which can take locks). Since TRACE_IRQS_OFF idempotent, + * the simplest way to handle it is to just call it twice if + * we enter from user mode. There's no reason to optimize this since + * TRACE_IRQS_OFF is a no-op if lockdep is off. + */ + TRACE_IRQS_OFF + #ifdef CONFIG_CONTEXT_TRACKING call enter_from_user_mode #endif @@ -1049,12 +1060,18 @@ ENTRY(error_entry) SWAPGS .Lerror_entry_from_usermode_after_swapgs: + /* + * We need to tell lockdep that IRQs are off. 
We can't do this until + * we fix gsbase, and we should do it before enter_from_user_mode + * (which can take locks). + */ + TRACE_IRQS_OFF #ifdef CONFIG_CONTEXT_TRACKING call enter_from_user_mode #endif + ret .Lerror_entry_done: - TRACE_IRQS_OFF ret diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index c3201830a85e..6a1ae3751e82 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -63,7 +63,7 @@ ENTRY(entry_SYSENTER_compat) /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ - pushq %rcx /* pt_regs->sp */ + pushq %rbp /* pt_regs->sp (stashed in bp) */ /* * Push flags. This is nasty. First, interrupts are currently @@ -82,14 +82,14 @@ ENTRY(entry_SYSENTER_compat) pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ pushq %rdx /* pt_regs->dx */ - pushq %rcx /* pt_regs->cx (will be overwritten) */ + pushq %rcx /* pt_regs->cx */ pushq $-ENOSYS /* pt_regs->ax */ pushq %r8 /* pt_regs->r8 = 0 */ pushq %r8 /* pt_regs->r9 = 0 */ pushq %r8 /* pt_regs->r10 = 0 */ pushq %r8 /* pt_regs->r11 = 0 */ pushq %rbx /* pt_regs->rbx */ - pushq %rbp /* pt_regs->rbp */ + pushq %rbp /* pt_regs->rbp (will be overwritten) */ pushq %r8 /* pt_regs->r12 = 0 */ pushq %r8 /* pt_regs->r13 = 0 */ pushq %r8 /* pt_regs->r14 = 0 */ @@ -121,8 +121,9 @@ sysenter_flags_fixed: movq %rsp, %rdi call do_fast_syscall_32 - testl %eax, %eax - jz .Lsyscall_32_done + /* XEN PV guests always use IRET path */ + ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ + "jmp .Lsyscall_32_done", X86_FEATURE_XENPV jmp sysret32_from_system_call sysenter_fix_flags: @@ -178,7 +179,7 @@ ENTRY(entry_SYSCALL_compat) pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ pushq %rdx /* pt_regs->dx */ - pushq %rcx /* pt_regs->cx (will be overwritten) */ + pushq %rbp /* pt_regs->cx (stashed in bp) */ pushq $-ENOSYS /* pt_regs->ax */ xorq %r8,%r8 pushq %r8 /* pt_regs->r8 = 0 */ @@ -186,7 +187,7 @@ ENTRY(entry_SYSCALL_compat) pushq %r8 /* pt_regs->r10 = 0 */ pushq %r8 /* pt_regs->r11 = 0 */ pushq %rbx /* pt_regs->rbx */ - pushq %rbp /* pt_regs->rbp */ + pushq %rbp /* pt_regs->rbp (will be overwritten) */ pushq %r8 /* pt_regs->r12 = 0 */ pushq %r8 /* pt_regs->r13 = 0 */ pushq %r8 /* pt_regs->r14 = 0 */ @@ -200,8 +201,9 @@ ENTRY(entry_SYSCALL_compat) movq %rsp, %rdi call do_fast_syscall_32 - testl %eax, %eax - jz .Lsyscall_32_done + /* XEN PV guests always use IRET path */ + ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ + "jmp .Lsyscall_32_done", X86_FEATURE_XENPV /* Opportunistic SYSRET */ sysret32_from_system_call: diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S index 93bd8452383f..3a1d9297074b 100644 --- a/arch/x86/entry/vdso/vdso32/system_call.S +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -1,5 +1,5 @@ /* - * Code for the vDSO. This version uses the old int $0x80 method. + * AT_SYSINFO entry point */ #include <asm/dwarf2.h> @@ -21,35 +21,67 @@ __kernel_vsyscall: /* * Reshuffle regs so that all of any of the entry instructions * will preserve enough state. + * + * A really nice entry sequence would be: + * pushl %edx + * pushl %ecx + * movl %esp, %ecx + * + * Unfortunately, naughty Android versions between July and December + * 2015 actually hardcode the traditional Linux SYSENTER entry + * sequence. That is severely broken for a number of reasons (ask + * anyone with an AMD CPU, for example). Nonetheless, we try to keep + * it working approximately as well as it ever worked. 
+ * + * This link may eludicate some of the history: + * https://android-review.googlesource.com/#/q/Iac3295376d61ef83e713ac9b528f3b50aa780cd7 + * personally, I find it hard to understand what's going on there. + * + * Note to future user developers: DO NOT USE SYSENTER IN YOUR CODE. + * Execute an indirect call to the address in the AT_SYSINFO auxv + * entry. That is the ONLY correct way to make a fast 32-bit system + * call on Linux. (Open-coding int $0x80 is also fine, but it's + * slow.) */ + pushl %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx, 0 pushl %edx CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET edx, 0 - pushl %ecx + pushl %ebp CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ecx, 0 - movl %esp, %ecx + CFI_REL_OFFSET ebp, 0 + + #define SYSENTER_SEQUENCE "movl %esp, %ebp; sysenter" + #define SYSCALL_SEQUENCE "movl %ecx, %ebp; syscall" #ifdef CONFIG_X86_64 /* If SYSENTER (Intel) or SYSCALL32 (AMD) is available, use it. */ - ALTERNATIVE_2 "", "sysenter", X86_FEATURE_SYSENTER32, \ - "syscall", X86_FEATURE_SYSCALL32 + ALTERNATIVE_2 "", SYSENTER_SEQUENCE, X86_FEATURE_SYSENTER32, \ + SYSCALL_SEQUENCE, X86_FEATURE_SYSCALL32 #else - ALTERNATIVE "", "sysenter", X86_FEATURE_SEP + ALTERNATIVE "", SYSENTER_SEQUENCE, X86_FEATURE_SEP #endif /* Enter using int $0x80 */ - movl (%esp), %ecx int $0x80 GLOBAL(int80_landing_pad) - /* Restore ECX and EDX in case they were clobbered. */ - popl %ecx - CFI_RESTORE ecx + /* + * Restore EDX and ECX in case they were clobbered. EBP is not + * clobbered (the kernel restores it), but it's cleaner and + * probably faster to pop it than to adjust ESP using addl. + */ + popl %ebp + CFI_RESTORE ebp CFI_ADJUST_CFA_OFFSET -4 popl %edx CFI_RESTORE edx CFI_ADJUST_CFA_OFFSET -4 + popl %ecx + CFI_RESTORE ecx + CFI_ADJUST_CFA_OFFSET -4 ret CFI_ENDPROC diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index e4f8010f22e0..f7ba9fbf12ee 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -216,6 +216,7 @@ #define X86_FEATURE_PAUSEFILTER ( 8*32+13) /* AMD filtered pause intercept */ #define X86_FEATURE_PFTHRESHOLD ( 8*32+14) /* AMD pause filter threshold */ #define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ +#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 9f3905697f12..690b4027e17c 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -35,7 +35,7 @@ #define MSR_IA32_PERFCTR0 0x000000c1 #define MSR_IA32_PERFCTR1 0x000000c2 #define MSR_FSB_FREQ 0x000000cd -#define MSR_NHM_PLATFORM_INFO 0x000000ce +#define MSR_PLATFORM_INFO 0x000000ce #define MSR_NHM_SNB_PKG_CST_CFG_CTL 0x000000e2 #define NHM_C3_AUTO_DEMOTE (1UL << 25) @@ -44,7 +44,6 @@ #define SNB_C1_AUTO_UNDEMOTE (1UL << 27) #define SNB_C3_AUTO_UNDEMOTE (1UL << 28) -#define MSR_PLATFORM_INFO 0x000000ce #define MSR_MTRRcap 0x000000fe #define MSR_IA32_BBL_CR_CTL 0x00000119 #define MSR_IA32_BBL_CR_CTL3 0x0000011e diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index c5b7fb2774d0..cc071c6f7d4d 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -9,19 +9,21 @@ #define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) +#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT) +#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1)) + +#define PUD_PAGE_SIZE (_AC(1, UL) << 
PUD_SHIFT) +#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) + #define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1)) #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) -/* Cast PAGE_MASK to a signed type so that it is sign-extended if +/* Cast *PAGE_MASK to a signed type so that it is sign-extended if virtual addresses are 32-bits but physical addresses are larger (ie, 32-bit PAE). */ #define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK) - -#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT) -#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1)) - -#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) -#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) +#define PHYSICAL_PMD_PAGE_MASK (((signed long)PMD_PAGE_MASK) & __PHYSICAL_MASK) +#define PHYSICAL_PUD_PAGE_MASK (((signed long)PUD_PAGE_MASK) & __PHYSICAL_MASK) #define HPAGE_SHIFT PMD_SHIFT #define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 10d0596433f8..c759b3cca663 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -19,6 +19,12 @@ static inline int paravirt_enabled(void) return pv_info.paravirt_enabled; } +static inline int paravirt_has_feature(unsigned int feature) +{ + WARN_ON_ONCE(!pv_info.paravirt_enabled); + return (pv_info.features & feature); +} + static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) { diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 31247b5bff7c..3d44191185f8 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -70,9 +70,14 @@ struct pv_info { #endif int paravirt_enabled; + unsigned int features; /* valid only if paravirt_enabled is set */ const char *name; }; +#define paravirt_has(x) paravirt_has_feature(PV_SUPPORTED_##x) +/* Supported features */ +#define PV_SUPPORTED_RTC (1<<0) + struct pv_init_ops { /* * Patch may replace one of the defined code sequences with diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index dd5b0aa9dd2f..a471cadb9630 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -279,17 +279,14 @@ static inline pmdval_t native_pmd_val(pmd_t pmd) static inline pudval_t pud_pfn_mask(pud_t pud) { if (native_pud_val(pud) & _PAGE_PSE) - return PUD_PAGE_MASK & PHYSICAL_PAGE_MASK; + return PHYSICAL_PUD_PAGE_MASK; else return PTE_PFN_MASK; } static inline pudval_t pud_flags_mask(pud_t pud) { - if (native_pud_val(pud) & _PAGE_PSE) - return ~(PUD_PAGE_MASK & (pudval_t)PHYSICAL_PAGE_MASK); - else - return ~PTE_PFN_MASK; + return ~pud_pfn_mask(pud); } static inline pudval_t pud_flags(pud_t pud) @@ -300,17 +297,14 @@ static inline pudval_t pud_flags(pud_t pud) static inline pmdval_t pmd_pfn_mask(pmd_t pmd) { if (native_pmd_val(pmd) & _PAGE_PSE) - return PMD_PAGE_MASK & PHYSICAL_PAGE_MASK; + return PHYSICAL_PMD_PAGE_MASK; else return PTE_PFN_MASK; } static inline pmdval_t pmd_flags_mask(pmd_t pmd) { - if (native_pmd_val(pmd) & _PAGE_PSE) - return ~(PMD_PAGE_MASK & (pmdval_t)PHYSICAL_PAGE_MASK); - else - return ~PTE_PFN_MASK; + return ~pmd_pfn_mask(pmd); } static inline pmdval_t pmd_flags(pmd_t pmd) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 67522256c7ff..2d5a50cb61a2 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -472,6 +472,7 @@ static inline unsigned long current_top_of_stack(void) #else #define 
__cpuid native_cpuid #define paravirt_enabled() 0 +#define paravirt_has(x) 0 static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 48d34d28f5a6..cd0fc0cc78bc 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -1,7 +1,6 @@ #ifndef _ASM_X86_PLATFORM_H #define _ASM_X86_PLATFORM_H -#include <asm/pgtable_types.h> #include <asm/bootparam.h> struct mpc_bus; diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 38dd5efdd04c..2bd2292a316d 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -193,20 +193,17 @@ static int __init numachip_system_init(void) case 1: init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE); numachip_apic_icr_write = numachip1_apic_icr_write; - x86_init.pci.arch_init = pci_numachip_init; break; case 2: init_extra_mapping_uc(NUMACHIP2_LCSR_BASE, NUMACHIP2_LCSR_SIZE); numachip_apic_icr_write = numachip2_apic_icr_write; - - /* Use MCFG config cycles rather than locked CF8 cycles */ - raw_pci_ops = &pci_mmcfg; break; default: return 0; } x86_cpuinit.fixup_cpu_id = fixup_cpu_id; + x86_init.pci.arch_init = pci_numachip_init; return 0; } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4ddd780aeac9..c2b7522cbf35 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -273,10 +273,9 @@ __setup("nosmap", setup_disable_smap); static __always_inline void setup_smap(struct cpuinfo_x86 *c) { - unsigned long eflags; + unsigned long eflags = native_save_fl(); /* This should have been cleared long ago */ - raw_local_save_flags(eflags); BUG_ON(eflags & X86_EFLAGS_AC); if (cpu_has(c, X86_FEATURE_SMAP)) { diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index c5b0d562dbf5..7e8a736d09db 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -999,6 +999,17 @@ void do_machine_check(struct pt_regs *regs, long error_code) int flags = MF_ACTION_REQUIRED; int lmce = 0; + /* If this CPU is offline, just bail out. 
*/ + if (cpu_is_offline(smp_processor_id())) { + u64 mcgstatus; + + mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + if (mcgstatus & MCG_STATUS_RIPV) { + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); + return; + } + } + ist_enter(regs); this_cpu_inc(mce_exception_count); diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 7fc27f1cca58..b3e94ef461fd 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -698,3 +698,4 @@ int __init microcode_init(void) return error; } +late_initcall(microcode_init); diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 4562cf070c27..2bf79d7c97df 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -5,7 +5,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> * Copyright (C) 2009 Google, Inc., Stephane Eranian * diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 499f533dd3cc..d0e35ebb2adb 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -5,7 +5,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> * Copyright (C) 2009 Google, Inc., Stephane Eranian * @@ -387,7 +387,7 @@ struct cpu_hw_events { /* Check flags and event code/umask, and set the HSW N/A flag */ #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \ __EVENT_CONSTRAINT(code, n, \ - INTEL_ARCH_EVENT_MASK|INTEL_ARCH_EVENT_MASK, \ + INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW) @@ -627,6 +627,7 @@ struct x86_perf_task_context { u64 lbr_from[MAX_LBR_ENTRIES]; u64 lbr_to[MAX_LBR_ENTRIES]; u64 lbr_info[MAX_LBR_ENTRIES]; + int tos; int lbr_callstack_users; int lbr_stack_state; }; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index f63360be2238..e2a430021e46 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -232,7 +232,7 @@ static struct event_constraint intel_hsw_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.* */ + INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index 377e8f8ed391..a316ca96f1b6 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -298,7 +298,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b) static inline struct perf_cgroup 
*event_to_cgroup(struct perf_event *event) { if (event->attach_state & PERF_ATTACH_TASK) - return perf_cgroup_from_task(event->hw.target); + return perf_cgroup_from_task(event->hw.target, event->ctx); return event->cgrp; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index bfd0b717e944..659f01e165d5 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -239,7 +239,7 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) } mask = x86_pmu.lbr_nr - 1; - tos = intel_pmu_lbr_tos(); + tos = task_ctx->tos; for (i = 0; i < tos; i++) { lbr_idx = (tos - i) & mask; wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); @@ -247,6 +247,7 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); } + wrmsrl(x86_pmu.lbr_tos, tos); task_ctx->lbr_stack_state = LBR_NONE; } @@ -270,6 +271,7 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); } + task_ctx->tos = tos; task_ctx->lbr_stack_state = LBR_VALID; } diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index ef29b742cea7..31c6a60505e6 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -385,20 +385,19 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, */ void fpu__init_prepare_fx_sw_frame(void) { - int fsave_header_size = sizeof(struct fregs_state); int size = xstate_size + FP_XSTATE_MAGIC2_SIZE; - if (config_enabled(CONFIG_X86_32)) - size += fsave_header_size; - fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; fx_sw_reserved.extended_size = size; fx_sw_reserved.xfeatures = xfeatures_mask; fx_sw_reserved.xstate_size = xstate_size; - if (config_enabled(CONFIG_IA32_EMULATION)) { + if (config_enabled(CONFIG_IA32_EMULATION) || + config_enabled(CONFIG_X86_32)) { + int fsave_header_size = sizeof(struct fregs_state); + fx_sw_reserved_ia32 = fx_sw_reserved; - fx_sw_reserved_ia32.extended_size += fsave_header_size; + fx_sw_reserved_ia32.extended_size = size + fsave_header_size; } } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 6454f2731b56..70fc312221fc 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -694,7 +694,6 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) if (!boot_cpu_has(X86_FEATURE_XSAVE)) return NULL; - xsave = ¤t->thread.fpu.state.xsave; /* * We should not ever be requesting features that we * have not enabled. 
Remember that pcntxt_mask is diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index dc5fa6a1e8d6..3512ba607361 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -1,7 +1,7 @@ /* * x86 specific code for irq_work * - * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra */ #include <linux/kernel.h> diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S index 94ea120fa21f..87e1762e2bca 100644 --- a/arch/x86/kernel/mcount_64.S +++ b/arch/x86/kernel/mcount_64.S @@ -278,6 +278,12 @@ trace: /* save_mcount_regs fills in first two parameters */ save_mcount_regs + /* + * When DYNAMIC_FTRACE is not defined, ARCH_SUPPORTS_FTRACE_OPS is not + * set (see include/asm/ftrace.h and include/linux/ftrace.h). Only the + * ip and parent ip are used and the list function is called when + * function tracing is enabled. + */ call *ftrace_trace_function restore_mcount_regs diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 4f00b63d7ff3..14415aff1813 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -4,10 +4,22 @@ */ #include <linux/platform_device.h> #include <linux/module.h> +#include <linux/ioport.h> + +static int found(u64 start, u64 end, void *data) +{ + return 1; +} static __init int register_e820_pmem(void) { + char *pmem = "Persistent Memory (legacy)"; struct platform_device *pdev; + int rc; + + rc = walk_iomem_res(pmem, IORESOURCE_MEM, 0, -1, NULL, found); + if (rc <= 0) + return 0; /* * See drivers/nvdimm/e820.c for the implementation, this is diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index cd9685235df9..4af8d063fb36 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -200,6 +200,9 @@ static __init int add_rtc_cmos(void) } #endif + if (paravirt_enabled() && !paravirt_has(RTC)) + return -ENODEV; + platform_device_register(&rtc_device); dev_info(&rtc_device.dev, "registered platform RTC device (no PNP device found)\n"); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 29db25f9a745..d2bbe343fda7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1250,8 +1250,6 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled(EFI_BOOT)) efi_apply_memmap_quirks(); #endif - - microcode_init(); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index b7ffb7c00075..cb6282c3638f 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -690,12 +690,15 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) signal_setup_done(failed, ksig, stepping); } -#ifdef CONFIG_X86_32 -#define NR_restart_syscall __NR_restart_syscall -#else /* !CONFIG_X86_32 */ -#define NR_restart_syscall \ - test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall -#endif /* CONFIG_X86_32 */ +static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) +{ +#if defined(CONFIG_X86_32) || !defined(CONFIG_X86_64) + return __NR_restart_syscall; +#else /* !CONFIG_X86_32 && CONFIG_X86_64 */ + return test_thread_flag(TIF_IA32) ? 
__NR_ia32_restart_syscall : + __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT); +#endif /* CONFIG_X86_32 || !CONFIG_X86_64 */ +} /* * Note that 'init' is a special process: it doesn't get signals it doesn't @@ -724,7 +727,7 @@ void do_signal(struct pt_regs *regs) break; case -ERESTART_RESTARTBLOCK: - regs->ax = NR_restart_syscall; + regs->ax = get_nr_restart_syscall(regs); regs->ip -= 2; break; } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 892ee2e5ecbc..fbabe4fcc7fb 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -509,7 +509,7 @@ void __inquire_remote_apic(int apicid) */ #define UDELAY_10MS_DEFAULT 10000 -static unsigned int init_udelay = INT_MAX; +static unsigned int init_udelay = UINT_MAX; static int __init cpu_init_udelay(char *str) { @@ -522,14 +522,15 @@ early_param("cpu_init_udelay", cpu_init_udelay); static void __init smp_quirk_init_udelay(void) { /* if cmdline changed it from default, leave it alone */ - if (init_udelay != INT_MAX) + if (init_udelay != UINT_MAX) return; /* if modern processor, use no delay */ if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) || - ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) + ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) { init_udelay = 0; - + return; + } /* else, use legacy delay */ init_udelay = UDELAY_10MS_DEFAULT; } diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 06332cb7e7d1..3f5c48ddba45 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -38,6 +38,14 @@ static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) return best && (best->ecx & bit(X86_FEATURE_XSAVE)); } +static inline bool guest_cpuid_has_mtrr(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + return best && (best->edx & bit(X86_FEATURE_MTRR)); +} + static inline bool guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 08116ff227cc..b0ea42b78ccd 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -420,6 +420,7 @@ void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_s u8 saved_mode; if (hpet_legacy_start) { /* save existing mode for later reenablement */ + WARN_ON(channel != 0); saved_mode = kvm->arch.vpit->pit_state.channels[0].mode; kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */ pit_load_count(kvm, channel, val); diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 9e8bf13572e6..3f8c732117ec 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -120,14 +120,22 @@ static u8 mtrr_default_type(struct kvm_mtrr *mtrr_state) return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK; } -static u8 mtrr_disabled_type(void) +static u8 mtrr_disabled_type(struct kvm_vcpu *vcpu) { /* * Intel SDM 11.11.2.2: all MTRRs are disabled when * IA32_MTRR_DEF_TYPE.E bit is cleared, and the UC * memory type is applied to all of physical memory. + * + * However, virtual machines can be run with CPUID such that + * there are no MTRRs. In that case, the firmware will never + * enable MTRRs and it is obviously undesirable to run the + * guest entirely with UC memory and we use WB. 
*/ - return MTRR_TYPE_UNCACHABLE; + if (guest_cpuid_has_mtrr(vcpu)) + return MTRR_TYPE_UNCACHABLE; + else + return MTRR_TYPE_WRBACK; } /* @@ -267,7 +275,7 @@ static int fixed_mtrr_addr_to_seg(u64 addr) for (seg = 0; seg < seg_num; seg++) { mtrr_seg = &fixed_seg_table[seg]; - if (mtrr_seg->start >= addr && addr < mtrr_seg->end) + if (mtrr_seg->start <= addr && addr < mtrr_seg->end) return seg; } @@ -300,7 +308,6 @@ static void var_mtrr_range(struct kvm_mtrr_range *range, u64 *start, u64 *end) *start = range->base & PAGE_MASK; mask = range->mask & PAGE_MASK; - mask |= ~0ULL << boot_cpu_data.x86_phys_bits; /* This cannot overflow because writing to the reserved bits of * variable MTRRs causes a #GP. @@ -356,10 +363,14 @@ static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (var_mtrr_range_is_valid(cur)) list_del(&mtrr_state->var_ranges[index].node); + /* Extend the mask with all 1 bits to the left, since those + * bits must implicitly be 0. The bits are then cleared + * when reading them. + */ if (!is_mtrr_mask) cur->base = data; else - cur->mask = data; + cur->mask = data | (-1LL << cpuid_maxphyaddr(vcpu)); /* add it to the list if it's enabled. */ if (var_mtrr_range_is_valid(cur)) { @@ -426,6 +437,8 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) *pdata = vcpu->arch.mtrr_state.var_ranges[index].base; else *pdata = vcpu->arch.mtrr_state.var_ranges[index].mask; + + *pdata &= (1ULL << cpuid_maxphyaddr(vcpu)) - 1; } return 0; @@ -670,7 +683,7 @@ u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) } if (iter.mtrr_disabled) - return mtrr_disabled_type(); + return mtrr_disabled_type(vcpu); /* not contained in any MTRRs. */ if (type == -1) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 83a1c643f9a5..899c40f826dd 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3422,6 +3422,8 @@ static int handle_exit(struct kvm_vcpu *vcpu) struct kvm_run *kvm_run = vcpu->run; u32 exit_code = svm->vmcb->control.exit_code; + trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); + if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) vcpu->arch.cr0 = svm->vmcb->save.cr0; if (npt_enabled) @@ -3892,8 +3894,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; - trace_kvm_exit(svm->vmcb->control.exit_code, vcpu, KVM_ISA_SVM); - if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_handle_nmi(&svm->vcpu); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 87acc5221740..44976a596fa6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2803,7 +2803,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.ia32_xss; break; case MSR_TSC_AUX: - if (!guest_cpuid_has_rdtscp(vcpu)) + if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated) return 1; /* Otherwise falls through */ default: @@ -2909,7 +2909,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) clear_atomic_switch_msr(vmx, MSR_IA32_XSS); break; case MSR_TSC_AUX: - if (!guest_cpuid_has_rdtscp(vcpu)) + if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated) return 1; /* Check reserved bit, higher 32 bits should be zero */ if ((data >> 32) != 0) @@ -7394,11 +7394,6 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) switch (type) { case VMX_VPID_EXTENT_ALL_CONTEXT: - if (get_vmcs12(vcpu)->virtual_processor_id == 0) { - nested_vmx_failValid(vcpu, - 
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - return 1; - } __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); nested_vmx_succeed(vcpu); break; @@ -8047,6 +8042,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) u32 exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; + trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); + /* * Flush logged GPAs PML buffer, this will make dirty_bitmap more * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before @@ -8673,7 +8670,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->launched = 1; vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); - trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX); /* * the KVM_REQ_EVENT optimization bit is only on for one entry, and if diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 00462bd63129..97592e190413 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2763,6 +2763,26 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, return 0; } +static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu) +{ + return (!lapic_in_kernel(vcpu) || + kvm_apic_accept_pic_intr(vcpu)); +} + +/* + * if userspace requested an interrupt window, check that the + * interrupt window is open. + * + * No need to exit to userspace if we already have an interrupt queued. + */ +static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu) +{ + return kvm_arch_interrupt_allowed(vcpu) && + !kvm_cpu_has_interrupt(vcpu) && + !kvm_event_needs_reinjection(vcpu) && + kvm_cpu_accept_dm_intr(vcpu); +} + static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { @@ -2786,6 +2806,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, return -EEXIST; vcpu->arch.pending_external_vector = irq->irq; + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; } @@ -3551,9 +3572,11 @@ static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) { + int i; mutex_lock(&kvm->arch.vpit->pit_state.lock); memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); - kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0); + for (i = 0; i < 3; i++) + kvm_pit_load_count(kvm, i, ps->channels[i].count, 0); mutex_unlock(&kvm->arch.vpit->pit_state.lock); return 0; } @@ -3572,6 +3595,7 @@ static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) { int start = 0; + int i; u32 prev_legacy, cur_legacy; mutex_lock(&kvm->arch.vpit->pit_state.lock); prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; @@ -3581,7 +3605,9 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels, sizeof(kvm->arch.vpit->pit_state.channels)); kvm->arch.vpit->pit_state.flags = ps->flags; - kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start); + for (i = 0; i < 3; i++) + kvm_pit_load_count(kvm, i, kvm->arch.vpit->pit_state.channels[i].count, + start && i == 0); mutex_unlock(&kvm->arch.vpit->pit_state.lock); return 0; } @@ -5910,23 +5936,10 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); } -/* - * Check if userspace requested an interrupt window, and that the - * interrupt window is open. - * - * No need to exit to userspace if we already have an interrupt queued. 
- */ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) { - if (!vcpu->run->request_interrupt_window || pic_in_kernel(vcpu->kvm)) - return false; - - if (kvm_cpu_has_interrupt(vcpu)) - return false; - - return (irqchip_split(vcpu->kvm) - ? kvm_apic_accept_pic_intr(vcpu) - : kvm_arch_interrupt_allowed(vcpu)); + return vcpu->run->request_interrupt_window && + likely(!pic_in_kernel(vcpu->kvm)); } static void post_kvm_run_save(struct kvm_vcpu *vcpu) @@ -5937,17 +5950,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0; kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); - if (!irqchip_in_kernel(vcpu->kvm)) - kvm_run->ready_for_interrupt_injection = - kvm_arch_interrupt_allowed(vcpu) && - !kvm_cpu_has_interrupt(vcpu) && - !kvm_event_needs_reinjection(vcpu); - else if (!pic_in_kernel(vcpu->kvm)) - kvm_run->ready_for_interrupt_injection = - kvm_apic_accept_pic_intr(vcpu) && - !kvm_cpu_has_interrupt(vcpu); - else - kvm_run->ready_for_interrupt_injection = 1; + kvm_run->ready_for_interrupt_injection = + pic_in_kernel(vcpu->kvm) || + kvm_vcpu_ready_for_interrupt_injection(vcpu); } static void update_cr8_intercept(struct kvm_vcpu *vcpu) @@ -6360,8 +6365,10 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; - bool req_int_win = !lapic_in_kernel(vcpu) && - vcpu->run->request_interrupt_window; + bool req_int_win = + dm_request_for_irq_injection(vcpu) && + kvm_cpu_accept_dm_intr(vcpu); + bool req_immediate_exit = false; if (vcpu->requests) { @@ -6513,6 +6520,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (req_immediate_exit) smp_send_reschedule(vcpu->cpu); + trace_kvm_entry(vcpu->vcpu_id); + wait_lapic_expire(vcpu); __kvm_guest_enter(); if (unlikely(vcpu->arch.switch_db_regs)) { @@ -6525,8 +6534,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } - trace_kvm_entry(vcpu->vcpu_id); - wait_lapic_expire(vcpu); kvm_x86_ops->run(vcpu); /* @@ -6663,7 +6670,8 @@ static int vcpu_run(struct kvm_vcpu *vcpu) if (kvm_cpu_has_pending_timer(vcpu)) kvm_inject_pending_timer_irqs(vcpu); - if (dm_request_for_irq_injection(vcpu)) { + if (dm_request_for_irq_injection(vcpu) && + kvm_vcpu_ready_for_interrupt_injection(vcpu)) { r = 0; vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; ++vcpu->stat.request_irq_exits; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index a0d09f6c6533..a43b2eafc466 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1414,6 +1414,7 @@ __init void lguest_init(void) pv_info.kernel_rpl = 1; /* Everyone except Xen runs with this set. */ pv_info.shared_kernel_pmd = 1; + pv_info.features = 0; /* * We set up all the lguest overrides for sensitive operations. 
These diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index a035c2aa7801..0f1c6fc3ddd8 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -89,7 +89,7 @@ static struct addr_marker address_markers[] = { { 0/* VMALLOC_START */, "vmalloc() Area" }, { 0/*VMALLOC_END*/, "vmalloc() End" }, # ifdef CONFIG_HIGHMEM - { 0/*PKMAP_BASE*/, "Persisent kmap() Area" }, + { 0/*PKMAP_BASE*/, "Persistent kmap() Area" }, # endif { 0/*FIXADDR_START*/, "Fixmap Area" }, #endif diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index b0ae85f90f10..b2fd67da1701 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -101,19 +101,19 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs, switch (type) { case REG_TYPE_RM: regno = X86_MODRM_RM(insn->modrm.value); - if (X86_REX_B(insn->rex_prefix.value) == 1) + if (X86_REX_B(insn->rex_prefix.value)) regno += 8; break; case REG_TYPE_INDEX: regno = X86_SIB_INDEX(insn->sib.value); - if (X86_REX_X(insn->rex_prefix.value) == 1) + if (X86_REX_X(insn->rex_prefix.value)) regno += 8; break; case REG_TYPE_BASE: regno = X86_SIB_BASE(insn->sib.value); - if (X86_REX_B(insn->rex_prefix.value) == 1) + if (X86_REX_B(insn->rex_prefix.value)) regno += 8; break; @@ -586,6 +586,29 @@ static unsigned long mpx_bd_entry_to_bt_addr(struct mm_struct *mm, } /* + * We only want to do a 4-byte get_user() on 32-bit. Otherwise, + * we might run off the end of the bounds table if we are on + * a 64-bit kernel and try to get 8 bytes. + */ +int get_user_bd_entry(struct mm_struct *mm, unsigned long *bd_entry_ret, + long __user *bd_entry_ptr) +{ + u32 bd_entry_32; + int ret; + + if (is_64bit_mm(mm)) + return get_user(*bd_entry_ret, bd_entry_ptr); + + /* + * Note that get_user() uses the type of the *pointer* to + * establish the size of the get, not the destination. + */ + ret = get_user(bd_entry_32, (u32 __user *)bd_entry_ptr); + *bd_entry_ret = bd_entry_32; + return ret; +} + +/* * Get the base of bounds tables pointed by specific bounds * directory entry. */ @@ -605,7 +628,7 @@ static int get_bt_addr(struct mm_struct *mm, int need_write = 0; pagefault_disable(); - ret = get_user(bd_entry, bd_entry_ptr); + ret = get_user_bd_entry(mm, &bd_entry, bd_entry_ptr); pagefault_enable(); if (!ret) break; @@ -700,11 +723,23 @@ static unsigned long mpx_get_bt_entry_offset_bytes(struct mm_struct *mm, */ static inline unsigned long bd_entry_virt_space(struct mm_struct *mm) { - unsigned long long virt_space = (1ULL << boot_cpu_data.x86_virt_bits); - if (is_64bit_mm(mm)) - return virt_space / MPX_BD_NR_ENTRIES_64; - else - return virt_space / MPX_BD_NR_ENTRIES_32; + unsigned long long virt_space; + unsigned long long GB = (1ULL << 30); + + /* + * This covers 32-bit emulation as well as 32-bit kernels + * running on 64-bit harware. + */ + if (!is_64bit_mm(mm)) + return (4ULL * GB) / MPX_BD_NR_ENTRIES_32; + + /* + * 'x86_virt_bits' returns what the hardware is capable + * of, and returns the full >32-bit adddress space when + * running 32-bit kernels on 64-bit hardware. 
+ */ + virt_space = (1ULL << boot_cpu_data.x86_virt_bits); + return virt_space / MPX_BD_NR_ENTRIES_64; } /* diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c index 7bcf06a7cd12..6eb3c8af96e2 100644 --- a/arch/x86/pci/bus_numa.c +++ b/arch/x86/pci/bus_numa.c @@ -50,18 +50,9 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources) if (!found) pci_add_resource(resources, &info->busn); - list_for_each_entry(root_res, &info->resources, list) { - struct resource *res; - struct resource *root; + list_for_each_entry(root_res, &info->resources, list) + pci_add_resource(resources, &root_res->res); - res = &root_res->res; - pci_add_resource(resources, res); - if (res->flags & IORESOURCE_IO) - root = &ioport_resource; - else - root = &iomem_resource; - insert_resource(root, res); - } return; default_resources: diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 06934a8a4872..14fcd01ed992 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -211,7 +211,7 @@ static int copy_sc_from_user(struct pt_regs *regs, if (err) return 1; - err = convert_fxsr_from_user(&fpx, sc.fpstate); + err = convert_fxsr_from_user(&fpx, (void *)sc.fpstate); if (err) return 1; @@ -227,7 +227,7 @@ static int copy_sc_from_user(struct pt_regs *regs, { struct user_i387_struct fp; - err = copy_from_user(&fp, sc.fpstate, + err = copy_from_user(&fp, (void *)sc.fpstate, sizeof(struct user_i387_struct)); if (err) return 1; @@ -291,7 +291,7 @@ static int copy_sc_to_user(struct sigcontext __user *to, #endif #undef PUTREG sc.oldmask = mask; - sc.fpstate = to_fp; + sc.fpstate = (unsigned long)to_fp; err = copy_to_user(to, &sc, sizeof(struct sigcontext)); if (err) @@ -468,12 +468,10 @@ long sys_sigreturn(void) struct sigframe __user *frame = (struct sigframe __user *)(sp - 8); sigset_t set; struct sigcontext __user *sc = &frame->sc; - unsigned long __user *oldmask = &sc->oldmask; - unsigned long __user *extramask = frame->extramask; int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long); - if (copy_from_user(&set.sig[0], oldmask, sizeof(set.sig[0])) || - copy_from_user(&set.sig[1], extramask, sig_size)) + if (copy_from_user(&set.sig[0], &sc->oldmask, sizeof(set.sig[0])) || + copy_from_user(&set.sig[1], frame->extramask, sig_size)) goto segfault; set_current_blocked(&set); @@ -505,6 +503,7 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, { struct rt_sigframe __user *frame; int err = 0, sig = ksig->sig; + unsigned long fp_to; frame = (struct rt_sigframe __user *) round_down(stack_top - sizeof(struct rt_sigframe), 16); @@ -526,7 +525,10 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, err |= __save_altstack(&frame->uc.uc_stack, PT_REGS_SP(regs)); err |= copy_sc_to_user(&frame->uc.uc_mcontext, &frame->fpstate, regs, set->sig[0]); - err |= __put_user(&frame->fpstate, &frame->uc.uc_mcontext.fpstate); + + fp_to = (unsigned long)&frame->fpstate; + + err |= __put_user(fp_to, &frame->uc.uc_mcontext.fpstate); if (sizeof(*set) == 16) { err |= __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); err |= __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 5774800ff583..b7de78bdc09c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1192,7 +1192,7 @@ static const struct pv_info xen_info __initconst = { #ifdef CONFIG_X86_64 .extra_user_64bit_cs = FLAT_USER_CS64, #endif - + .features = 0, .name = "Xen", }; @@ -1535,6 +1535,8 @@ asmlinkage __visible void 
__init xen_start_kernel(void) /* Install Xen paravirt ops */ pv_info = xen_info; + if (xen_initial_domain()) + pv_info.features |= PV_SUPPORTED_RTC; pv_init_ops = xen_init_ops; pv_apic_ops = xen_apic_ops; if (!xen_pvh_domain()) { @@ -1886,8 +1888,10 @@ EXPORT_SYMBOL_GPL(xen_hvm_need_lapic); static void xen_set_cpu_features(struct cpuinfo_x86 *c) { - if (xen_pv_domain()) + if (xen_pv_domain()) { clear_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); + set_cpu_cap(c, X86_FEATURE_XENPV); + } } const struct hypervisor_x86 x86_hyper_xen = { diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index ac161db63388..cb5e266a8bf7 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2495,14 +2495,9 @@ void __init xen_init_mmu_ops(void) { x86_init.paging.pagetable_init = xen_pagetable_init; - /* Optimization - we can use the HVM one but it has no idea which - * VCPUs are descheduled - which means that it will needlessly IPI - * them. Xen knows so let it do the job. - */ - if (xen_feature(XENFEAT_auto_translated_physmap)) { - pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others; + if (xen_feature(XENFEAT_auto_translated_physmap)) return; - } + pv_mmu_ops = xen_mmu_ops; memset(dummy_mapping, 0xff, PAGE_SIZE); diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index feddabdab448..df0c40559583 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -1,6 +1,7 @@ #include <linux/types.h> #include <linux/tick.h> +#include <xen/xen.h> #include <xen/interface/xen.h> #include <xen/grant_table.h> #include <xen/events.h> @@ -68,26 +69,16 @@ static void xen_pv_post_suspend(int suspend_cancelled) void xen_arch_pre_suspend(void) { - int cpu; - - for_each_online_cpu(cpu) - xen_pmu_finish(cpu); - if (xen_pv_domain()) xen_pv_pre_suspend(); } void xen_arch_post_suspend(int cancelled) { - int cpu; - if (xen_pv_domain()) xen_pv_post_suspend(cancelled); else xen_hvm_post_suspend(cancelled); - - for_each_online_cpu(cpu) - xen_pmu_init(cpu); } static void xen_vcpu_notify_restore(void *data) @@ -106,10 +97,20 @@ static void xen_vcpu_notify_suspend(void *data) void xen_arch_resume(void) { + int cpu; + on_each_cpu(xen_vcpu_notify_restore, NULL, 1); + + for_each_online_cpu(cpu) + xen_pmu_init(cpu); } void xen_arch_suspend(void) { + int cpu; + + for_each_online_cpu(cpu) + xen_pmu_finish(cpu); + on_each_cpu(xen_vcpu_notify_suspend, NULL, 1); } |
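
The vDSO change to arch/x86/entry/vdso/vdso32/system_call.S above turns __kernel_vsyscall into the AT_SYSINFO entry point and its comment tells userspace to reach it via an indirect call through the AT_SYSINFO auxiliary-vector entry rather than open-coding SYSENTER. As a rough illustration only (not part of this merge), a 32-bit x86 program could follow that advice along the lines below; the use of getpid (syscall 20 on i386) and the int $0x80 fallback are assumptions made for the sketch.

```c
/* Hypothetical 32-bit x86 userspace sketch: issue a system call through
 * the vDSO entry point published in AT_SYSINFO, as the comment in
 * system_call.S recommends, and fall back to int $0x80 when the kernel
 * does not provide AT_SYSINFO. Build with -m32. */
#include <stdio.h>
#include <sys/auxv.h>   /* getauxval(); AT_SYSINFO via <elf.h> on i386 */

static long fast_syscall0(long nr)
{
    unsigned long entry = getauxval(AT_SYSINFO);  /* 0 if not supplied */
    long ret;

    if (entry)
        asm volatile("call *%1"          /* indirect call into the vDSO */
                     : "=a" (ret)
                     : "r" (entry), "a" (nr)
                     : "ecx", "edx", "memory", "cc");
    else
        asm volatile("int $0x80"         /* slower but always-correct path */
                     : "=a" (ret)
                     : "a" (nr)
                     : "memory", "cc");
    return ret;
}

int main(void)
{
    printf("pid = %ld\n", fast_syscall0(20));  /* __NR_getpid == 20 on i386 */
    return 0;
}
```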