summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorPaolo Bonzini <pbonzini@redhat.com>2020-05-20 09:40:09 +0200
committerPaolo Bonzini <pbonzini@redhat.com>2020-05-20 09:40:09 +0200
commit9d5272f5e36155bcead69417fd12e98624e7faef (patch)
tree770af81c03932ad4e508a12cca6f83fbb3527689 /arch/x86
parentrcuwait: avoid lockdep splats from rcuwait_active() (diff)
parentx86/kvm: Restrict ASYNC_PF to user space (diff)
downloadlinux-9d5272f5e36155bcead69417fd12e98624e7faef.tar.xz
linux-9d5272f5e36155bcead69417fd12e98624e7faef.zip
Merge tag 'noinstr-x86-kvm-2020-05-16' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip into HEAD
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig2
-rw-r--r--arch/x86/crypto/blake2s-glue.c10
-rw-r--r--arch/x86/crypto/chacha_glue.c14
-rw-r--r--arch/x86/crypto/nhpoly1305-avx2-glue.c2
-rw-r--r--arch/x86/crypto/nhpoly1305-sse2-glue.c2
-rw-r--r--arch/x86/crypto/poly1305_glue.c13
-rw-r--r--arch/x86/entry/calling.h40
-rw-r--r--arch/x86/entry/entry_32.S8
-rw-r--r--arch/x86/entry/entry_64.S18
-rw-r--r--arch/x86/events/intel/cstate.c1
-rw-r--r--arch/x86/hyperv/hv_init.c12
-rw-r--r--arch/x86/include/asm/ftrace.h5
-rw-r--r--arch/x86/include/asm/kvm_para.h23
-rw-r--r--arch/x86/include/asm/module.h60
-rw-r--r--arch/x86/include/asm/mshyperv.h2
-rw-r--r--arch/x86/include/asm/unwind.h2
-rw-r--r--arch/x86/include/asm/vermagic.h68
-rw-r--r--arch/x86/include/asm/x86_init.h2
-rw-r--r--arch/x86/kernel/apic/apic.c27
-rw-r--r--arch/x86/kernel/dumpstack_64.c3
-rw-r--r--arch/x86/kernel/kvm.c158
-rw-r--r--arch/x86/kernel/smpboot.c47
-rw-r--r--arch/x86/kernel/traps.c2
-rw-r--r--arch/x86/kernel/unwind_frame.c3
-rw-r--r--arch/x86/kernel/unwind_orc.c113
-rw-r--r--arch/x86/kernel/x86_init.c1
-rw-r--r--arch/x86/kvm/mmu/mmu.c2
-rw-r--r--arch/x86/kvm/svm/sev.c2
-rw-r--r--arch/x86/mm/fault.c19
-rw-r--r--arch/x86/mm/pat/set_memory.c12
-rw-r--r--arch/x86/net/bpf_jit_comp.c18
-rw-r--r--arch/x86/net/bpf_jit_comp32.c28
32 files changed, 426 insertions, 293 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1d6104ea8af0..1197b5596d5a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -149,7 +149,7 @@ config X86
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
- select HAVE_ARCH_USERFAULTFD_WP if USERFAULTFD
+ select HAVE_ARCH_USERFAULTFD_WP if X86_64 && USERFAULTFD
select HAVE_ARCH_VMAP_STACK if X86_64
select HAVE_ARCH_WITHIN_STACK_FRAMES
select HAVE_ASM_MODVERSIONS
diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c
index 06ef2d4a4701..6737bcea1fa1 100644
--- a/arch/x86/crypto/blake2s-glue.c
+++ b/arch/x86/crypto/blake2s-glue.c
@@ -32,16 +32,16 @@ void blake2s_compress_arch(struct blake2s_state *state,
const u32 inc)
{
/* SIMD disables preemption, so relax after processing each page. */
- BUILD_BUG_ON(PAGE_SIZE / BLAKE2S_BLOCK_SIZE < 8);
+ BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) {
blake2s_compress_generic(state, block, nblocks, inc);
return;
}
- for (;;) {
+ do {
const size_t blocks = min_t(size_t, nblocks,
- PAGE_SIZE / BLAKE2S_BLOCK_SIZE);
+ SZ_4K / BLAKE2S_BLOCK_SIZE);
kernel_fpu_begin();
if (IS_ENABLED(CONFIG_AS_AVX512) &&
@@ -52,10 +52,8 @@ void blake2s_compress_arch(struct blake2s_state *state,
kernel_fpu_end();
nblocks -= blocks;
- if (!nblocks)
- break;
block += blocks * BLAKE2S_BLOCK_SIZE;
- }
+ } while (nblocks);
}
EXPORT_SYMBOL(blake2s_compress_arch);
diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c
index b412c21ee06e..22250091cdbe 100644
--- a/arch/x86/crypto/chacha_glue.c
+++ b/arch/x86/crypto/chacha_glue.c
@@ -153,9 +153,17 @@ void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
bytes <= CHACHA_BLOCK_SIZE)
return chacha_crypt_generic(state, dst, src, bytes, nrounds);
- kernel_fpu_begin();
- chacha_dosimd(state, dst, src, bytes, nrounds);
- kernel_fpu_end();
+ do {
+ unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+ kernel_fpu_begin();
+ chacha_dosimd(state, dst, src, todo, nrounds);
+ kernel_fpu_end();
+
+ bytes -= todo;
+ src += todo;
+ dst += todo;
+ } while (bytes);
}
EXPORT_SYMBOL(chacha_crypt_arch);
diff --git a/arch/x86/crypto/nhpoly1305-avx2-glue.c b/arch/x86/crypto/nhpoly1305-avx2-glue.c
index f7567cbd35b6..80fcb85736e1 100644
--- a/arch/x86/crypto/nhpoly1305-avx2-glue.c
+++ b/arch/x86/crypto/nhpoly1305-avx2-glue.c
@@ -29,7 +29,7 @@ static int nhpoly1305_avx2_update(struct shash_desc *desc,
return crypto_nhpoly1305_update(desc, src, srclen);
do {
- unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
+ unsigned int n = min_t(unsigned int, srclen, SZ_4K);
kernel_fpu_begin();
crypto_nhpoly1305_update_helper(desc, src, n, _nh_avx2);
diff --git a/arch/x86/crypto/nhpoly1305-sse2-glue.c b/arch/x86/crypto/nhpoly1305-sse2-glue.c
index a661ede3b5cf..cc6b7c1a2705 100644
--- a/arch/x86/crypto/nhpoly1305-sse2-glue.c
+++ b/arch/x86/crypto/nhpoly1305-sse2-glue.c
@@ -29,7 +29,7 @@ static int nhpoly1305_sse2_update(struct shash_desc *desc,
return crypto_nhpoly1305_update(desc, src, srclen);
do {
- unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
+ unsigned int n = min_t(unsigned int, srclen, SZ_4K);
kernel_fpu_begin();
crypto_nhpoly1305_update_helper(desc, src, n, _nh_sse2);
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
index 6dfec19f7d57..dfe921efa9b2 100644
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -91,8 +91,8 @@ static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
struct poly1305_arch_internal *state = ctx;
/* SIMD disables preemption, so relax after processing each page. */
- BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE ||
- PAGE_SIZE % POLY1305_BLOCK_SIZE);
+ BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE ||
+ SZ_4K % POLY1305_BLOCK_SIZE);
if (!static_branch_likely(&poly1305_use_avx) ||
(len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) ||
@@ -102,8 +102,8 @@ static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
return;
}
- for (;;) {
- const size_t bytes = min_t(size_t, len, PAGE_SIZE);
+ do {
+ const size_t bytes = min_t(size_t, len, SZ_4K);
kernel_fpu_begin();
if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512))
@@ -113,11 +113,10 @@ static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
else
poly1305_blocks_avx(ctx, inp, bytes, padbit);
kernel_fpu_end();
+
len -= bytes;
- if (!len)
- break;
inp += bytes;
- }
+ } while (len);
}
static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 0789e13ece90..1c7f13bb6728 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -98,13 +98,6 @@ For 32-bit we have the following conventions - kernel is built with
#define SIZEOF_PTREGS 21*8
.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax save_ret=0
- /*
- * Push registers and sanitize registers of values that a
- * speculation attack might otherwise want to exploit. The
- * lower registers are likely clobbered well before they
- * could be put to use in a speculative execution gadget.
- * Interleave XOR with PUSH for better uop scheduling:
- */
.if \save_ret
pushq %rsi /* pt_regs->si */
movq 8(%rsp), %rsi /* temporarily store the return address in %rsi */
@@ -114,34 +107,43 @@ For 32-bit we have the following conventions - kernel is built with
pushq %rsi /* pt_regs->si */
.endif
pushq \rdx /* pt_regs->dx */
- xorl %edx, %edx /* nospec dx */
pushq %rcx /* pt_regs->cx */
- xorl %ecx, %ecx /* nospec cx */
pushq \rax /* pt_regs->ax */
pushq %r8 /* pt_regs->r8 */
- xorl %r8d, %r8d /* nospec r8 */
pushq %r9 /* pt_regs->r9 */
- xorl %r9d, %r9d /* nospec r9 */
pushq %r10 /* pt_regs->r10 */
- xorl %r10d, %r10d /* nospec r10 */
pushq %r11 /* pt_regs->r11 */
- xorl %r11d, %r11d /* nospec r11*/
pushq %rbx /* pt_regs->rbx */
- xorl %ebx, %ebx /* nospec rbx*/
pushq %rbp /* pt_regs->rbp */
- xorl %ebp, %ebp /* nospec rbp*/
pushq %r12 /* pt_regs->r12 */
- xorl %r12d, %r12d /* nospec r12*/
pushq %r13 /* pt_regs->r13 */
- xorl %r13d, %r13d /* nospec r13*/
pushq %r14 /* pt_regs->r14 */
- xorl %r14d, %r14d /* nospec r14*/
pushq %r15 /* pt_regs->r15 */
- xorl %r15d, %r15d /* nospec r15*/
UNWIND_HINT_REGS
+
.if \save_ret
pushq %rsi /* return address on top of stack */
.endif
+
+ /*
+ * Sanitize registers of values that a speculation attack might
+ * otherwise want to exploit. The lower registers are likely clobbered
+ * well before they could be put to use in a speculative execution
+ * gadget.
+ */
+ xorl %edx, %edx /* nospec dx */
+ xorl %ecx, %ecx /* nospec cx */
+ xorl %r8d, %r8d /* nospec r8 */
+ xorl %r9d, %r9d /* nospec r9 */
+ xorl %r10d, %r10d /* nospec r10 */
+ xorl %r11d, %r11d /* nospec r11 */
+ xorl %ebx, %ebx /* nospec rbx */
+ xorl %ebp, %ebp /* nospec rbp */
+ xorl %r12d, %r12d /* nospec r12 */
+ xorl %r13d, %r13d /* nospec r13 */
+ xorl %r14d, %r14d /* nospec r14 */
+ xorl %r15d, %r15d /* nospec r15 */
+
.endm
.macro POP_REGS pop_rdi=1 skip_r11rcx=0
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index b67bae7091d7..8ba0985f5016 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1693,14 +1693,6 @@ SYM_CODE_START(general_protection)
jmp common_exception
SYM_CODE_END(general_protection)
-#ifdef CONFIG_KVM_GUEST
-SYM_CODE_START(async_page_fault)
- ASM_CLAC
- pushl $do_async_page_fault
- jmp common_exception_read_cr2
-SYM_CODE_END(async_page_fault)
-#endif
-
SYM_CODE_START(rewind_stack_do_exit)
/* Prevent any naive code from trying to unwind to our caller. */
xorl %ebp, %ebp
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 0e9504fabe52..9ab3ea6d02fc 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -249,7 +249,6 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
*/
syscall_return_via_sysret:
/* rcx and r11 are already restored (see code above) */
- UNWIND_HINT_EMPTY
POP_REGS pop_rdi=0 skip_r11rcx=1
/*
@@ -258,6 +257,7 @@ syscall_return_via_sysret:
*/
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+ UNWIND_HINT_EMPTY
pushq RSP-RDI(%rdi) /* RSP */
pushq (%rdi) /* RDI */
@@ -279,8 +279,7 @@ SYM_CODE_END(entry_SYSCALL_64)
* %rdi: prev task
* %rsi: next task
*/
-SYM_CODE_START(__switch_to_asm)
- UNWIND_HINT_FUNC
+SYM_FUNC_START(__switch_to_asm)
/*
* Save callee-saved registers
* This must match the order in inactive_task_frame
@@ -321,7 +320,7 @@ SYM_CODE_START(__switch_to_asm)
popq %rbp
jmp __switch_to
-SYM_CODE_END(__switch_to_asm)
+SYM_FUNC_END(__switch_to_asm)
/*
* A newly forked process directly context switches into this address.
@@ -512,7 +511,7 @@ SYM_CODE_END(spurious_entries_start)
* +----------------------------------------------------+
*/
SYM_CODE_START(interrupt_entry)
- UNWIND_HINT_FUNC
+ UNWIND_HINT_IRET_REGS offset=16
ASM_CLAC
cld
@@ -544,9 +543,9 @@ SYM_CODE_START(interrupt_entry)
pushq 5*8(%rdi) /* regs->eflags */
pushq 4*8(%rdi) /* regs->cs */
pushq 3*8(%rdi) /* regs->ip */
+ UNWIND_HINT_IRET_REGS
pushq 2*8(%rdi) /* regs->orig_ax */
pushq 8(%rdi) /* return address */
- UNWIND_HINT_FUNC
movq (%rdi), %rdi
jmp 2f
@@ -637,6 +636,7 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
*/
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+ UNWIND_HINT_EMPTY
/* Copy the IRET frame to the trampoline stack. */
pushq 6*8(%rdi) /* SS */
@@ -1202,10 +1202,6 @@ idtentry xendebug do_debug has_error_code=0
idtentry general_protection do_general_protection has_error_code=1
idtentry page_fault do_page_fault has_error_code=1 read_cr2=1
-#ifdef CONFIG_KVM_GUEST
-idtentry async_page_fault do_async_page_fault has_error_code=1 read_cr2=1
-#endif
-
#ifdef CONFIG_X86_MCE
idtentry machine_check do_mce has_error_code=0 paranoid=1
#endif
@@ -1739,7 +1735,7 @@ SYM_CODE_START(rewind_stack_do_exit)
movq PER_CPU_VAR(cpu_current_top_of_stack), %rax
leaq -PTREGS_SIZE(%rax), %rsp
- UNWIND_HINT_FUNC sp_offset=PTREGS_SIZE
+ UNWIND_HINT_REGS
call do_exit
SYM_CODE_END(rewind_stack_do_exit)
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index e4aa20c0426f..442e1ed4acd4 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -643,6 +643,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS, &glm_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &glm_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, &glm_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, &glm_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_cstates),
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 624f5d9b0f79..fd51bac11b46 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -73,7 +73,8 @@ static int hv_cpu_init(unsigned int cpu)
struct page *pg;
input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
- pg = alloc_page(GFP_KERNEL);
+ /* hv_cpu_init() can be called with IRQs disabled from hv_resume() */
+ pg = alloc_page(irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL);
if (unlikely(!pg))
return -ENOMEM;
*input_arg = page_address(pg);
@@ -254,6 +255,7 @@ static int __init hv_pci_init(void)
static int hv_suspend(void)
{
union hv_x64_msr_hypercall_contents hypercall_msr;
+ int ret;
/*
* Reset the hypercall page as it is going to be invalidated
@@ -270,12 +272,17 @@ static int hv_suspend(void)
hypercall_msr.enable = 0;
wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
- return 0;
+ ret = hv_cpu_die(0);
+ return ret;
}
static void hv_resume(void)
{
union hv_x64_msr_hypercall_contents hypercall_msr;
+ int ret;
+
+ ret = hv_cpu_init(0);
+ WARN_ON(ret);
/* Re-enable the hypercall page */
rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
@@ -288,6 +295,7 @@ static void hv_resume(void)
hv_hypercall_pg_saved = NULL;
}
+/* Note: when the ops are called, only CPU0 is online and IRQs are disabled. */
static struct syscore_ops hv_syscore_ops = {
.suspend = hv_suspend,
.resume = hv_resume,
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 85be2f506272..70b96cae5b42 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -61,11 +61,12 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name
{
/*
* Compare the symbol name with the system call name. Skip the
- * "__x64_sys", "__ia32_sys" or simple "sys" prefix.
+ * "__x64_sys", "__ia32_sys", "__do_sys" or simple "sys" prefix.
*/
return !strcmp(sym + 3, name + 3) ||
(!strncmp(sym, "__x64_", 6) && !strcmp(sym + 9, name + 3)) ||
- (!strncmp(sym, "__ia32_", 7) && !strcmp(sym + 10, name + 3));
+ (!strncmp(sym, "__ia32_", 7) && !strcmp(sym + 10, name + 3)) ||
+ (!strncmp(sym, "__do_sys", 8) && !strcmp(sym + 8, name + 3));
}
#ifndef COMPILE_OFFSETS
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 9b4df6eaa11a..118e5c2379f9 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -88,11 +88,21 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
bool kvm_para_available(void);
unsigned int kvm_arch_para_features(void);
unsigned int kvm_arch_para_hints(void);
-void kvm_async_pf_task_wait(u32 token, int interrupt_kernel);
+void kvm_async_pf_task_wait_schedule(u32 token);
void kvm_async_pf_task_wake(u32 token);
u32 kvm_read_and_reset_pf_reason(void);
-extern void kvm_disable_steal_time(void);
-void do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address);
+void kvm_disable_steal_time(void);
+bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token);
+
+DECLARE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
+
+static __always_inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token)
+{
+ if (static_branch_unlikely(&kvm_async_pf_enabled))
+ return __kvm_handle_async_pf(regs, token);
+ else
+ return false;
+}
#ifdef CONFIG_PARAVIRT_SPINLOCKS
void __init kvm_spinlock_init(void);
@@ -103,7 +113,7 @@ static inline void kvm_spinlock_init(void)
#endif /* CONFIG_PARAVIRT_SPINLOCKS */
#else /* CONFIG_KVM_GUEST */
-#define kvm_async_pf_task_wait(T, I) do {} while(0)
+#define kvm_async_pf_task_wait_schedule(T) do {} while(0)
#define kvm_async_pf_task_wake(T) do {} while(0)
static inline bool kvm_para_available(void)
@@ -130,6 +140,11 @@ static inline void kvm_disable_steal_time(void)
{
return;
}
+
+static inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token)
+{
+ return false;
+}
#endif
#endif /* _ASM_X86_KVM_PARA_H */
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
index c215d2762488..e988bac0a4a1 100644
--- a/arch/x86/include/asm/module.h
+++ b/arch/x86/include/asm/module.h
@@ -13,64 +13,4 @@ struct mod_arch_specific {
#endif
};
-#ifdef CONFIG_X86_64
-/* X86_64 does not define MODULE_PROC_FAMILY */
-#elif defined CONFIG_M486SX
-#define MODULE_PROC_FAMILY "486SX "
-#elif defined CONFIG_M486
-#define MODULE_PROC_FAMILY "486 "
-#elif defined CONFIG_M586
-#define MODULE_PROC_FAMILY "586 "
-#elif defined CONFIG_M586TSC
-#define MODULE_PROC_FAMILY "586TSC "
-#elif defined CONFIG_M586MMX
-#define MODULE_PROC_FAMILY "586MMX "
-#elif defined CONFIG_MCORE2
-#define MODULE_PROC_FAMILY "CORE2 "
-#elif defined CONFIG_MATOM
-#define MODULE_PROC_FAMILY "ATOM "
-#elif defined CONFIG_M686
-#define MODULE_PROC_FAMILY "686 "
-#elif defined CONFIG_MPENTIUMII
-#define MODULE_PROC_FAMILY "PENTIUMII "
-#elif defined CONFIG_MPENTIUMIII
-#define MODULE_PROC_FAMILY "PENTIUMIII "
-#elif defined CONFIG_MPENTIUMM
-#define MODULE_PROC_FAMILY "PENTIUMM "
-#elif defined CONFIG_MPENTIUM4
-#define MODULE_PROC_FAMILY "PENTIUM4 "
-#elif defined CONFIG_MK6
-#define MODULE_PROC_FAMILY "K6 "
-#elif defined CONFIG_MK7
-#define MODULE_PROC_FAMILY "K7 "
-#elif defined CONFIG_MK8
-#define MODULE_PROC_FAMILY "K8 "
-#elif defined CONFIG_MELAN
-#define MODULE_PROC_FAMILY "ELAN "
-#elif defined CONFIG_MCRUSOE
-#define MODULE_PROC_FAMILY "CRUSOE "
-#elif defined CONFIG_MEFFICEON
-#define MODULE_PROC_FAMILY "EFFICEON "
-#elif defined CONFIG_MWINCHIPC6
-#define MODULE_PROC_FAMILY "WINCHIPC6 "
-#elif defined CONFIG_MWINCHIP3D
-#define MODULE_PROC_FAMILY "WINCHIP3D "
-#elif defined CONFIG_MCYRIXIII
-#define MODULE_PROC_FAMILY "CYRIXIII "
-#elif defined CONFIG_MVIAC3_2
-#define MODULE_PROC_FAMILY "VIAC3-2 "
-#elif defined CONFIG_MVIAC7
-#define MODULE_PROC_FAMILY "VIAC7 "
-#elif defined CONFIG_MGEODEGX1
-#define MODULE_PROC_FAMILY "GEODEGX1 "
-#elif defined CONFIG_MGEODE_LX
-#define MODULE_PROC_FAMILY "GEODE "
-#else
-#error unknown processor family
-#endif
-
-#ifdef CONFIG_X86_32
-# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY
-#endif
-
#endif /* _ASM_X86_MODULE_H */
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 1c42ecbe75cb..d30805ed323e 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -35,6 +35,8 @@ typedef int (*hyperv_fill_flush_list_func)(
rdmsrl(HV_X64_MSR_SINT0 + int_num, val)
#define hv_set_synint_state(int_num, val) \
wrmsrl(HV_X64_MSR_SINT0 + int_num, val)
+#define hv_recommend_using_aeoi() \
+ (!(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED))
#define hv_get_crash_ctl(val) \
rdmsrl(HV_X64_MSR_CRASH_CTL, val)
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index 499578f7e6d7..70fc159ebe69 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -19,7 +19,7 @@ struct unwind_state {
#if defined(CONFIG_UNWINDER_ORC)
bool signal, full_regs;
unsigned long sp, bp, ip;
- struct pt_regs *regs;
+ struct pt_regs *regs, *prev_regs;
#elif defined(CONFIG_UNWINDER_FRAME_POINTER)
bool got_irq;
unsigned long *bp, *orig_sp, ip;
diff --git a/arch/x86/include/asm/vermagic.h b/arch/x86/include/asm/vermagic.h
new file mode 100644
index 000000000000..75884d2cdec3
--- /dev/null
+++ b/arch/x86/include/asm/vermagic.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _ASM_VERMAGIC_H
+#define _ASM_VERMAGIC_H
+
+#ifdef CONFIG_X86_64
+/* X86_64 does not define MODULE_PROC_FAMILY */
+#elif defined CONFIG_M486SX
+#define MODULE_PROC_FAMILY "486SX "
+#elif defined CONFIG_M486
+#define MODULE_PROC_FAMILY "486 "
+#elif defined CONFIG_M586
+#define MODULE_PROC_FAMILY "586 "
+#elif defined CONFIG_M586TSC
+#define MODULE_PROC_FAMILY "586TSC "
+#elif defined CONFIG_M586MMX
+#define MODULE_PROC_FAMILY "586MMX "
+#elif defined CONFIG_MCORE2
+#define MODULE_PROC_FAMILY "CORE2 "
+#elif defined CONFIG_MATOM
+#define MODULE_PROC_FAMILY "ATOM "
+#elif defined CONFIG_M686
+#define MODULE_PROC_FAMILY "686 "
+#elif defined CONFIG_MPENTIUMII
+#define MODULE_PROC_FAMILY "PENTIUMII "
+#elif defined CONFIG_MPENTIUMIII
+#define MODULE_PROC_FAMILY "PENTIUMIII "
+#elif defined CONFIG_MPENTIUMM
+#define MODULE_PROC_FAMILY "PENTIUMM "
+#elif defined CONFIG_MPENTIUM4
+#define MODULE_PROC_FAMILY "PENTIUM4 "
+#elif defined CONFIG_MK6
+#define MODULE_PROC_FAMILY "K6 "
+#elif defined CONFIG_MK7
+#define MODULE_PROC_FAMILY "K7 "
+#elif defined CONFIG_MK8
+#define MODULE_PROC_FAMILY "K8 "
+#elif defined CONFIG_MELAN
+#define MODULE_PROC_FAMILY "ELAN "
+#elif defined CONFIG_MCRUSOE
+#define MODULE_PROC_FAMILY "CRUSOE "
+#elif defined CONFIG_MEFFICEON
+#define MODULE_PROC_FAMILY "EFFICEON "
+#elif defined CONFIG_MWINCHIPC6
+#define MODULE_PROC_FAMILY "WINCHIPC6 "
+#elif defined CONFIG_MWINCHIP3D
+#define MODULE_PROC_FAMILY "WINCHIP3D "
+#elif defined CONFIG_MCYRIXIII
+#define MODULE_PROC_FAMILY "CYRIXIII "
+#elif defined CONFIG_MVIAC3_2
+#define MODULE_PROC_FAMILY "VIAC3-2 "
+#elif defined CONFIG_MVIAC7
+#define MODULE_PROC_FAMILY "VIAC7 "
+#elif defined CONFIG_MGEODEGX1
+#define MODULE_PROC_FAMILY "GEODEGX1 "
+#elif defined CONFIG_MGEODE_LX
+#define MODULE_PROC_FAMILY "GEODE "
+#else
+#error unknown processor family
+#endif
+
+#ifdef CONFIG_X86_32
+# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY
+#else
+# define MODULE_ARCH_VERMAGIC ""
+#endif
+
+#endif /* _ASM_VERMAGIC_H */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 96d9cd208610..6807153c0410 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -50,14 +50,12 @@ struct x86_init_resources {
* @pre_vector_init: init code to run before interrupt vectors
* are set up.
* @intr_init: interrupt init code
- * @trap_init: platform specific trap setup
* @intr_mode_select: interrupt delivery mode selection
* @intr_mode_init: interrupt delivery mode setup
*/
struct x86_init_irqs {
void (*pre_vector_init)(void);
void (*intr_init)(void);
- void (*trap_init)(void);
void (*intr_mode_select)(void);
void (*intr_mode_init)(void);
};
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 81b9c63dae1b..e53dda210cd7 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -352,8 +352,6 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
* According to Intel, MFENCE can do the serialization here.
*/
asm volatile("mfence" : : : "memory");
-
- printk_once(KERN_DEBUG "TSC deadline timer enabled\n");
return;
}
@@ -546,7 +544,7 @@ static struct clock_event_device lapic_clockevent = {
};
static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
-static u32 hsx_deadline_rev(void)
+static __init u32 hsx_deadline_rev(void)
{
switch (boot_cpu_data.x86_stepping) {
case 0x02: return 0x3a; /* EP */
@@ -556,7 +554,7 @@ static u32 hsx_deadline_rev(void)
return ~0U;
}
-static u32 bdx_deadline_rev(void)
+static __init u32 bdx_deadline_rev(void)
{
switch (boot_cpu_data.x86_stepping) {
case 0x02: return 0x00000011;
@@ -568,7 +566,7 @@ static u32 bdx_deadline_rev(void)
return ~0U;
}
-static u32 skx_deadline_rev(void)
+static __init u32 skx_deadline_rev(void)
{
switch (boot_cpu_data.x86_stepping) {
case 0x03: return 0x01000136;
@@ -581,7 +579,7 @@ static u32 skx_deadline_rev(void)
return ~0U;
}
-static const struct x86_cpu_id deadline_match[] = {
+static const struct x86_cpu_id deadline_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL( HASWELL_X, &hsx_deadline_rev),
X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_X, 0x0b000020),
X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_D, &bdx_deadline_rev),
@@ -603,18 +601,19 @@ static const struct x86_cpu_id deadline_match[] = {
{},
};
-static void apic_check_deadline_errata(void)
+static __init bool apic_validate_deadline_timer(void)
{
const struct x86_cpu_id *m;
u32 rev;
- if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) ||
- boot_cpu_has(X86_FEATURE_HYPERVISOR))
- return;
+ if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ return false;
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return true;
m = x86_match_cpu(deadline_match);
if (!m)
- return;
+ return true;
/*
* Function pointers will have the MSB set due to address layout,
@@ -626,11 +625,12 @@ static void apic_check_deadline_errata(void)
rev = (u32)m->driver_data;
if (boot_cpu_data.microcode >= rev)
- return;
+ return true;
setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
pr_err(FW_BUG "TSC_DEADLINE disabled due to Errata; "
"please update microcode to version: 0x%x (or later)\n", rev);
+ return false;
}
/*
@@ -2092,7 +2092,8 @@ void __init init_apic_mappings(void)
{
unsigned int new_apicid;
- apic_check_deadline_errata();
+ if (apic_validate_deadline_timer())
+ pr_debug("TSC deadline timer available\n");
if (x2apic_mode) {
boot_cpu_physical_apicid = read_apic_id();
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 87b97897a881..460ae7f66818 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -183,7 +183,8 @@ recursion_check:
*/
if (visit_mask) {
if (*visit_mask & (1UL << info->type)) {
- printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type);
+ if (task == current)
+ printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type);
goto unknown;
}
*visit_mask |= 1UL << info->type;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 6efe0410fb72..b3d9b0d7a37d 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -35,6 +35,8 @@
#include <asm/tlb.h>
#include <asm/cpuidle_haltpoll.h>
+DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
+
static int kvmapf = 1;
static int __init parse_no_kvmapf(char *arg)
@@ -73,7 +75,6 @@ struct kvm_task_sleep_node {
struct swait_queue_head wq;
u32 token;
int cpu;
- bool halted;
};
static struct kvm_task_sleep_head {
@@ -96,77 +97,64 @@ static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
return NULL;
}
-/*
- * @interrupt_kernel: Is this called from a routine which interrupts the kernel
- * (other than user space)?
- */
-void kvm_async_pf_task_wait(u32 token, int interrupt_kernel)
+static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n)
{
u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
- struct kvm_task_sleep_node n, *e;
- DECLARE_SWAITQUEUE(wait);
-
- rcu_irq_enter();
+ struct kvm_task_sleep_node *e;
raw_spin_lock(&b->lock);
e = _find_apf_task(b, token);
if (e) {
/* dummy entry exist -> wake up was delivered ahead of PF */
hlist_del(&e->link);
- kfree(e);
raw_spin_unlock(&b->lock);
-
- rcu_irq_exit();
- return;
+ kfree(e);
+ return false;
}
- n.token = token;
- n.cpu = smp_processor_id();
- n.halted = is_idle_task(current) ||
- (IS_ENABLED(CONFIG_PREEMPT_COUNT)
- ? preempt_count() > 1 || rcu_preempt_depth()
- : interrupt_kernel);
- init_swait_queue_head(&n.wq);
- hlist_add_head(&n.link, &b->list);
+ n->token = token;
+ n->cpu = smp_processor_id();
+ init_swait_queue_head(&n->wq);
+ hlist_add_head(&n->link, &b->list);
raw_spin_unlock(&b->lock);
+ return true;
+}
+
+/*
+ * kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled
+ * @token: Token to identify the sleep node entry
+ *
+ * Invoked from the async pagefault handling code or from the VM exit page
+ * fault handler. In both cases RCU is watching.
+ */
+void kvm_async_pf_task_wait_schedule(u32 token)
+{
+ struct kvm_task_sleep_node n;
+ DECLARE_SWAITQUEUE(wait);
+
+ lockdep_assert_irqs_disabled();
+
+ if (!kvm_async_pf_queue_task(token, &n))
+ return;
for (;;) {
- if (!n.halted)
- prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
if (hlist_unhashed(&n.link))
break;
- rcu_irq_exit();
-
- if (!n.halted) {
- local_irq_enable();
- schedule();
- local_irq_disable();
- } else {
- /*
- * We cannot reschedule. So halt.
- */
- native_safe_halt();
- local_irq_disable();
- }
-
- rcu_irq_enter();
+ local_irq_enable();
+ schedule();
+ local_irq_disable();
}
- if (!n.halted)
- finish_swait(&n.wq, &wait);
-
- rcu_irq_exit();
- return;
+ finish_swait(&n.wq, &wait);
}
-EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
+EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule);
static void apf_task_wake_one(struct kvm_task_sleep_node *n)
{
hlist_del_init(&n->link);
- if (n->halted)
- smp_send_reschedule(n->cpu);
- else if (swq_has_sleeper(&n->wq))
+ if (swq_has_sleeper(&n->wq))
swake_up_one(&n->wq);
}
@@ -175,12 +163,13 @@ static void apf_task_wake_all(void)
int i;
for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
- struct hlist_node *p, *next;
struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
+ struct kvm_task_sleep_node *n;
+ struct hlist_node *p, *next;
+
raw_spin_lock(&b->lock);
hlist_for_each_safe(p, next, &b->list) {
- struct kvm_task_sleep_node *n =
- hlist_entry(p, typeof(*n), link);
+ n = hlist_entry(p, typeof(*n), link);
if (n->cpu == smp_processor_id())
apf_task_wake_one(n);
}
@@ -221,8 +210,9 @@ again:
n->cpu = smp_processor_id();
init_swait_queue_head(&n->wq);
hlist_add_head(&n->link, &b->list);
- } else
+ } else {
apf_task_wake_one(n);
+ }
raw_spin_unlock(&b->lock);
return;
}
@@ -242,25 +232,39 @@ u32 kvm_read_and_reset_pf_reason(void)
EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);
-dotraplinkage void
-do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
{
- switch (kvm_read_and_reset_pf_reason()) {
- default:
- do_page_fault(regs, error_code, address);
- break;
+ u32 reason = kvm_read_and_reset_pf_reason();
+
+ switch (reason) {
case KVM_PV_REASON_PAGE_NOT_PRESENT:
- /* page is swapped out by the host. */
- kvm_async_pf_task_wait((u32)address, !user_mode(regs));
- break;
case KVM_PV_REASON_PAGE_READY:
+ break;
+ default:
+ return false;
+ }
+
+ /*
+ * If the host managed to inject an async #PF into an interrupt
+ * disabled region, then die hard as this is not going to end well
+ * and the host side is seriously broken.
+ */
+ if (unlikely(!(regs->flags & X86_EFLAGS_IF)))
+ panic("Host injected async #PF in interrupt disabled region\n");
+
+ if (reason == KVM_PV_REASON_PAGE_NOT_PRESENT) {
+ if (unlikely(!(user_mode(regs))))
+ panic("Host injected async #PF in kernel mode\n");
+ /* Page is swapped out by the host. */
+ kvm_async_pf_task_wait_schedule(token);
+ } else {
rcu_irq_enter();
- kvm_async_pf_task_wake((u32)address);
+ kvm_async_pf_task_wake(token);
rcu_irq_exit();
- break;
}
+ return true;
}
-NOKPROBE_SYMBOL(do_async_page_fault);
+NOKPROBE_SYMBOL(__kvm_handle_async_pf);
static void __init paravirt_ops_setup(void)
{
@@ -306,11 +310,11 @@ static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
static void kvm_guest_cpu_init(void)
{
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
- u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
+ u64 pa;
-#ifdef CONFIG_PREEMPTION
- pa |= KVM_ASYNC_PF_SEND_ALWAYS;
-#endif
+ WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));
+
+ pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
pa |= KVM_ASYNC_PF_ENABLED;
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
@@ -318,12 +322,12 @@ static void kvm_guest_cpu_init(void)
wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
__this_cpu_write(apf_reason.enabled, 1);
- printk(KERN_INFO"KVM setup async PF for cpu %d\n",
- smp_processor_id());
+ pr_info("KVM setup async PF for cpu %d\n", smp_processor_id());
}
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
unsigned long pa;
+
/* Size alignment is implied but just to make it explicit. */
BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
__this_cpu_write(kvm_apic_eoi, 0);
@@ -344,8 +348,7 @@ static void kvm_pv_disable_apf(void)
wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
__this_cpu_write(apf_reason.enabled, 0);
- printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
- smp_processor_id());
+ pr_info("Unregister pv shared memory for cpu %d\n", smp_processor_id());
}
static void kvm_pv_guest_cpu_reboot(void *unused)
@@ -592,12 +595,6 @@ static int kvm_cpu_down_prepare(unsigned int cpu)
}
#endif
-static void __init kvm_apf_trap_init(void)
-{
- update_intr_gate(X86_TRAP_PF, async_page_fault);
-}
-
-
static void kvm_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
@@ -632,8 +629,6 @@ static void __init kvm_guest_init(void)
register_reboot_notifier(&kvm_pv_reboot_nb);
for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
raw_spin_lock_init(&async_pf_sleepers[i].lock);
- if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
- x86_init.irqs.trap_init = kvm_apf_trap_init;
if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
has_steal_clock = 1;
@@ -649,6 +644,9 @@ static void __init kvm_guest_init(void)
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
apic_set_eoi_write(kvm_guest_apic_eoi_write);
+ if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf)
+ static_branch_enable(&kvm_async_pf_enabled);
+
#ifdef CONFIG_SMP
smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus;
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index fe3ab9632f3b..8c89e4d9ad28 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -147,7 +147,7 @@ static inline void smpboot_restore_warm_reset_vector(void)
*((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
}
-static void init_freq_invariance(void);
+static void init_freq_invariance(bool secondary);
/*
* Report back to the Boot Processor during boot time or to the caller processor
@@ -185,7 +185,7 @@ static void smp_callin(void)
*/
set_cpu_sibling_map(raw_smp_processor_id());
- init_freq_invariance();
+ init_freq_invariance(true);
/*
* Get our bogomips.
@@ -1341,7 +1341,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
set_sched_topology(x86_topology);
set_cpu_sibling_map(0);
- init_freq_invariance();
+ init_freq_invariance(false);
smp_sanity_check();
switch (apic_intr_mode) {
@@ -1877,9 +1877,6 @@ static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
int err, i;
u64 msr;
- if (!x86_match_cpu(has_knl_turbo_ratio_limits))
- return false;
-
err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
if (err)
return false;
@@ -1945,18 +1942,23 @@ static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
+ u64 msr;
int err;
err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
if (err)
return false;
- err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, turbo_freq);
+ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
if (err)
return false;
- *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
- *turbo_freq = (*turbo_freq >> 24) & 0xFF; /* 4C turbo */
+ *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
+ *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */
+
+ /* The CPU may have less than 4 cores */
+ if (!*turbo_freq)
+ *turbo_freq = msr & 0xFF; /* 1C turbo */
return true;
}
@@ -1972,7 +1974,8 @@ static bool intel_set_max_freq_ratio(void)
skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
goto out;
- if (knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
+ if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
+ knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
goto out;
if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
@@ -1985,13 +1988,22 @@ static bool intel_set_max_freq_ratio(void)
return false;
out:
+ /*
+ * Some hypervisors advertise X86_FEATURE_APERFMPERF
+ * but then fill all MSR's with zeroes.
+ */
+ if (!base_freq) {
+ pr_debug("Couldn't determine cpu base frequency, necessary for scale-invariant accounting.\n");
+ return false;
+ }
+
arch_turbo_freq_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE,
base_freq);
arch_set_max_freq_ratio(turbo_disabled());
return true;
}
-static void init_counter_refs(void *arg)
+static void init_counter_refs(void)
{
u64 aperf, mperf;
@@ -2002,18 +2014,25 @@ static void init_counter_refs(void *arg)
this_cpu_write(arch_prev_mperf, mperf);
}
-static void init_freq_invariance(void)
+static void init_freq_invariance(bool secondary)
{
bool ret = false;
- if (smp_processor_id() != 0 || !boot_cpu_has(X86_FEATURE_APERFMPERF))
+ if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
return;
+ if (secondary) {
+ if (static_branch_likely(&arch_scale_freq_key)) {
+ init_counter_refs();
+ }
+ return;
+ }
+
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
ret = intel_set_max_freq_ratio();
if (ret) {
- on_each_cpu(init_counter_refs, NULL, 1);
+ init_counter_refs();
static_branch_enable(&arch_scale_freq_key);
} else {
pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index d54cffdc7cac..821fac47eef6 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -983,7 +983,5 @@ void __init trap_init(void)
idt_setup_ist_traps();
- x86_init.irqs.trap_init();
-
idt_setup_debugidt_traps();
}
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index a224b5ab103f..54226110bc7f 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -344,6 +344,9 @@ bad_address:
if (IS_ENABLED(CONFIG_X86_32))
goto the_end;
+ if (state->task != current)
+ goto the_end;
+
if (state->regs) {
printk_deferred_once(KERN_WARNING
"WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n",
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index e9cc182aa97e..5b0bd8581fe6 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -8,19 +8,21 @@
#include <asm/orc_lookup.h>
#define orc_warn(fmt, ...) \
- printk_deferred_once(KERN_WARNING pr_fmt("WARNING: " fmt), ##__VA_ARGS__)
+ printk_deferred_once(KERN_WARNING "WARNING: " fmt, ##__VA_ARGS__)
+
+#define orc_warn_current(args...) \
+({ \
+ if (state->task == current) \
+ orc_warn(args); \
+})
extern int __start_orc_unwind_ip[];
extern int __stop_orc_unwind_ip[];
extern struct orc_entry __start_orc_unwind[];
extern struct orc_entry __stop_orc_unwind[];
-static DEFINE_MUTEX(sort_mutex);
-int *cur_orc_ip_table = __start_orc_unwind_ip;
-struct orc_entry *cur_orc_table = __start_orc_unwind;
-
-unsigned int lookup_num_blocks;
-bool orc_init;
+static bool orc_init __ro_after_init;
+static unsigned int lookup_num_blocks __ro_after_init;
static inline unsigned long orc_ip(const int *ip)
{
@@ -142,9 +144,6 @@ static struct orc_entry *orc_find(unsigned long ip)
{
static struct orc_entry *orc;
- if (!orc_init)
- return NULL;
-
if (ip == 0)
return &null_orc_entry;
@@ -189,6 +188,10 @@ static struct orc_entry *orc_find(unsigned long ip)
#ifdef CONFIG_MODULES
+static DEFINE_MUTEX(sort_mutex);
+static int *cur_orc_ip_table = __start_orc_unwind_ip;
+static struct orc_entry *cur_orc_table = __start_orc_unwind;
+
static void orc_sort_swap(void *_a, void *_b, int size)
{
struct orc_entry *orc_a, *orc_b;
@@ -381,9 +384,38 @@ static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr
return true;
}
+/*
+ * If state->regs is non-NULL, and points to a full pt_regs, just get the reg
+ * value from state->regs.
+ *
+ * Otherwise, if state->regs just points to IRET regs, and the previous frame
+ * had full regs, it's safe to get the value from the previous regs. This can
+ * happen when early/late IRQ entry code gets interrupted by an NMI.
+ */
+static bool get_reg(struct unwind_state *state, unsigned int reg_off,
+ unsigned long *val)
+{
+ unsigned int reg = reg_off/8;
+
+ if (!state->regs)
+ return false;
+
+ if (state->full_regs) {
+ *val = ((unsigned long *)state->regs)[reg];
+ return true;
+ }
+
+ if (state->prev_regs) {
+ *val = ((unsigned long *)state->prev_regs)[reg];
+ return true;
+ }
+
+ return false;
+}
+
bool unwind_next_frame(struct unwind_state *state)
{
- unsigned long ip_p, sp, orig_ip = state->ip, prev_sp = state->sp;
+ unsigned long ip_p, sp, tmp, orig_ip = state->ip, prev_sp = state->sp;
enum stack_type prev_type = state->stack_info.type;
struct orc_entry *orc;
bool indirect = false;
@@ -445,43 +477,39 @@ bool unwind_next_frame(struct unwind_state *state)
break;
case ORC_REG_R10:
- if (!state->regs || !state->full_regs) {
- orc_warn("missing regs for base reg R10 at ip %pB\n",
- (void *)state->ip);
+ if (!get_reg(state, offsetof(struct pt_regs, r10), &sp)) {
+ orc_warn_current("missing R10 value at %pB\n",
+ (void *)state->ip);
goto err;
}
- sp = state->regs->r10;
break;
case ORC_REG_R13:
- if (!state->regs || !state->full_regs) {
- orc_warn("missing regs for base reg R13 at ip %pB\n",
- (void *)state->ip);
+ if (!get_reg(state, offsetof(struct pt_regs, r13), &sp)) {
+ orc_warn_current("missing R13 value at %pB\n",
+ (void *)state->ip);
goto err;
}
- sp = state->regs->r13;
break;
case ORC_REG_DI:
- if (!state->regs || !state->full_regs) {
- orc_warn("missing regs for base reg DI at ip %pB\n",
- (void *)state->ip);
+ if (!get_reg(state, offsetof(struct pt_regs, di), &sp)) {
+ orc_warn_current("missing RDI value at %pB\n",
+ (void *)state->ip);
goto err;
}
- sp = state->regs->di;
break;
case ORC_REG_DX:
- if (!state->regs || !state->full_regs) {
- orc_warn("missing regs for base reg DX at ip %pB\n",
- (void *)state->ip);
+ if (!get_reg(state, offsetof(struct pt_regs, dx), &sp)) {
+ orc_warn_current("missing DX value at %pB\n",
+ (void *)state->ip);
goto err;
}
- sp = state->regs->dx;
break;
default:
- orc_warn("unknown SP base reg %d for ip %pB\n",
+ orc_warn("unknown SP base reg %d at %pB\n",
orc->sp_reg, (void *)state->ip);
goto err;
}
@@ -504,44 +532,48 @@ bool unwind_next_frame(struct unwind_state *state)
state->sp = sp;
state->regs = NULL;
+ state->prev_regs = NULL;
state->signal = false;
break;
case ORC_TYPE_REGS:
if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) {
- orc_warn("can't dereference registers at %p for ip %pB\n",
- (void *)sp, (void *)orig_ip);
+ orc_warn_current("can't access registers at %pB\n",
+ (void *)orig_ip);
goto err;
}
state->regs = (struct pt_regs *)sp;
+ state->prev_regs = NULL;
state->full_regs = true;
state->signal = true;
break;
case ORC_TYPE_REGS_IRET:
if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) {
- orc_warn("can't dereference iret registers at %p for ip %pB\n",
- (void *)sp, (void *)orig_ip);
+ orc_warn_current("can't access iret registers at %pB\n",
+ (void *)orig_ip);
goto err;
}
+ if (state->full_regs)
+ state->prev_regs = state->regs;
state->regs = (void *)sp - IRET_FRAME_OFFSET;
state->full_regs = false;
state->signal = true;
break;
default:
- orc_warn("unknown .orc_unwind entry type %d for ip %pB\n",
+ orc_warn("unknown .orc_unwind entry type %d at %pB\n",
orc->type, (void *)orig_ip);
- break;
+ goto err;
}
/* Find BP: */
switch (orc->bp_reg) {
case ORC_REG_UNDEFINED:
- if (state->regs && state->full_regs)
- state->bp = state->regs->bp;
+ if (get_reg(state, offsetof(struct pt_regs, bp), &tmp))
+ state->bp = tmp;
break;
case ORC_REG_PREV_SP:
@@ -564,8 +596,8 @@ bool unwind_next_frame(struct unwind_state *state)
if (state->stack_info.type == prev_type &&
on_stack(&state->stack_info, (void *)state->sp, sizeof(long)) &&
state->sp <= prev_sp) {
- orc_warn("stack going in the wrong direction? ip=%pB\n",
- (void *)orig_ip);
+ orc_warn_current("stack going in the wrong direction? at %pB\n",
+ (void *)orig_ip);
goto err;
}
@@ -585,6 +617,9 @@ EXPORT_SYMBOL_GPL(unwind_next_frame);
void __unwind_start(struct unwind_state *state, struct task_struct *task,
struct pt_regs *regs, unsigned long *first_frame)
{
+ if (!orc_init)
+ goto done;
+
memset(state, 0, sizeof(*state));
state->task = task;
@@ -651,7 +686,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
/* Otherwise, skip ahead to the user-specified starting frame: */
while (!unwind_done(state) &&
(!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
- state->sp <= (unsigned long)first_frame))
+ state->sp < (unsigned long)first_frame))
unwind_next_frame(state);
return;
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 85f1a90c55cd..123f1c1f1788 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -79,7 +79,6 @@ struct x86_init_ops x86_init __initdata = {
.irqs = {
.pre_vector_init = init_ISA_irqs,
.intr_init = native_init_IRQ,
- .trap_init = x86_init_noop,
.intr_mode_select = apic_intr_mode_select,
.intr_mode_init = apic_intr_mode_init
},
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 907625fea7b3..d93cb3ad8f03 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4178,7 +4178,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
case KVM_PV_REASON_PAGE_NOT_PRESENT:
vcpu->arch.apf.host_apf_reason = 0;
local_irq_disable();
- kvm_async_pf_task_wait(fault_address, 0);
+ kvm_async_pf_task_wait_schedule(fault_address);
local_irq_enable();
break;
case KVM_PV_REASON_PAGE_READY:
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index cf912b4aaba8..89f7f3aebd31 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -345,7 +345,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
return NULL;
/* Pin the user virtual address. */
- npinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages);
+ npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages);
if (npinned != npages) {
pr_err("SEV: Failure locking %lu pages.\n", npages);
goto err;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a51df516b87b..6486ccec1b0e 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -30,6 +30,7 @@
#include <asm/desc.h> /* store_idt(), ... */
#include <asm/cpu_entry_area.h> /* exception stack */
#include <asm/pgtable_areas.h> /* VMALLOC_START, ... */
+#include <asm/kvm_para.h> /* kvm_handle_async_pf */
#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>
@@ -1523,6 +1524,24 @@ do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
unsigned long address)
{
prefetchw(&current->mm->mmap_sem);
+ /*
+ * KVM has two types of events that are, logically, interrupts, but
+ * are unfortunately delivered using the #PF vector. These events are
+ * "you just accessed valid memory, but the host doesn't have it right
+ * now, so I'll put you to sleep if you continue" and "that memory
+ * you tried to access earlier is available now."
+ *
+ * We are relying on the interrupted context being sane (valid RSP,
+ * relevant locks not held, etc.), which is fine as long as the
+ * interrupted context had IF=1. We are also relying on the KVM
+ * async pf type field and CR2 being read consistently instead of
+ * getting values from real and async page faults mixed up.
+ *
+ * Fingers crossed.
+ */
+ if (kvm_handle_async_pf(regs, (u32)address))
+ return;
+
trace_page_fault_entries(regs, hw_error_code, address);
if (unlikely(kmmio_fault(regs, address)))
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 59eca6a94ce7..b8c55a2e402d 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -43,7 +43,8 @@ struct cpa_data {
unsigned long pfn;
unsigned int flags;
unsigned int force_split : 1,
- force_static_prot : 1;
+ force_static_prot : 1,
+ force_flush_all : 1;
struct page **pages;
};
@@ -355,10 +356,10 @@ static void cpa_flush(struct cpa_data *data, int cache)
return;
}
- if (cpa->numpages <= tlb_single_page_flush_ceiling)
- on_each_cpu(__cpa_flush_tlb, cpa, 1);
- else
+ if (cpa->force_flush_all || cpa->numpages > tlb_single_page_flush_ceiling)
flush_tlb_all();
+ else
+ on_each_cpu(__cpa_flush_tlb, cpa, 1);
if (!cache)
return;
@@ -1598,6 +1599,8 @@ static int cpa_process_alias(struct cpa_data *cpa)
alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
alias_cpa.curpage = 0;
+ cpa->force_flush_all = 1;
+
ret = __change_page_attr_set_clr(&alias_cpa, 0);
if (ret)
return ret;
@@ -1618,6 +1621,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
alias_cpa.curpage = 0;
+ cpa->force_flush_all = 1;
/*
* The high mapping range is imprecise, so ignore the
* return value.
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 5ea7c2cf7ab4..42b6709e6dc7 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -158,6 +158,19 @@ static bool is_ereg(u32 reg)
BIT(BPF_REG_AX));
}
+/*
+ * is_ereg_8l() == true if BPF register 'reg' is mapped to access x86-64
+ * lower 8-bit registers dil,sil,bpl,spl,r8b..r15b, which need extra byte
+ * of encoding. al,cl,dl,bl have simpler encoding.
+ */
+static bool is_ereg_8l(u32 reg)
+{
+ return is_ereg(reg) ||
+ (1 << reg) & (BIT(BPF_REG_1) |
+ BIT(BPF_REG_2) |
+ BIT(BPF_REG_FP));
+}
+
static bool is_axreg(u32 reg)
{
return reg == BPF_REG_0;
@@ -598,9 +611,8 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
switch (size) {
case BPF_B:
/* Emit 'mov byte ptr [rax + off], al' */
- if (is_ereg(dst_reg) || is_ereg(src_reg) ||
- /* We have to add extra byte for x86 SIL, DIL regs */
- src_reg == BPF_REG_1 || src_reg == BPF_REG_2)
+ if (is_ereg(dst_reg) || is_ereg_8l(src_reg))
+ /* Add extra byte for eregs or SIL,DIL,BPL in src_reg */
EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88);
else
EMIT1(0x88);
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index 4d2a7a764602..66cd150b7e54 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -1847,14 +1847,16 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
case BPF_B:
case BPF_H:
case BPF_W:
- if (!bpf_prog->aux->verifier_zext)
+ if (bpf_prog->aux->verifier_zext)
break;
if (dstk) {
EMIT3(0xC7, add_1reg(0x40, IA32_EBP),
STACK_VAR(dst_hi));
EMIT(0x0, 4);
} else {
- EMIT3(0xC7, add_1reg(0xC0, dst_hi), 0);
+ /* xor dst_hi,dst_hi */
+ EMIT2(0x33,
+ add_2reg(0xC0, dst_hi, dst_hi));
}
break;
case BPF_DW:
@@ -2013,8 +2015,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
case BPF_JMP | BPF_JSET | BPF_X:
case BPF_JMP32 | BPF_JSET | BPF_X: {
bool is_jmp64 = BPF_CLASS(insn->code) == BPF_JMP;
- u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
- u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+ u8 dreg_lo = IA32_EAX;
+ u8 dreg_hi = IA32_EDX;
u8 sreg_lo = sstk ? IA32_ECX : src_lo;
u8 sreg_hi = sstk ? IA32_EBX : src_hi;
@@ -2026,6 +2028,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
add_2reg(0x40, IA32_EBP,
IA32_EDX),
STACK_VAR(dst_hi));
+ } else {
+ /* mov dreg_lo,dst_lo */
+ EMIT2(0x89, add_2reg(0xC0, dreg_lo, dst_lo));
+ if (is_jmp64)
+ /* mov dreg_hi,dst_hi */
+ EMIT2(0x89,
+ add_2reg(0xC0, dreg_hi, dst_hi));
}
if (sstk) {
@@ -2050,8 +2059,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
case BPF_JMP | BPF_JSET | BPF_K:
case BPF_JMP32 | BPF_JSET | BPF_K: {
bool is_jmp64 = BPF_CLASS(insn->code) == BPF_JMP;
- u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
- u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+ u8 dreg_lo = IA32_EAX;
+ u8 dreg_hi = IA32_EDX;
u8 sreg_lo = IA32_ECX;
u8 sreg_hi = IA32_EBX;
u32 hi;
@@ -2064,6 +2073,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
add_2reg(0x40, IA32_EBP,
IA32_EDX),
STACK_VAR(dst_hi));
+ } else {
+ /* mov dreg_lo,dst_lo */
+ EMIT2(0x89, add_2reg(0xC0, dreg_lo, dst_lo));
+ if (is_jmp64)
+ /* mov dreg_hi,dst_hi */
+ EMIT2(0x89,
+ add_2reg(0xC0, dreg_hi, dst_hi));
}
/* mov ecx,imm32 */