summaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-12-20 19:44:05 +0100
committerLinus Torvalds <torvalds@linux-foundation.org>2020-12-20 19:44:05 +0100
commit6a447b0e3151893f6d4a889956553c06d2e775c6 (patch)
tree0f0c149c03dd8c2e9a5fbe01d6de528b2724893e /arch
parentMerge tag 'rtc-5.11' of git://git.kernel.org/pub/scm/linux/kernel/git/abellon... (diff)
parentKVM: SVM: fix 32-bit compilation (diff)
downloadlinux-6a447b0e3151893f6d4a889956553c06d2e775c6.tar.xz
linux-6a447b0e3151893f6d4a889956553c06d2e775c6.zip
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Paolo Bonzini: "Much x86 work was pushed out to 5.12, but ARM more than made up for it. ARM: - PSCI relay at EL2 when "protected KVM" is enabled - New exception injection code - Simplification of AArch32 system register handling - Fix PMU accesses when no PMU is enabled - Expose CSV3 on non-Meltdown hosts - Cache hierarchy discovery fixes - PV steal-time cleanups - Allow function pointers at EL2 - Various host EL2 entry cleanups - Simplification of the EL2 vector allocation s390: - memcg accouting for s390 specific parts of kvm and gmap - selftest for diag318 - new kvm_stat for when async_pf falls back to sync x86: - Tracepoints for the new pagetable code from 5.10 - Catch VFIO and KVM irqfd events before userspace - Reporting dirty pages to userspace with a ring buffer - SEV-ES host support - Nested VMX support for wait-for-SIPI activity state - New feature flag (AVX512 FP16) - New system ioctl to report Hyper-V-compatible paravirtualization features Generic: - Selftest improvements" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (171 commits) KVM: SVM: fix 32-bit compilation KVM: SVM: Add AP_JUMP_TABLE support in prep for AP booting KVM: SVM: Provide support to launch and run an SEV-ES guest KVM: SVM: Provide an updated VMRUN invocation for SEV-ES guests KVM: SVM: Provide support for SEV-ES vCPU loading KVM: SVM: Provide support for SEV-ES vCPU creation/loading KVM: SVM: Update ASID allocation to support SEV-ES guests KVM: SVM: Set the encryption mask for the SVM host save area KVM: SVM: Add NMI support for an SEV-ES guest KVM: SVM: Guest FPU state save/restore not needed for SEV-ES guest KVM: SVM: Do not report support for SMM for an SEV-ES guest KVM: x86: Update __get_sregs() / __set_sregs() to support SEV-ES KVM: SVM: Add support for CR8 write traps for an SEV-ES guest KVM: SVM: Add support for CR4 write traps for an SEV-ES guest KVM: SVM: Add support for CR0 write traps for an SEV-ES guest KVM: SVM: Add support for EFER write traps for an SEV-ES guest KVM: SVM: Support string IO operations for an SEV-ES guest KVM: SVM: Support MMIO for an SEV-ES guest KVM: SVM: Create trace events for VMGEXIT MSR protocol processing KVM: SVM: Create trace events for VMGEXIT processing ...
Diffstat (limited to 'arch')
-rw-r--r--arch/arm64/include/asm/cpucaps.h5
-rw-r--r--arch/arm64/include/asm/cpufeature.h5
-rw-r--r--arch/arm64/include/asm/el2_setup.h181
-rw-r--r--arch/arm64/include/asm/kvm_arm.h1
-rw-r--r--arch/arm64/include/asm/kvm_asm.h17
-rw-r--r--arch/arm64/include/asm/kvm_coproc.h38
-rw-r--r--arch/arm64/include/asm/kvm_emulate.h70
-rw-r--r--arch/arm64/include/asm/kvm_host.h206
-rw-r--r--arch/arm64/include/asm/kvm_hyp.h4
-rw-r--r--arch/arm64/include/asm/kvm_mmu.h110
-rw-r--r--arch/arm64/include/asm/mmu.h29
-rw-r--r--arch/arm64/include/asm/percpu.h6
-rw-r--r--arch/arm64/include/asm/sections.h1
-rw-r--r--arch/arm64/include/asm/smp.h4
-rw-r--r--arch/arm64/include/asm/spectre.h65
-rw-r--r--arch/arm64/include/asm/sysreg.h1
-rw-r--r--arch/arm64/include/asm/virt.h26
-rw-r--r--arch/arm64/include/uapi/asm/kvm.h3
-rw-r--r--arch/arm64/kernel/asm-offsets.c5
-rw-r--r--arch/arm64/kernel/cpu_errata.c19
-rw-r--r--arch/arm64/kernel/cpufeature.c42
-rw-r--r--arch/arm64/kernel/head.S135
-rw-r--r--arch/arm64/kernel/image-vars.h11
-rw-r--r--arch/arm64/kernel/proton-pack.c84
-rw-r--r--arch/arm64/kernel/setup.c2
-rw-r--r--arch/arm64/kernel/vmlinux.lds.S10
-rw-r--r--arch/arm64/kvm/Makefile4
-rw-r--r--arch/arm64/kvm/aarch32.c232
-rw-r--r--arch/arm64/kvm/arm.c281
-rw-r--r--arch/arm64/kvm/guest.c29
-rw-r--r--arch/arm64/kvm/handle_exit.c24
-rw-r--r--arch/arm64/kvm/hyp/Makefile2
-rw-r--r--arch/arm64/kvm/hyp/aarch32.c4
-rw-r--r--arch/arm64/kvm/hyp/exception.c331
-rw-r--r--arch/arm64/kvm/hyp/hyp-entry.S71
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/adjust_pc.h62
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/switch.h17
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/trap_handler.h18
-rw-r--r--arch/arm64/kvm/hyp/nvhe/Makefile5
-rw-r--r--arch/arm64/kvm/hyp/nvhe/host.S58
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-init.S152
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-main.c243
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-smp.c40
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp.lds.S1
-rw-r--r--arch/arm64/kvm/hyp/nvhe/psci-relay.c324
-rw-r--r--arch/arm64/kvm/hyp/nvhe/switch.c8
-rw-r--r--arch/arm64/kvm/hyp/nvhe/sysreg-sr.c11
-rw-r--r--arch/arm64/kvm/hyp/smccc_wa.S32
-rw-r--r--arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c2
-rw-r--r--arch/arm64/kvm/hyp/vgic-v3-sr.c2
-rw-r--r--arch/arm64/kvm/hyp/vhe/Makefile2
-rw-r--r--arch/arm64/kvm/hyp/vhe/switch.c3
-rw-r--r--arch/arm64/kvm/inject_fault.c167
-rw-r--r--arch/arm64/kvm/mmio.c2
-rw-r--r--arch/arm64/kvm/mmu.c2
-rw-r--r--arch/arm64/kvm/pmu-emul.c19
-rw-r--r--arch/arm64/kvm/pvtime.c6
-rw-r--r--arch/arm64/kvm/regmap.c224
-rw-r--r--arch/arm64/kvm/reset.c57
-rw-r--r--arch/arm64/kvm/sys_regs.c390
-rw-r--r--arch/arm64/kvm/sys_regs.h9
-rw-r--r--arch/arm64/kvm/va_layout.c104
-rw-r--r--arch/arm64/kvm/vgic-sys-reg-v3.c4
-rw-r--r--arch/arm64/kvm/vgic/vgic-v4.c12
-rw-r--r--arch/arm64/kvm/vgic/vgic.c3
-rw-r--r--arch/s390/include/asm/kvm_host.h1
-rw-r--r--arch/s390/kvm/guestdbg.c8
-rw-r--r--arch/s390/kvm/intercept.c2
-rw-r--r--arch/s390/kvm/interrupt.c10
-rw-r--r--arch/s390/kvm/kvm-s390.c22
-rw-r--r--arch/s390/kvm/priv.c4
-rw-r--r--arch/s390/kvm/pv.c6
-rw-r--r--arch/s390/kvm/vsie.c4
-rw-r--r--arch/s390/mm/gmap.c30
-rw-r--r--arch/x86/include/asm/cpufeatures.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h20
-rw-r--r--arch/x86/include/asm/msr-index.h1
-rw-r--r--arch/x86/include/asm/svm.h40
-rw-r--r--arch/x86/include/asm/vmx.h1
-rw-r--r--arch/x86/include/uapi/asm/kvm.h1
-rw-r--r--arch/x86/include/uapi/asm/svm.h28
-rw-r--r--arch/x86/include/uapi/asm/vmx.h2
-rw-r--r--arch/x86/kernel/cpu/cpuid-deps.c1
-rw-r--r--arch/x86/kernel/cpu/scattered.c1
-rw-r--r--arch/x86/kernel/cpu/vmware.c12
-rw-r--r--arch/x86/kernel/kvmclock.c1
-rw-r--r--arch/x86/kvm/Kconfig3
-rw-r--r--arch/x86/kvm/Makefile3
-rw-r--r--arch/x86/kvm/cpuid.c3
-rw-r--r--arch/x86/kvm/cpuid.h14
-rw-r--r--arch/x86/kvm/hyperv.c6
-rw-r--r--arch/x86/kvm/hyperv.h4
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h51
-rw-r--r--arch/x86/kvm/lapic.c45
-rw-r--r--arch/x86/kvm/mmu/mmu.c10
-rw-r--r--arch/x86/kvm/mmu/mmutrace.h29
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c16
-rw-r--r--arch/x86/kvm/mtrr.c6
-rw-r--r--arch/x86/kvm/svm/avic.c9
-rw-r--r--arch/x86/kvm/svm/nested.c11
-rw-r--r--arch/x86/kvm/svm/sev.c915
-rw-r--r--arch/x86/kvm/svm/svm.c469
-rw-r--r--arch/x86/kvm/svm/svm.h167
-rw-r--r--arch/x86/kvm/svm/vmenter.S50
-rw-r--r--arch/x86/kvm/trace.h97
-rw-r--r--arch/x86/kvm/vmx/evmcs.c3
-rw-r--r--arch/x86/kvm/vmx/nested.c57
-rw-r--r--arch/x86/kvm/vmx/vmenter.S2
-rw-r--r--arch/x86/kvm/vmx/vmx.c171
-rw-r--r--arch/x86/kvm/vmx/vmx.h2
-rw-r--r--arch/x86/kvm/x86.c511
-rw-r--r--arch/x86/kvm/x86.h31
112 files changed, 4730 insertions, 2189 deletions
diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index a7242ef2a2cd..b77d997b173b 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -19,7 +19,7 @@
#define ARM64_HAS_VIRT_HOST_EXTN 11
#define ARM64_WORKAROUND_CAVIUM_27456 12
#define ARM64_HAS_32BIT_EL0 13
-#define ARM64_HARDEN_EL2_VECTORS 14
+#define ARM64_SPECTRE_V3A 14
#define ARM64_HAS_CNP 15
#define ARM64_HAS_NO_FPSIMD 16
#define ARM64_WORKAROUND_REPEAT_TLBI 17
@@ -65,7 +65,8 @@
#define ARM64_MTE 57
#define ARM64_WORKAROUND_1508412 58
#define ARM64_HAS_LDAPR 59
+#define ARM64_KVM_PROTECTED_MODE 60
-#define ARM64_NCAPS 60
+#define ARM64_NCAPS 61
#endif /* __ASM_CPUCAPS_H */
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 1c406e8ae27e..9a555809b89c 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -705,6 +705,11 @@ static inline bool system_supports_generic_auth(void)
cpus_have_const_cap(ARM64_HAS_GENERIC_AUTH);
}
+static inline bool system_has_full_ptr_auth(void)
+{
+ return system_supports_address_auth() && system_supports_generic_auth();
+}
+
static __always_inline bool system_uses_irq_prio_masking(void)
{
return IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) &&
diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h
new file mode 100644
index 000000000000..a7f5a1bbc8ac
--- /dev/null
+++ b/arch/arm64/include/asm/el2_setup.h
@@ -0,0 +1,181 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2012,2013 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ */
+
+#ifndef __ARM_KVM_INIT_H__
+#define __ARM_KVM_INIT_H__
+
+#ifndef __ASSEMBLY__
+#error Assembly-only header
+#endif
+
+#include <asm/kvm_arm.h>
+#include <asm/ptrace.h>
+#include <asm/sysreg.h>
+#include <linux/irqchip/arm-gic-v3.h>
+
+.macro __init_el2_sctlr
+ mov_q x0, INIT_SCTLR_EL2_MMU_OFF
+ msr sctlr_el2, x0
+ isb
+.endm
+
+/*
+ * Allow Non-secure EL1 and EL0 to access physical timer and counter.
+ * This is not necessary for VHE, since the host kernel runs in EL2,
+ * and EL0 accesses are configured in the later stage of boot process.
+ * Note that when HCR_EL2.E2H == 1, CNTHCTL_EL2 has the same bit layout
+ * as CNTKCTL_EL1, and CNTKCTL_EL1 accessing instructions are redefined
+ * to access CNTHCTL_EL2. This allows the kernel designed to run at EL1
+ * to transparently mess with the EL0 bits via CNTKCTL_EL1 access in
+ * EL2.
+ */
+.macro __init_el2_timers mode
+.ifeqs "\mode", "nvhe"
+ mrs x0, cnthctl_el2
+ orr x0, x0, #3 // Enable EL1 physical timers
+ msr cnthctl_el2, x0
+.endif
+ msr cntvoff_el2, xzr // Clear virtual offset
+.endm
+
+.macro __init_el2_debug mode
+ mrs x1, id_aa64dfr0_el1
+ sbfx x0, x1, #ID_AA64DFR0_PMUVER_SHIFT, #4
+ cmp x0, #1
+ b.lt 1f // Skip if no PMU present
+ mrs x0, pmcr_el0 // Disable debug access traps
+ ubfx x0, x0, #11, #5 // to EL2 and allow access to
+1:
+ csel x2, xzr, x0, lt // all PMU counters from EL1
+
+ /* Statistical profiling */
+ ubfx x0, x1, #ID_AA64DFR0_PMSVER_SHIFT, #4
+ cbz x0, 3f // Skip if SPE not present
+
+.ifeqs "\mode", "nvhe"
+ mrs_s x0, SYS_PMBIDR_EL1 // If SPE available at EL2,
+ and x0, x0, #(1 << SYS_PMBIDR_EL1_P_SHIFT)
+ cbnz x0, 2f // then permit sampling of physical
+ mov x0, #(1 << SYS_PMSCR_EL2_PCT_SHIFT | \
+ 1 << SYS_PMSCR_EL2_PA_SHIFT)
+ msr_s SYS_PMSCR_EL2, x0 // addresses and physical counter
+2:
+ mov x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT)
+ orr x2, x2, x0 // If we don't have VHE, then
+ // use EL1&0 translation.
+.else
+ orr x2, x2, #MDCR_EL2_TPMS // For VHE, use EL2 translation
+ // and disable access from EL1
+.endif
+
+3:
+ msr mdcr_el2, x2 // Configure debug traps
+.endm
+
+/* LORegions */
+.macro __init_el2_lor
+ mrs x1, id_aa64mmfr1_el1
+ ubfx x0, x1, #ID_AA64MMFR1_LOR_SHIFT, 4
+ cbz x0, 1f
+ msr_s SYS_LORC_EL1, xzr
+1:
+.endm
+
+/* Stage-2 translation */
+.macro __init_el2_stage2
+ msr vttbr_el2, xzr
+.endm
+
+/* GICv3 system register access */
+.macro __init_el2_gicv3
+ mrs x0, id_aa64pfr0_el1
+ ubfx x0, x0, #ID_AA64PFR0_GIC_SHIFT, #4
+ cbz x0, 1f
+
+ mrs_s x0, SYS_ICC_SRE_EL2
+ orr x0, x0, #ICC_SRE_EL2_SRE // Set ICC_SRE_EL2.SRE==1
+ orr x0, x0, #ICC_SRE_EL2_ENABLE // Set ICC_SRE_EL2.Enable==1
+ msr_s SYS_ICC_SRE_EL2, x0
+ isb // Make sure SRE is now set
+ mrs_s x0, SYS_ICC_SRE_EL2 // Read SRE back,
+ tbz x0, #0, 1f // and check that it sticks
+ msr_s SYS_ICH_HCR_EL2, xzr // Reset ICC_HCR_EL2 to defaults
+1:
+.endm
+
+.macro __init_el2_hstr
+ msr hstr_el2, xzr // Disable CP15 traps to EL2
+.endm
+
+/* Virtual CPU ID registers */
+.macro __init_el2_nvhe_idregs
+ mrs x0, midr_el1
+ mrs x1, mpidr_el1
+ msr vpidr_el2, x0
+ msr vmpidr_el2, x1
+.endm
+
+/* Coprocessor traps */
+.macro __init_el2_nvhe_cptr
+ mov x0, #0x33ff
+ msr cptr_el2, x0 // Disable copro. traps to EL2
+.endm
+
+/* SVE register access */
+.macro __init_el2_nvhe_sve
+ mrs x1, id_aa64pfr0_el1
+ ubfx x1, x1, #ID_AA64PFR0_SVE_SHIFT, #4
+ cbz x1, 1f
+
+ bic x0, x0, #CPTR_EL2_TZ // Also disable SVE traps
+ msr cptr_el2, x0 // Disable copro. traps to EL2
+ isb
+ mov x1, #ZCR_ELx_LEN_MASK // SVE: Enable full vector
+ msr_s SYS_ZCR_EL2, x1 // length for EL1.
+1:
+.endm
+
+.macro __init_el2_nvhe_prepare_eret
+ mov x0, #INIT_PSTATE_EL1
+ msr spsr_el2, x0
+.endm
+
+/**
+ * Initialize EL2 registers to sane values. This should be called early on all
+ * cores that were booted in EL2.
+ *
+ * Regs: x0, x1 and x2 are clobbered.
+ */
+.macro init_el2_state mode
+.ifnes "\mode", "vhe"
+.ifnes "\mode", "nvhe"
+.error "Invalid 'mode' argument"
+.endif
+.endif
+
+ __init_el2_sctlr
+ __init_el2_timers \mode
+ __init_el2_debug \mode
+ __init_el2_lor
+ __init_el2_stage2
+ __init_el2_gicv3
+ __init_el2_hstr
+
+ /*
+ * When VHE is not in use, early init of EL2 needs to be done here.
+ * When VHE _is_ in use, EL1 will not be used in the host and
+ * requires no configuration, and all non-hyp-specific EL2 setup
+ * will be done via the _EL1 system register aliases in __cpu_setup.
+ */
+.ifeqs "\mode", "nvhe"
+ __init_el2_nvhe_idregs
+ __init_el2_nvhe_cptr
+ __init_el2_nvhe_sve
+ __init_el2_nvhe_prepare_eret
+.endif
+.endm
+
+#endif /* __ARM_KVM_INIT_H__ */
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index 64ce29378467..4e90c2debf70 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -80,6 +80,7 @@
HCR_FMO | HCR_IMO | HCR_PTW )
#define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF)
#define HCR_HOST_NVHE_FLAGS (HCR_RW | HCR_API | HCR_APK | HCR_ATA)
+#define HCR_HOST_NVHE_PROTECTED_FLAGS (HCR_HOST_NVHE_FLAGS | HCR_TSC)
#define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H)
/* TCR_EL2 Registers bits */
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 8e5fa28b78c2..8a33d83ea843 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -34,8 +34,6 @@
*/
#define KVM_VECTOR_PREAMBLE (2 * AARCH64_INSN_SIZE)
-#define __SMCCC_WORKAROUND_1_SMC_SZ 36
-
#define KVM_HOST_SMCCC_ID(id) \
ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \
ARM_SMCCC_SMC_64, \
@@ -150,6 +148,14 @@ extern void *__vhe_undefined_symbol;
#endif
+struct kvm_nvhe_init_params {
+ unsigned long mair_el2;
+ unsigned long tcr_el2;
+ unsigned long tpidr_el2;
+ unsigned long stack_hyp_va;
+ phys_addr_t pgd_pa;
+};
+
/* Translate a kernel address @ptr into its equivalent linear mapping */
#define kvm_ksym_ref(ptr) \
({ \
@@ -165,17 +171,14 @@ struct kvm_vcpu;
struct kvm_s2_mmu;
DECLARE_KVM_NVHE_SYM(__kvm_hyp_init);
-DECLARE_KVM_NVHE_SYM(__kvm_hyp_host_vector);
DECLARE_KVM_HYP_SYM(__kvm_hyp_vector);
#define __kvm_hyp_init CHOOSE_NVHE_SYM(__kvm_hyp_init)
-#define __kvm_hyp_host_vector CHOOSE_NVHE_SYM(__kvm_hyp_host_vector)
#define __kvm_hyp_vector CHOOSE_HYP_SYM(__kvm_hyp_vector)
extern unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
DECLARE_KVM_NVHE_SYM(__per_cpu_start);
DECLARE_KVM_NVHE_SYM(__per_cpu_end);
-extern atomic_t arm64_el2_vector_last_slot;
DECLARE_KVM_HYP_SYM(__bp_harden_hyp_vecs);
#define __bp_harden_hyp_vecs CHOOSE_HYP_SYM(__bp_harden_hyp_vecs)
@@ -189,8 +192,6 @@ extern void __kvm_timer_set_cntvoff(u64 cntvoff);
extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
-extern void __kvm_enable_ssbs(void);
-
extern u64 __vgic_v3_get_ich_vtr_el2(void);
extern u64 __vgic_v3_read_vmcr(void);
extern void __vgic_v3_write_vmcr(u32 vmcr);
@@ -198,8 +199,6 @@ extern void __vgic_v3_init_lrs(void);
extern u32 __kvm_get_mdcr_el2(void);
-extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ];
-
#if defined(GCC_VERSION) && GCC_VERSION < 50000
#define SYM_CONSTRAINT "i"
#else
diff --git a/arch/arm64/include/asm/kvm_coproc.h b/arch/arm64/include/asm/kvm_coproc.h
deleted file mode 100644
index d6bb40122fdb..000000000000
--- a/arch/arm64/include/asm/kvm_coproc.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2012,2013 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * Derived from arch/arm/include/asm/kvm_coproc.h
- * Copyright (C) 2012 Rusty Russell IBM Corporation
- */
-
-#ifndef __ARM64_KVM_COPROC_H__
-#define __ARM64_KVM_COPROC_H__
-
-#include <linux/kvm_host.h>
-
-void kvm_reset_sys_regs(struct kvm_vcpu *vcpu);
-
-struct kvm_sys_reg_table {
- const struct sys_reg_desc *table;
- size_t num;
-};
-
-int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu);
-int kvm_handle_cp14_32(struct kvm_vcpu *vcpu);
-int kvm_handle_cp14_64(struct kvm_vcpu *vcpu);
-int kvm_handle_cp15_32(struct kvm_vcpu *vcpu);
-int kvm_handle_cp15_64(struct kvm_vcpu *vcpu);
-int kvm_handle_sys_reg(struct kvm_vcpu *vcpu);
-
-#define kvm_coproc_table_init kvm_sys_reg_table_init
-void kvm_sys_reg_table_init(void);
-
-struct kvm_one_reg;
-int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices);
-int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *);
-int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *);
-unsigned long kvm_arm_num_sys_reg_descs(struct kvm_vcpu *vcpu);
-
-#endif /* __ARM64_KVM_COPROC_H__ */
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 00bc6f1234ba..f612c090f2e4 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -21,20 +21,25 @@
#include <asm/cputype.h>
#include <asm/virt.h>
-unsigned long *vcpu_reg32(const struct kvm_vcpu *vcpu, u8 reg_num);
-unsigned long vcpu_read_spsr32(const struct kvm_vcpu *vcpu);
-void vcpu_write_spsr32(struct kvm_vcpu *vcpu, unsigned long v);
+#define CURRENT_EL_SP_EL0_VECTOR 0x0
+#define CURRENT_EL_SP_ELx_VECTOR 0x200
+#define LOWER_EL_AArch64_VECTOR 0x400
+#define LOWER_EL_AArch32_VECTOR 0x600
+
+enum exception_type {
+ except_type_sync = 0,
+ except_type_irq = 0x80,
+ except_type_fiq = 0x100,
+ except_type_serror = 0x180,
+};
bool kvm_condition_valid32(const struct kvm_vcpu *vcpu);
-void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr);
+void kvm_skip_instr32(struct kvm_vcpu *vcpu);
void kvm_inject_undefined(struct kvm_vcpu *vcpu);
void kvm_inject_vabt(struct kvm_vcpu *vcpu);
void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr);
void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr);
-void kvm_inject_undef32(struct kvm_vcpu *vcpu);
-void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr);
-void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr);
static __always_inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
{
@@ -168,30 +173,6 @@ static __always_inline void vcpu_set_reg(struct kvm_vcpu *vcpu, u8 reg_num,
vcpu_gp_regs(vcpu)->regs[reg_num] = val;
}
-static inline unsigned long vcpu_read_spsr(const struct kvm_vcpu *vcpu)
-{
- if (vcpu_mode_is_32bit(vcpu))
- return vcpu_read_spsr32(vcpu);
-
- if (vcpu->arch.sysregs_loaded_on_cpu)
- return read_sysreg_el1(SYS_SPSR);
- else
- return __vcpu_sys_reg(vcpu, SPSR_EL1);
-}
-
-static inline void vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long v)
-{
- if (vcpu_mode_is_32bit(vcpu)) {
- vcpu_write_spsr32(vcpu, v);
- return;
- }
-
- if (vcpu->arch.sysregs_loaded_on_cpu)
- write_sysreg_el1(v, SYS_SPSR);
- else
- __vcpu_sys_reg(vcpu, SPSR_EL1) = v;
-}
-
/*
* The layout of SPSR for an AArch32 state is different when observed from an
* AArch64 SPSR_ELx or an AArch32 SPSR_*. This function generates the AArch32
@@ -477,32 +458,9 @@ static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu,
return data; /* Leave LE untouched */
}
-static __always_inline void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr)
-{
- if (vcpu_mode_is_32bit(vcpu)) {
- kvm_skip_instr32(vcpu, is_wide_instr);
- } else {
- *vcpu_pc(vcpu) += 4;
- *vcpu_cpsr(vcpu) &= ~PSR_BTYPE_MASK;
- }
-
- /* advance the singlestep state machine */
- *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS;
-}
-
-/*
- * Skip an instruction which has been emulated at hyp while most guest sysregs
- * are live.
- */
-static __always_inline void __kvm_skip_instr(struct kvm_vcpu *vcpu)
+static __always_inline void kvm_incr_pc(struct kvm_vcpu *vcpu)
{
- *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR);
- vcpu_gp_regs(vcpu)->pstate = read_sysreg_el2(SYS_SPSR);
-
- kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
-
- write_sysreg_el2(vcpu_gp_regs(vcpu)->pstate, SYS_SPSR);
- write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR);
+ vcpu->arch.flags |= KVM_ARM64_INCREMENT_PC;
}
#endif /* __ARM64_KVM_EMULATE_H__ */
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 0cd9f0f75c13..11beda85ee7e 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -50,6 +50,16 @@
#define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
KVM_DIRTY_LOG_INITIALLY_SET)
+/*
+ * Mode of operation configurable with kvm-arm.mode early param.
+ * See Documentation/admin-guide/kernel-parameters.txt for more information.
+ */
+enum kvm_mode {
+ KVM_MODE_DEFAULT,
+ KVM_MODE_PROTECTED,
+};
+enum kvm_mode kvm_get_mode(void);
+
DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
extern unsigned int kvm_sve_max_vl;
@@ -58,8 +68,6 @@ int kvm_arm_init_sve(void);
int __attribute_const__ kvm_target_cpu(void);
int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu);
-int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext);
-void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start);
struct kvm_vmid {
/* The VMID generation used for the virt. memory system */
@@ -89,6 +97,9 @@ struct kvm_s2_mmu {
struct kvm *kvm;
};
+struct kvm_arch_memory_slot {
+};
+
struct kvm_arch {
struct kvm_s2_mmu mmu;
@@ -120,6 +131,7 @@ struct kvm_arch {
unsigned int pmuver;
u8 pfr0_csv2;
+ u8 pfr0_csv3;
};
struct kvm_vcpu_fault_info {
@@ -203,48 +215,6 @@ enum vcpu_sysreg {
NR_SYS_REGS /* Nothing after this line! */
};
-/* 32bit mapping */
-#define c0_MPIDR (MPIDR_EL1 * 2) /* MultiProcessor ID Register */
-#define c0_CSSELR (CSSELR_EL1 * 2)/* Cache Size Selection Register */
-#define c1_SCTLR (SCTLR_EL1 * 2) /* System Control Register */
-#define c1_ACTLR (ACTLR_EL1 * 2) /* Auxiliary Control Register */
-#define c1_CPACR (CPACR_EL1 * 2) /* Coprocessor Access Control */
-#define c2_TTBR0 (TTBR0_EL1 * 2) /* Translation Table Base Register 0 */
-#define c2_TTBR0_high (c2_TTBR0 + 1) /* TTBR0 top 32 bits */
-#define c2_TTBR1 (TTBR1_EL1 * 2) /* Translation Table Base Register 1 */
-#define c2_TTBR1_high (c2_TTBR1 + 1) /* TTBR1 top 32 bits */
-#define c2_TTBCR (TCR_EL1 * 2) /* Translation Table Base Control R. */
-#define c3_DACR (DACR32_EL2 * 2)/* Domain Access Control Register */
-#define c5_DFSR (ESR_EL1 * 2) /* Data Fault Status Register */
-#define c5_IFSR (IFSR32_EL2 * 2)/* Instruction Fault Status Register */
-#define c5_ADFSR (AFSR0_EL1 * 2) /* Auxiliary Data Fault Status R */
-#define c5_AIFSR (AFSR1_EL1 * 2) /* Auxiliary Instr Fault Status R */
-#define c6_DFAR (FAR_EL1 * 2) /* Data Fault Address Register */
-#define c6_IFAR (c6_DFAR + 1) /* Instruction Fault Address Register */
-#define c7_PAR (PAR_EL1 * 2) /* Physical Address Register */
-#define c7_PAR_high (c7_PAR + 1) /* PAR top 32 bits */
-#define c10_PRRR (MAIR_EL1 * 2) /* Primary Region Remap Register */
-#define c10_NMRR (c10_PRRR + 1) /* Normal Memory Remap Register */
-#define c12_VBAR (VBAR_EL1 * 2) /* Vector Base Address Register */
-#define c13_CID (CONTEXTIDR_EL1 * 2) /* Context ID Register */
-#define c13_TID_URW (TPIDR_EL0 * 2) /* Thread ID, User R/W */
-#define c13_TID_URO (TPIDRRO_EL0 * 2)/* Thread ID, User R/O */
-#define c13_TID_PRIV (TPIDR_EL1 * 2) /* Thread ID, Privileged */
-#define c10_AMAIR0 (AMAIR_EL1 * 2) /* Aux Memory Attr Indirection Reg */
-#define c10_AMAIR1 (c10_AMAIR0 + 1)/* Aux Memory Attr Indirection Reg */
-#define c14_CNTKCTL (CNTKCTL_EL1 * 2) /* Timer Control Register (PL1) */
-
-#define cp14_DBGDSCRext (MDSCR_EL1 * 2)
-#define cp14_DBGBCR0 (DBGBCR0_EL1 * 2)
-#define cp14_DBGBVR0 (DBGBVR0_EL1 * 2)
-#define cp14_DBGBXVR0 (cp14_DBGBVR0 + 1)
-#define cp14_DBGWCR0 (DBGWCR0_EL1 * 2)
-#define cp14_DBGWVR0 (DBGWVR0_EL1 * 2)
-#define cp14_DBGDCCINT (MDCCINT_EL1 * 2)
-#define cp14_DBGVCR (DBGVCR32_EL2 * 2)
-
-#define NR_COPRO_REGS (NR_SYS_REGS * 2)
-
struct kvm_cpu_context {
struct user_pt_regs regs; /* sp = sp_el0 */
@@ -255,10 +225,7 @@ struct kvm_cpu_context {
struct user_fpsimd_state fp_regs;
- union {
- u64 sys_regs[NR_SYS_REGS];
- u32 copro[NR_COPRO_REGS];
- };
+ u64 sys_regs[NR_SYS_REGS];
struct kvm_vcpu *__hyp_running_vcpu;
};
@@ -409,8 +376,33 @@ struct kvm_vcpu_arch {
#define KVM_ARM64_GUEST_HAS_SVE (1 << 5) /* SVE exposed to guest */
#define KVM_ARM64_VCPU_SVE_FINALIZED (1 << 6) /* SVE config completed */
#define KVM_ARM64_GUEST_HAS_PTRAUTH (1 << 7) /* PTRAUTH exposed to guest */
+#define KVM_ARM64_PENDING_EXCEPTION (1 << 8) /* Exception pending */
+#define KVM_ARM64_EXCEPT_MASK (7 << 9) /* Target EL/MODE */
-#define vcpu_has_sve(vcpu) (system_supports_sve() && \
+/*
+ * When KVM_ARM64_PENDING_EXCEPTION is set, KVM_ARM64_EXCEPT_MASK can
+ * take the following values:
+ *
+ * For AArch32 EL1:
+ */
+#define KVM_ARM64_EXCEPT_AA32_UND (0 << 9)
+#define KVM_ARM64_EXCEPT_AA32_IABT (1 << 9)
+#define KVM_ARM64_EXCEPT_AA32_DABT (2 << 9)
+/* For AArch64: */
+#define KVM_ARM64_EXCEPT_AA64_ELx_SYNC (0 << 9)
+#define KVM_ARM64_EXCEPT_AA64_ELx_IRQ (1 << 9)
+#define KVM_ARM64_EXCEPT_AA64_ELx_FIQ (2 << 9)
+#define KVM_ARM64_EXCEPT_AA64_ELx_SERR (3 << 9)
+#define KVM_ARM64_EXCEPT_AA64_EL1 (0 << 11)
+#define KVM_ARM64_EXCEPT_AA64_EL2 (1 << 11)
+
+/*
+ * Overlaps with KVM_ARM64_EXCEPT_MASK on purpose so that it can't be
+ * set together with an exception...
+ */
+#define KVM_ARM64_INCREMENT_PC (1 << 9) /* Increment PC */
+
+#define vcpu_has_sve(vcpu) (system_supports_sve() && \
((vcpu)->arch.flags & KVM_ARM64_GUEST_HAS_SVE))
#ifdef CONFIG_ARM64_PTR_AUTH
@@ -440,14 +432,96 @@ struct kvm_vcpu_arch {
u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg);
void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg);
-/*
- * CP14 and CP15 live in the same array, as they are backed by the
- * same system registers.
- */
-#define CPx_BIAS IS_ENABLED(CONFIG_CPU_BIG_ENDIAN)
+static inline bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val)
+{
+ /*
+ * *** VHE ONLY ***
+ *
+ * System registers listed in the switch are not saved on every
+ * exit from the guest but are only saved on vcpu_put.
+ *
+ * Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but
+ * should never be listed below, because the guest cannot modify its
+ * own MPIDR_EL1 and MPIDR_EL1 is accessed for VCPU A from VCPU B's
+ * thread when emulating cross-VCPU communication.
+ */
+ if (!has_vhe())
+ return false;
+
+ switch (reg) {
+ case CSSELR_EL1: *val = read_sysreg_s(SYS_CSSELR_EL1); break;
+ case SCTLR_EL1: *val = read_sysreg_s(SYS_SCTLR_EL12); break;
+ case CPACR_EL1: *val = read_sysreg_s(SYS_CPACR_EL12); break;
+ case TTBR0_EL1: *val = read_sysreg_s(SYS_TTBR0_EL12); break;
+ case TTBR1_EL1: *val = read_sysreg_s(SYS_TTBR1_EL12); break;
+ case TCR_EL1: *val = read_sysreg_s(SYS_TCR_EL12); break;
+ case ESR_EL1: *val = read_sysreg_s(SYS_ESR_EL12); break;
+ case AFSR0_EL1: *val = read_sysreg_s(SYS_AFSR0_EL12); break;
+ case AFSR1_EL1: *val = read_sysreg_s(SYS_AFSR1_EL12); break;
+ case FAR_EL1: *val = read_sysreg_s(SYS_FAR_EL12); break;
+ case MAIR_EL1: *val = read_sysreg_s(SYS_MAIR_EL12); break;
+ case VBAR_EL1: *val = read_sysreg_s(SYS_VBAR_EL12); break;
+ case CONTEXTIDR_EL1: *val = read_sysreg_s(SYS_CONTEXTIDR_EL12);break;
+ case TPIDR_EL0: *val = read_sysreg_s(SYS_TPIDR_EL0); break;
+ case TPIDRRO_EL0: *val = read_sysreg_s(SYS_TPIDRRO_EL0); break;
+ case TPIDR_EL1: *val = read_sysreg_s(SYS_TPIDR_EL1); break;
+ case AMAIR_EL1: *val = read_sysreg_s(SYS_AMAIR_EL12); break;
+ case CNTKCTL_EL1: *val = read_sysreg_s(SYS_CNTKCTL_EL12); break;
+ case ELR_EL1: *val = read_sysreg_s(SYS_ELR_EL12); break;
+ case PAR_EL1: *val = read_sysreg_par(); break;
+ case DACR32_EL2: *val = read_sysreg_s(SYS_DACR32_EL2); break;
+ case IFSR32_EL2: *val = read_sysreg_s(SYS_IFSR32_EL2); break;
+ case DBGVCR32_EL2: *val = read_sysreg_s(SYS_DBGVCR32_EL2); break;
+ default: return false;
+ }
+
+ return true;
+}
-#define vcpu_cp14(v,r) ((v)->arch.ctxt.copro[(r) ^ CPx_BIAS])
-#define vcpu_cp15(v,r) ((v)->arch.ctxt.copro[(r) ^ CPx_BIAS])
+static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg)
+{
+ /*
+ * *** VHE ONLY ***
+ *
+ * System registers listed in the switch are not restored on every
+ * entry to the guest but are only restored on vcpu_load.
+ *
+ * Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but
+ * should never be listed below, because the MPIDR should only be set
+ * once, before running the VCPU, and never changed later.
+ */
+ if (!has_vhe())
+ return false;
+
+ switch (reg) {
+ case CSSELR_EL1: write_sysreg_s(val, SYS_CSSELR_EL1); break;
+ case SCTLR_EL1: write_sysreg_s(val, SYS_SCTLR_EL12); break;
+ case CPACR_EL1: write_sysreg_s(val, SYS_CPACR_EL12); break;
+ case TTBR0_EL1: write_sysreg_s(val, SYS_TTBR0_EL12); break;
+ case TTBR1_EL1: write_sysreg_s(val, SYS_TTBR1_EL12); break;
+ case TCR_EL1: write_sysreg_s(val, SYS_TCR_EL12); break;
+ case ESR_EL1: write_sysreg_s(val, SYS_ESR_EL12); break;
+ case AFSR0_EL1: write_sysreg_s(val, SYS_AFSR0_EL12); break;
+ case AFSR1_EL1: write_sysreg_s(val, SYS_AFSR1_EL12); break;
+ case FAR_EL1: write_sysreg_s(val, SYS_FAR_EL12); break;
+ case MAIR_EL1: write_sysreg_s(val, SYS_MAIR_EL12); break;
+ case VBAR_EL1: write_sysreg_s(val, SYS_VBAR_EL12); break;
+ case CONTEXTIDR_EL1: write_sysreg_s(val, SYS_CONTEXTIDR_EL12);break;
+ case TPIDR_EL0: write_sysreg_s(val, SYS_TPIDR_EL0); break;
+ case TPIDRRO_EL0: write_sysreg_s(val, SYS_TPIDRRO_EL0); break;
+ case TPIDR_EL1: write_sysreg_s(val, SYS_TPIDR_EL1); break;
+ case AMAIR_EL1: write_sysreg_s(val, SYS_AMAIR_EL12); break;
+ case CNTKCTL_EL1: write_sysreg_s(val, SYS_CNTKCTL_EL12); break;
+ case ELR_EL1: write_sysreg_s(val, SYS_ELR_EL12); break;
+ case PAR_EL1: write_sysreg_s(val, SYS_PAR_EL1); break;
+ case DACR32_EL2: write_sysreg_s(val, SYS_DACR32_EL2); break;
+ case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); break;
+ case DBGVCR32_EL2: write_sysreg_s(val, SYS_DBGVCR32_EL2); break;
+ default: return false;
+ }
+
+ return true;
+}
struct kvm_vm_stat {
ulong remote_tlb_flush;
@@ -473,6 +547,12 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu);
int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices);
int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg);
int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg);
+
+unsigned long kvm_arm_num_sys_reg_descs(struct kvm_vcpu *vcpu);
+int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices);
+int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *);
+int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *);
+
int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events);
@@ -535,6 +615,17 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
int handle_exit(struct kvm_vcpu *vcpu, int exception_index);
void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index);
+int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu);
+int kvm_handle_cp14_32(struct kvm_vcpu *vcpu);
+int kvm_handle_cp14_64(struct kvm_vcpu *vcpu);
+int kvm_handle_cp15_32(struct kvm_vcpu *vcpu);
+int kvm_handle_cp15_64(struct kvm_vcpu *vcpu);
+int kvm_handle_sys_reg(struct kvm_vcpu *vcpu);
+
+void kvm_reset_sys_regs(struct kvm_vcpu *vcpu);
+
+void kvm_sys_reg_table_init(void);
+
/* MMIO helpers */
void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data);
unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len);
@@ -654,4 +745,7 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);
#define kvm_arm_vcpu_sve_finalized(vcpu) \
((vcpu)->arch.flags & KVM_ARM64_VCPU_SVE_FINALIZED)
+#define kvm_vcpu_has_pmu(vcpu) \
+ (test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features))
+
#endif /* __ARM64_KVM_HOST_H__ */
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 6b664de5ec1f..c0450828378b 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -14,6 +14,7 @@
DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
DECLARE_PER_CPU(unsigned long, kvm_hyp_vector);
+DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
#define read_sysreg_elx(r,nvh,vh) \
({ \
@@ -92,10 +93,11 @@ void deactivate_traps_vhe_put(void);
u64 __guest_enter(struct kvm_vcpu *vcpu);
+bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt);
+
void __noreturn hyp_panic(void);
#ifdef __KVM_NVHE_HYPERVISOR__
void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par);
#endif
#endif /* __ARM64_KVM_HYP_H__ */
-
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 331394306cce..e52d82aeadca 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -72,6 +72,52 @@ alternative_cb kvm_update_va_mask
alternative_cb_end
.endm
+/*
+ * Convert a kernel image address to a PA
+ * reg: kernel address to be converted in place
+ * tmp: temporary register
+ *
+ * The actual code generation takes place in kvm_get_kimage_voffset, and
+ * the instructions below are only there to reserve the space and
+ * perform the register allocation (kvm_get_kimage_voffset uses the
+ * specific registers encoded in the instructions).
+ */
+.macro kimg_pa reg, tmp
+alternative_cb kvm_get_kimage_voffset
+ movz \tmp, #0
+ movk \tmp, #0, lsl #16
+ movk \tmp, #0, lsl #32
+ movk \tmp, #0, lsl #48
+alternative_cb_end
+
+ /* reg = __pa(reg) */
+ sub \reg, \reg, \tmp
+.endm
+
+/*
+ * Convert a kernel image address to a hyp VA
+ * reg: kernel address to be converted in place
+ * tmp: temporary register
+ *
+ * The actual code generation takes place in kvm_get_kimage_voffset, and
+ * the instructions below are only there to reserve the space and
+ * perform the register allocation (kvm_update_kimg_phys_offset uses the
+ * specific registers encoded in the instructions).
+ */
+.macro kimg_hyp_va reg, tmp
+alternative_cb kvm_update_kimg_phys_offset
+ movz \tmp, #0
+ movk \tmp, #0, lsl #16
+ movk \tmp, #0, lsl #32
+ movk \tmp, #0, lsl #48
+alternative_cb_end
+
+ sub \reg, \reg, \tmp
+ mov_q \tmp, PAGE_OFFSET
+ orr \reg, \reg, \tmp
+ kern_hyp_va \reg
+.endm
+
#else
#include <linux/pgtable.h>
@@ -98,6 +144,24 @@ static __always_inline unsigned long __kern_hyp_va(unsigned long v)
#define kern_hyp_va(v) ((typeof(v))(__kern_hyp_va((unsigned long)(v))))
+static __always_inline unsigned long __kimg_hyp_va(unsigned long v)
+{
+ unsigned long offset;
+
+ asm volatile(ALTERNATIVE_CB("movz %0, #0\n"
+ "movk %0, #0, lsl #16\n"
+ "movk %0, #0, lsl #32\n"
+ "movk %0, #0, lsl #48\n",
+ kvm_update_kimg_phys_offset)
+ : "=r" (offset));
+
+ return __kern_hyp_va((v - offset) | PAGE_OFFSET);
+}
+
+#define kimg_fn_hyp_va(v) ((typeof(*v))(__kimg_hyp_va((unsigned long)(v))))
+
+#define kimg_fn_ptr(x) (typeof(x) **)(x)
+
/*
* We currently support using a VM-specified IPA size. For backward
* compatibility, the default IPA size is fixed to 40bits.
@@ -208,52 +272,6 @@ static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
return ret;
}
-/*
- * EL2 vectors can be mapped and rerouted in a number of ways,
- * depending on the kernel configuration and CPU present:
- *
- * - If the CPU is affected by Spectre-v2, the hardening sequence is
- * placed in one of the vector slots, which is executed before jumping
- * to the real vectors.
- *
- * - If the CPU also has the ARM64_HARDEN_EL2_VECTORS cap, the slot
- * containing the hardening sequence is mapped next to the idmap page,
- * and executed before jumping to the real vectors.
- *
- * - If the CPU only has the ARM64_HARDEN_EL2_VECTORS cap, then an
- * empty slot is selected, mapped next to the idmap page, and
- * executed before jumping to the real vectors.
- *
- * Note that ARM64_HARDEN_EL2_VECTORS is somewhat incompatible with
- * VHE, as we don't have hypervisor-specific mappings. If the system
- * is VHE and yet selects this capability, it will be ignored.
- */
-extern void *__kvm_bp_vect_base;
-extern int __kvm_harden_el2_vector_slot;
-
-static inline void *kvm_get_hyp_vector(void)
-{
- struct bp_hardening_data *data = arm64_get_bp_hardening_data();
- void *vect = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector));
- int slot = -1;
-
- if (cpus_have_const_cap(ARM64_SPECTRE_V2) && data->fn) {
- vect = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs));
- slot = data->hyp_vectors_slot;
- }
-
- if (this_cpu_has_cap(ARM64_HARDEN_EL2_VECTORS) && !has_vhe()) {
- vect = __kvm_bp_vect_base;
- if (slot == -1)
- slot = __kvm_harden_el2_vector_slot;
- }
-
- if (slot != -1)
- vect += slot * SZ_2K;
-
- return vect;
-}
-
#define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr)
static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu)
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index b2e91c187e2a..75beffe2ee8a 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -12,9 +12,6 @@
#define USER_ASID_FLAG (UL(1) << USER_ASID_BIT)
#define TTBR_ASID_MASK (UL(0xffff) << 48)
-#define BP_HARDEN_EL2_SLOTS 4
-#define __BP_HARDEN_HYP_VECS_SZ (BP_HARDEN_EL2_SLOTS * SZ_2K)
-
#ifndef __ASSEMBLY__
#include <linux/refcount.h>
@@ -41,32 +38,6 @@ static inline bool arm64_kernel_unmapped_at_el0(void)
return cpus_have_const_cap(ARM64_UNMAP_KERNEL_AT_EL0);
}
-typedef void (*bp_hardening_cb_t)(void);
-
-struct bp_hardening_data {
- int hyp_vectors_slot;
- bp_hardening_cb_t fn;
-};
-
-DECLARE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
-
-static inline struct bp_hardening_data *arm64_get_bp_hardening_data(void)
-{
- return this_cpu_ptr(&bp_hardening_data);
-}
-
-static inline void arm64_apply_bp_hardening(void)
-{
- struct bp_hardening_data *d;
-
- if (!cpus_have_const_cap(ARM64_SPECTRE_V2))
- return;
-
- d = arm64_get_bp_hardening_data();
- if (d->fn)
- d->fn();
-}
-
extern void arm64_memblock_init(void);
extern void paging_init(void);
extern void bootmem_init(void);
diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h
index 1599e17379d8..8f1661603b78 100644
--- a/arch/arm64/include/asm/percpu.h
+++ b/arch/arm64/include/asm/percpu.h
@@ -239,6 +239,12 @@ PERCPU_RET_OP(add, add, ldadd)
#define this_cpu_cmpxchg_8(pcp, o, n) \
_pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
+#ifdef __KVM_NVHE_HYPERVISOR__
+extern unsigned long __hyp_per_cpu_offset(unsigned int cpu);
+#define __per_cpu_offset
+#define per_cpu_offset(cpu) __hyp_per_cpu_offset((cpu))
+#endif
+
#include <asm-generic/percpu.h>
/* Redefine macros for nVHE hyp under DEBUG_PREEMPT to avoid its dependencies. */
diff --git a/arch/arm64/include/asm/sections.h b/arch/arm64/include/asm/sections.h
index 3994169985ef..8ff579361731 100644
--- a/arch/arm64/include/asm/sections.h
+++ b/arch/arm64/include/asm/sections.h
@@ -11,6 +11,7 @@ extern char __alt_instructions[], __alt_instructions_end[];
extern char __hibernate_exit_text_start[], __hibernate_exit_text_end[];
extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[];
extern char __hyp_text_start[], __hyp_text_end[];
+extern char __hyp_data_ro_after_init_start[], __hyp_data_ro_after_init_end[];
extern char __idmap_text_start[], __idmap_text_end[];
extern char __initdata_begin[], __initdata_end[];
extern char __inittext_begin[], __inittext_end[];
diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index 2e7f529ec5a6..bcb01ca15325 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -46,9 +46,9 @@ DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
* Logical CPU mapping.
*/
extern u64 __cpu_logical_map[NR_CPUS];
-extern u64 cpu_logical_map(int cpu);
+extern u64 cpu_logical_map(unsigned int cpu);
-static inline void set_cpu_logical_map(int cpu, u64 hwid)
+static inline void set_cpu_logical_map(unsigned int cpu, u64 hwid)
{
__cpu_logical_map[cpu] = hwid;
}
diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h
index fcdfbce302bd..f62ca39da6c5 100644
--- a/arch/arm64/include/asm/spectre.h
+++ b/arch/arm64/include/asm/spectre.h
@@ -9,7 +9,15 @@
#ifndef __ASM_SPECTRE_H
#define __ASM_SPECTRE_H
+#define BP_HARDEN_EL2_SLOTS 4
+#define __BP_HARDEN_HYP_VECS_SZ ((BP_HARDEN_EL2_SLOTS - 1) * SZ_2K)
+
+#ifndef __ASSEMBLY__
+
+#include <linux/percpu.h>
+
#include <asm/cpufeature.h>
+#include <asm/virt.h>
/* Watch out, ordering is important here. */
enum mitigation_state {
@@ -20,13 +28,70 @@ enum mitigation_state {
struct task_struct;
+/*
+ * Note: the order of this enum corresponds to __bp_harden_hyp_vecs and
+ * we rely on having the direct vectors first.
+ */
+enum arm64_hyp_spectre_vector {
+ /*
+ * Take exceptions directly to __kvm_hyp_vector. This must be
+ * 0 so that it used by default when mitigations are not needed.
+ */
+ HYP_VECTOR_DIRECT,
+
+ /*
+ * Bounce via a slot in the hypervisor text mapping of
+ * __bp_harden_hyp_vecs, which contains an SMC call.
+ */
+ HYP_VECTOR_SPECTRE_DIRECT,
+
+ /*
+ * Bounce via a slot in a special mapping of __bp_harden_hyp_vecs
+ * next to the idmap page.
+ */
+ HYP_VECTOR_INDIRECT,
+
+ /*
+ * Bounce via a slot in a special mapping of __bp_harden_hyp_vecs
+ * next to the idmap page, which contains an SMC call.
+ */
+ HYP_VECTOR_SPECTRE_INDIRECT,
+};
+
+typedef void (*bp_hardening_cb_t)(void);
+
+struct bp_hardening_data {
+ enum arm64_hyp_spectre_vector slot;
+ bp_hardening_cb_t fn;
+};
+
+DECLARE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
+
+static inline void arm64_apply_bp_hardening(void)
+{
+ struct bp_hardening_data *d;
+
+ if (!cpus_have_const_cap(ARM64_SPECTRE_V2))
+ return;
+
+ d = this_cpu_ptr(&bp_hardening_data);
+ if (d->fn)
+ d->fn();
+}
+
enum mitigation_state arm64_get_spectre_v2_state(void);
bool has_spectre_v2(const struct arm64_cpu_capabilities *cap, int scope);
void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused);
+bool has_spectre_v3a(const struct arm64_cpu_capabilities *cap, int scope);
+void spectre_v3a_enable_mitigation(const struct arm64_cpu_capabilities *__unused);
+
enum mitigation_state arm64_get_spectre_v4_state(void);
bool has_spectre_v4(const struct arm64_cpu_capabilities *cap, int scope);
void spectre_v4_enable_mitigation(const struct arm64_cpu_capabilities *__unused);
void spectre_v4_enable_task_mitigation(struct task_struct *tsk);
+enum mitigation_state arm64_get_meltdown_state(void);
+
+#endif /* __ASSEMBLY__ */
#endif /* __ASM_SPECTRE_H */
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index cf7922f23808..8b5e7e5c3cc8 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -469,6 +469,7 @@
#define SYS_PMCCFILTR_EL0 sys_reg(3, 3, 14, 15, 7)
+#define SYS_SCTLR_EL2 sys_reg(3, 4, 1, 0, 0)
#define SYS_ZCR_EL2 sys_reg(3, 4, 1, 2, 0)
#define SYS_DACR32_EL2 sys_reg(3, 4, 3, 0, 0)
#define SYS_SPSR_EL2 sys_reg(3, 4, 4, 0, 0)
diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h
index 6069be50baf9..ee6a48df89d9 100644
--- a/arch/arm64/include/asm/virt.h
+++ b/arch/arm64/include/asm/virt.h
@@ -65,9 +65,19 @@ extern u32 __boot_cpu_mode[2];
void __hyp_set_vectors(phys_addr_t phys_vector_base);
void __hyp_reset_vectors(void);
+DECLARE_STATIC_KEY_FALSE(kvm_protected_mode_initialized);
+
/* Reports the availability of HYP mode */
static inline bool is_hyp_mode_available(void)
{
+ /*
+ * If KVM protected mode is initialized, all CPUs must have been booted
+ * in EL2. Avoid checking __boot_cpu_mode as CPUs now come up in EL1.
+ */
+ if (IS_ENABLED(CONFIG_KVM) &&
+ static_branch_likely(&kvm_protected_mode_initialized))
+ return true;
+
return (__boot_cpu_mode[0] == BOOT_CPU_MODE_EL2 &&
__boot_cpu_mode[1] == BOOT_CPU_MODE_EL2);
}
@@ -75,6 +85,14 @@ static inline bool is_hyp_mode_available(void)
/* Check if the bootloader has booted CPUs in different modes */
static inline bool is_hyp_mode_mismatched(void)
{
+ /*
+ * If KVM protected mode is initialized, all CPUs must have been booted
+ * in EL2. Avoid checking __boot_cpu_mode as CPUs now come up in EL1.
+ */
+ if (IS_ENABLED(CONFIG_KVM) &&
+ static_branch_likely(&kvm_protected_mode_initialized))
+ return false;
+
return __boot_cpu_mode[0] != __boot_cpu_mode[1];
}
@@ -97,6 +115,14 @@ static __always_inline bool has_vhe(void)
return cpus_have_final_cap(ARM64_HAS_VIRT_HOST_EXTN);
}
+static __always_inline bool is_protected_kvm_enabled(void)
+{
+ if (is_vhe_hyp_code())
+ return false;
+ else
+ return cpus_have_final_cap(ARM64_KVM_PROTECTED_MODE);
+}
+
#endif /* __ASSEMBLY__ */
#endif /* ! __ASM__VIRT_H */
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 1c17c3a24411..24223adae150 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -156,9 +156,6 @@ struct kvm_sync_regs {
__u64 device_irq_level;
};
-struct kvm_arch_memory_slot {
-};
-
/*
* PMU filter structure. Describe a range of events with a particular
* action. To be used with KVM_ARM_VCPU_PMU_V3_FILTER.
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index 679b19b8a7ff..5e82488f1b82 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -109,6 +109,11 @@ int main(void)
DEFINE(CPU_APGAKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APGAKEYLO_EL1]));
DEFINE(HOST_CONTEXT_VCPU, offsetof(struct kvm_cpu_context, __hyp_running_vcpu));
DEFINE(HOST_DATA_CONTEXT, offsetof(struct kvm_host_data, host_ctxt));
+ DEFINE(NVHE_INIT_MAIR_EL2, offsetof(struct kvm_nvhe_init_params, mair_el2));
+ DEFINE(NVHE_INIT_TCR_EL2, offsetof(struct kvm_nvhe_init_params, tcr_el2));
+ DEFINE(NVHE_INIT_TPIDR_EL2, offsetof(struct kvm_nvhe_init_params, tpidr_el2));
+ DEFINE(NVHE_INIT_STACK_HYP_VA, offsetof(struct kvm_nvhe_init_params, stack_hyp_va));
+ DEFINE(NVHE_INIT_PGD_PA, offsetof(struct kvm_nvhe_init_params, pgd_pa));
#endif
#ifdef CONFIG_CPU_PM
DEFINE(CPU_CTX_SP, offsetof(struct cpu_suspend_ctx, sp));
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index cafaf0da05b7..a63428301f42 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -196,16 +196,6 @@ has_neoverse_n1_erratum_1542419(const struct arm64_cpu_capabilities *entry,
return is_midr_in_range(midr, &range) && has_dic;
}
-#ifdef CONFIG_RANDOMIZE_BASE
-
-static const struct midr_range ca57_a72[] = {
- MIDR_ALL_VERSIONS(MIDR_CORTEX_A57),
- MIDR_ALL_VERSIONS(MIDR_CORTEX_A72),
- {},
-};
-
-#endif
-
#ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI
static const struct arm64_cpu_capabilities arm64_repeat_tlbi_list[] = {
#ifdef CONFIG_QCOM_FALKOR_ERRATUM_1009
@@ -461,9 +451,12 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
},
#ifdef CONFIG_RANDOMIZE_BASE
{
- .desc = "EL2 vector hardening",
- .capability = ARM64_HARDEN_EL2_VECTORS,
- ERRATA_MIDR_RANGE_LIST(ca57_a72),
+ /* Must come after the Spectre-v2 entry */
+ .desc = "Spectre-v3a",
+ .capability = ARM64_SPECTRE_V3A,
+ .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
+ .matches = has_spectre_v3a,
+ .cpu_enable = spectre_v3a_enable_mitigation,
},
#endif
{
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 39138f6d3ba2..d87cfc6246e0 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -74,6 +74,7 @@
#include <asm/cpufeature.h>
#include <asm/cpu_ops.h>
#include <asm/fpsimd.h>
+#include <asm/kvm_host.h>
#include <asm/mmu_context.h>
#include <asm/mte.h>
#include <asm/processor.h>
@@ -1712,6 +1713,21 @@ static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap)
}
#endif /* CONFIG_ARM64_MTE */
+#ifdef CONFIG_KVM
+static bool is_kvm_protected_mode(const struct arm64_cpu_capabilities *entry, int __unused)
+{
+ if (kvm_get_mode() != KVM_MODE_PROTECTED)
+ return false;
+
+ if (is_kernel_in_hyp_mode()) {
+ pr_warn("Protected KVM not available with VHE\n");
+ return false;
+ }
+
+ return true;
+}
+#endif /* CONFIG_KVM */
+
/* Internal helper functions to match cpu capability type */
static bool
cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap)
@@ -1803,6 +1819,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.field_pos = ID_AA64PFR0_EL1_SHIFT,
.min_field_value = ID_AA64PFR0_EL1_32BIT_64BIT,
},
+ {
+ .desc = "Protected KVM",
+ .capability = ARM64_KVM_PROTECTED_MODE,
+ .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+ .matches = is_kvm_protected_mode,
+ },
#endif
{
.desc = "Kernel page table isolation (KPTI)",
@@ -2831,14 +2853,28 @@ static int __init enable_mrs_emulation(void)
core_initcall(enable_mrs_emulation);
+enum mitigation_state arm64_get_meltdown_state(void)
+{
+ if (__meltdown_safe)
+ return SPECTRE_UNAFFECTED;
+
+ if (arm64_kernel_unmapped_at_el0())
+ return SPECTRE_MITIGATED;
+
+ return SPECTRE_VULNERABLE;
+}
+
ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr,
char *buf)
{
- if (__meltdown_safe)
+ switch (arm64_get_meltdown_state()) {
+ case SPECTRE_UNAFFECTED:
return sprintf(buf, "Not affected\n");
- if (arm64_kernel_unmapped_at_el0())
+ case SPECTRE_MITIGATED:
return sprintf(buf, "Mitigation: PTI\n");
- return sprintf(buf, "Vulnerable\n");
+ default:
+ return sprintf(buf, "Vulnerable\n");
+ }
}
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index f2eb206920a2..42b23ce679dc 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -11,7 +11,6 @@
#include <linux/linkage.h>
#include <linux/init.h>
-#include <linux/irqchip/arm-gic-v3.h>
#include <linux/pgtable.h>
#include <asm/asm_pointer_auth.h>
@@ -21,6 +20,7 @@
#include <asm/asm-offsets.h>
#include <asm/cache.h>
#include <asm/cputype.h>
+#include <asm/el2_setup.h>
#include <asm/elf.h>
#include <asm/image.h>
#include <asm/kernel-pgtable.h>
@@ -493,155 +493,56 @@ SYM_INNER_LABEL(init_el1, SYM_L_LOCAL)
eret
SYM_INNER_LABEL(init_el2, SYM_L_LOCAL)
- mov_q x0, INIT_SCTLR_EL2_MMU_OFF
- msr sctlr_el2, x0
-
#ifdef CONFIG_ARM64_VHE
/*
- * Check for VHE being present. For the rest of the EL2 setup,
- * x2 being non-zero indicates that we do have VHE, and that the
- * kernel is intended to run at EL2.
+ * Check for VHE being present. x2 being non-zero indicates that we
+ * do have VHE, and that the kernel is intended to run at EL2.
*/
mrs x2, id_aa64mmfr1_el1
ubfx x2, x2, #ID_AA64MMFR1_VHE_SHIFT, #4
#else
mov x2, xzr
#endif
+ cbz x2, init_el2_nvhe
- /* Hyp configuration. */
- mov_q x0, HCR_HOST_NVHE_FLAGS
- cbz x2, set_hcr
+ /*
+ * When VHE _is_ in use, EL1 will not be used in the host and
+ * requires no configuration, and all non-hyp-specific EL2 setup
+ * will be done via the _EL1 system register aliases in __cpu_setup.
+ */
mov_q x0, HCR_HOST_VHE_FLAGS
-set_hcr:
msr hcr_el2, x0
isb
- /*
- * Allow Non-secure EL1 and EL0 to access physical timer and counter.
- * This is not necessary for VHE, since the host kernel runs in EL2,
- * and EL0 accesses are configured in the later stage of boot process.
- * Note that when HCR_EL2.E2H == 1, CNTHCTL_EL2 has the same bit layout
- * as CNTKCTL_EL1, and CNTKCTL_EL1 accessing instructions are redefined
- * to access CNTHCTL_EL2. This allows the kernel designed to run at EL1
- * to transparently mess with the EL0 bits via CNTKCTL_EL1 access in
- * EL2.
- */
- cbnz x2, 1f
- mrs x0, cnthctl_el2
- orr x0, x0, #3 // Enable EL1 physical timers
- msr cnthctl_el2, x0
-1:
- msr cntvoff_el2, xzr // Clear virtual offset
-
-#ifdef CONFIG_ARM_GIC_V3
- /* GICv3 system register access */
- mrs x0, id_aa64pfr0_el1
- ubfx x0, x0, #ID_AA64PFR0_GIC_SHIFT, #4
- cbz x0, 3f
-
- mrs_s x0, SYS_ICC_SRE_EL2
- orr x0, x0, #ICC_SRE_EL2_SRE // Set ICC_SRE_EL2.SRE==1
- orr x0, x0, #ICC_SRE_EL2_ENABLE // Set ICC_SRE_EL2.Enable==1
- msr_s SYS_ICC_SRE_EL2, x0
- isb // Make sure SRE is now set
- mrs_s x0, SYS_ICC_SRE_EL2 // Read SRE back,
- tbz x0, #0, 3f // and check that it sticks
- msr_s SYS_ICH_HCR_EL2, xzr // Reset ICC_HCR_EL2 to defaults
-
-3:
-#endif
-
- /* Populate ID registers. */
- mrs x0, midr_el1
- mrs x1, mpidr_el1
- msr vpidr_el2, x0
- msr vmpidr_el2, x1
-
-#ifdef CONFIG_COMPAT
- msr hstr_el2, xzr // Disable CP15 traps to EL2
-#endif
-
- /* EL2 debug */
- mrs x1, id_aa64dfr0_el1
- sbfx x0, x1, #ID_AA64DFR0_PMUVER_SHIFT, #4
- cmp x0, #1
- b.lt 4f // Skip if no PMU present
- mrs x0, pmcr_el0 // Disable debug access traps
- ubfx x0, x0, #11, #5 // to EL2 and allow access to
-4:
- csel x3, xzr, x0, lt // all PMU counters from EL1
-
- /* Statistical profiling */
- ubfx x0, x1, #ID_AA64DFR0_PMSVER_SHIFT, #4
- cbz x0, 7f // Skip if SPE not present
- cbnz x2, 6f // VHE?
- mrs_s x4, SYS_PMBIDR_EL1 // If SPE available at EL2,
- and x4, x4, #(1 << SYS_PMBIDR_EL1_P_SHIFT)
- cbnz x4, 5f // then permit sampling of physical
- mov x4, #(1 << SYS_PMSCR_EL2_PCT_SHIFT | \
- 1 << SYS_PMSCR_EL2_PA_SHIFT)
- msr_s SYS_PMSCR_EL2, x4 // addresses and physical counter
-5:
- mov x1, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT)
- orr x3, x3, x1 // If we don't have VHE, then
- b 7f // use EL1&0 translation.
-6: // For VHE, use EL2 translation
- orr x3, x3, #MDCR_EL2_TPMS // and disable access from EL1
-7:
- msr mdcr_el2, x3 // Configure debug traps
-
- /* LORegions */
- mrs x1, id_aa64mmfr1_el1
- ubfx x0, x1, #ID_AA64MMFR1_LOR_SHIFT, 4
- cbz x0, 1f
- msr_s SYS_LORC_EL1, xzr
-1:
-
- /* Stage-2 translation */
- msr vttbr_el2, xzr
-
- cbz x2, install_el2_stub
+ init_el2_state vhe
isb
+
mov_q x0, INIT_PSTATE_EL2
msr spsr_el2, x0
msr elr_el2, lr
mov w0, #BOOT_CPU_MODE_EL2
eret
-SYM_INNER_LABEL(install_el2_stub, SYM_L_LOCAL)
+SYM_INNER_LABEL(init_el2_nvhe, SYM_L_LOCAL)
/*
* When VHE is not in use, early init of EL2 and EL1 needs to be
* done here.
- * When VHE _is_ in use, EL1 will not be used in the host and
- * requires no configuration, and all non-hyp-specific EL2 setup
- * will be done via the _EL1 system register aliases in __cpu_setup.
*/
mov_q x0, INIT_SCTLR_EL1_MMU_OFF
msr sctlr_el1, x0
- /* Coprocessor traps. */
- mov x0, #0x33ff
- msr cptr_el2, x0 // Disable copro. traps to EL2
-
- /* SVE register access */
- mrs x1, id_aa64pfr0_el1
- ubfx x1, x1, #ID_AA64PFR0_SVE_SHIFT, #4
- cbz x1, 7f
-
- bic x0, x0, #CPTR_EL2_TZ // Also disable SVE traps
- msr cptr_el2, x0 // Disable copro. traps to EL2
+ mov_q x0, HCR_HOST_NVHE_FLAGS
+ msr hcr_el2, x0
isb
- mov x1, #ZCR_ELx_LEN_MASK // SVE: Enable full vector
- msr_s SYS_ZCR_EL2, x1 // length for EL1.
+
+ init_el2_state nvhe
/* Hypervisor stub */
-7: adr_l x0, __hyp_stub_vectors
+ adr_l x0, __hyp_stub_vectors
msr vbar_el2, x0
-
isb
- mov x0, #INIT_PSTATE_EL1
- msr spsr_el2, x0
+
msr elr_el2, lr
mov w0, #BOOT_CPU_MODE_EL2
eret
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index c615b285ff5b..39289d75118d 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -64,13 +64,12 @@ __efistub__ctype = _ctype;
/* Alternative callbacks for init-time patching of nVHE hyp code. */
KVM_NVHE_ALIAS(kvm_patch_vector_branch);
KVM_NVHE_ALIAS(kvm_update_va_mask);
+KVM_NVHE_ALIAS(kvm_update_kimg_phys_offset);
+KVM_NVHE_ALIAS(kvm_get_kimage_voffset);
/* Global kernel state accessed by nVHE hyp code. */
KVM_NVHE_ALIAS(kvm_vgic_global_state);
-/* Kernel constant needed to compute idmap addresses. */
-KVM_NVHE_ALIAS(kimage_voffset);
-
/* Kernel symbols used to call panic() from nVHE hyp code (via ERET). */
KVM_NVHE_ALIAS(__hyp_panic_string);
KVM_NVHE_ALIAS(panic);
@@ -78,9 +77,6 @@ KVM_NVHE_ALIAS(panic);
/* Vectors installed by hyp-init on reset HVC. */
KVM_NVHE_ALIAS(__hyp_stub_vectors);
-/* IDMAP TCR_EL1.T0SZ as computed by the EL1 init code */
-KVM_NVHE_ALIAS(idmap_t0sz);
-
/* Kernel symbol used by icache_is_vpipt(). */
KVM_NVHE_ALIAS(__icache_flags);
@@ -103,6 +99,9 @@ KVM_NVHE_ALIAS(gic_nonsecure_priorities);
KVM_NVHE_ALIAS(__start___kvm_ex_table);
KVM_NVHE_ALIAS(__stop___kvm_ex_table);
+/* Array containing bases of nVHE per-CPU memory regions. */
+KVM_NVHE_ALIAS(kvm_arm_hyp_percpu_base);
+
#endif /* CONFIG_KVM */
#endif /* __ARM64_KERNEL_IMAGE_VARS_H */
diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c
index 4c25c008504f..902e4084c477 100644
--- a/arch/arm64/kernel/proton-pack.c
+++ b/arch/arm64/kernel/proton-pack.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * Handle detection, reporting and mitigation of Spectre v1, v2 and v4, as
+ * Handle detection, reporting and mitigation of Spectre v1, v2, v3a and v4, as
* detailed at:
*
* https://developer.arm.com/support/arm-security-updates/speculative-processor-vulnerability
@@ -27,6 +27,7 @@
#include <asm/insn.h>
#include <asm/spectre.h>
#include <asm/traps.h>
+#include <asm/virt.h>
/*
* We try to ensure that the mitigation state can never change as the result of
@@ -171,72 +172,26 @@ bool has_spectre_v2(const struct arm64_cpu_capabilities *entry, int scope)
return true;
}
-DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
-
enum mitigation_state arm64_get_spectre_v2_state(void)
{
return spectre_v2_state;
}
-#ifdef CONFIG_KVM
-#include <asm/cacheflush.h>
-#include <asm/kvm_asm.h>
-
-atomic_t arm64_el2_vector_last_slot = ATOMIC_INIT(-1);
-
-static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start,
- const char *hyp_vecs_end)
-{
- void *dst = lm_alias(__bp_harden_hyp_vecs + slot * SZ_2K);
- int i;
-
- for (i = 0; i < SZ_2K; i += 0x80)
- memcpy(dst + i, hyp_vecs_start, hyp_vecs_end - hyp_vecs_start);
-
- __flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K);
-}
+DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
static void install_bp_hardening_cb(bp_hardening_cb_t fn)
{
- static DEFINE_RAW_SPINLOCK(bp_lock);
- int cpu, slot = -1;
- const char *hyp_vecs_start = __smccc_workaround_1_smc;
- const char *hyp_vecs_end = __smccc_workaround_1_smc +
- __SMCCC_WORKAROUND_1_SMC_SZ;
+ __this_cpu_write(bp_hardening_data.fn, fn);
/*
* Vinz Clortho takes the hyp_vecs start/end "keys" at
* the door when we're a guest. Skip the hyp-vectors work.
*/
- if (!is_hyp_mode_available()) {
- __this_cpu_write(bp_hardening_data.fn, fn);
+ if (!is_hyp_mode_available())
return;
- }
-
- raw_spin_lock(&bp_lock);
- for_each_possible_cpu(cpu) {
- if (per_cpu(bp_hardening_data.fn, cpu) == fn) {
- slot = per_cpu(bp_hardening_data.hyp_vectors_slot, cpu);
- break;
- }
- }
-
- if (slot == -1) {
- slot = atomic_inc_return(&arm64_el2_vector_last_slot);
- BUG_ON(slot >= BP_HARDEN_EL2_SLOTS);
- __copy_hyp_vect_bpi(slot, hyp_vecs_start, hyp_vecs_end);
- }
- __this_cpu_write(bp_hardening_data.hyp_vectors_slot, slot);
- __this_cpu_write(bp_hardening_data.fn, fn);
- raw_spin_unlock(&bp_lock);
-}
-#else
-static void install_bp_hardening_cb(bp_hardening_cb_t fn)
-{
- __this_cpu_write(bp_hardening_data.fn, fn);
+ __this_cpu_write(bp_hardening_data.slot, HYP_VECTOR_SPECTRE_DIRECT);
}
-#endif /* CONFIG_KVM */
static void call_smc_arch_workaround_1(void)
{
@@ -318,6 +273,33 @@ void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused)
}
/*
+ * Spectre-v3a.
+ *
+ * Phew, there's not an awful lot to do here! We just instruct EL2 to use
+ * an indirect trampoline for the hyp vectors so that guests can't read
+ * VBAR_EL2 to defeat randomisation of the hypervisor VA layout.
+ */
+bool has_spectre_v3a(const struct arm64_cpu_capabilities *entry, int scope)
+{
+ static const struct midr_range spectre_v3a_unsafe_list[] = {
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A57),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A72),
+ {},
+ };
+
+ WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
+ return is_midr_in_range_list(read_cpuid_id(), spectre_v3a_unsafe_list);
+}
+
+void spectre_v3a_enable_mitigation(const struct arm64_cpu_capabilities *__unused)
+{
+ struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data);
+
+ if (this_cpu_has_cap(ARM64_SPECTRE_V3A))
+ data->slot += HYP_VECTOR_INDIRECT;
+}
+
+/*
* Spectre v4.
*
* If you thought Spectre v2 was nasty, wait until you see this mess. A CPU is
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 1a57a76e1cc2..c44eb4b80163 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -276,7 +276,7 @@ arch_initcall(reserve_memblock_reserved_regions);
u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID };
-u64 cpu_logical_map(int cpu)
+u64 cpu_logical_map(unsigned int cpu)
{
return __cpu_logical_map[cpu];
}
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 5d5857c5b025..4c0b0c89ad59 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -30,6 +30,13 @@ jiffies = jiffies_64;
*(__kvm_ex_table) \
__stop___kvm_ex_table = .;
+#define HYPERVISOR_DATA_SECTIONS \
+ HYP_SECTION_NAME(.data..ro_after_init) : { \
+ __hyp_data_ro_after_init_start = .; \
+ *(HYP_SECTION_NAME(.data..ro_after_init)) \
+ __hyp_data_ro_after_init_end = .; \
+ }
+
#define HYPERVISOR_PERCPU_SECTION \
. = ALIGN(PAGE_SIZE); \
HYP_SECTION_NAME(.data..percpu) : { \
@@ -37,6 +44,7 @@ jiffies = jiffies_64;
}
#else /* CONFIG_KVM */
#define HYPERVISOR_EXTABLE
+#define HYPERVISOR_DATA_SECTIONS
#define HYPERVISOR_PERCPU_SECTION
#endif
@@ -232,6 +240,8 @@ SECTIONS
_sdata = .;
RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN)
+ HYPERVISOR_DATA_SECTIONS
+
/*
* Data written with the MMU off but read with the MMU on requires
* cache lines to be invalidated, discarding up to a Cache Writeback
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 1504c81fbf5d..60fd181df624 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -13,10 +13,10 @@ obj-$(CONFIG_KVM) += hyp/
kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \
$(KVM)/vfio.o $(KVM)/irqchip.o \
arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \
- inject_fault.o regmap.o va_layout.o handle_exit.o \
+ inject_fault.o va_layout.o handle_exit.o \
guest.o debug.o reset.o sys_regs.o \
vgic-sys-reg-v3.o fpsimd.o pmu.o \
- aarch32.o arch_timer.o \
+ arch_timer.o \
vgic/vgic.o vgic/vgic-init.o \
vgic/vgic-irqfd.o vgic/vgic-v2.o \
vgic/vgic-v3.o vgic/vgic-v4.o \
diff --git a/arch/arm64/kvm/aarch32.c b/arch/arm64/kvm/aarch32.c
deleted file mode 100644
index 40a62a99fbf8..000000000000
--- a/arch/arm64/kvm/aarch32.c
+++ /dev/null
@@ -1,232 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * (not much of an) Emulation layer for 32bit guests.
- *
- * Copyright (C) 2012,2013 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * based on arch/arm/kvm/emulate.c
- * Copyright (C) 2012 - Virtual Open Systems and Columbia University
- * Author: Christoffer Dall <c.dall@virtualopensystems.com>
- */
-
-#include <linux/bits.h>
-#include <linux/kvm_host.h>
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_hyp.h>
-
-#define DFSR_FSC_EXTABT_LPAE 0x10
-#define DFSR_FSC_EXTABT_nLPAE 0x08
-#define DFSR_LPAE BIT(9)
-
-/*
- * Table taken from ARMv8 ARM DDI0487B-B, table G1-10.
- */
-static const u8 return_offsets[8][2] = {
- [0] = { 0, 0 }, /* Reset, unused */
- [1] = { 4, 2 }, /* Undefined */
- [2] = { 0, 0 }, /* SVC, unused */
- [3] = { 4, 4 }, /* Prefetch abort */
- [4] = { 8, 8 }, /* Data abort */
- [5] = { 0, 0 }, /* HVC, unused */
- [6] = { 4, 4 }, /* IRQ, unused */
- [7] = { 4, 4 }, /* FIQ, unused */
-};
-
-static bool pre_fault_synchronize(struct kvm_vcpu *vcpu)
-{
- preempt_disable();
- if (vcpu->arch.sysregs_loaded_on_cpu) {
- kvm_arch_vcpu_put(vcpu);
- return true;
- }
-
- preempt_enable();
- return false;
-}
-
-static void post_fault_synchronize(struct kvm_vcpu *vcpu, bool loaded)
-{
- if (loaded) {
- kvm_arch_vcpu_load(vcpu, smp_processor_id());
- preempt_enable();
- }
-}
-
-/*
- * When an exception is taken, most CPSR fields are left unchanged in the
- * handler. However, some are explicitly overridden (e.g. M[4:0]).
- *
- * The SPSR/SPSR_ELx layouts differ, and the below is intended to work with
- * either format. Note: SPSR.J bit doesn't exist in SPSR_ELx, but this bit was
- * obsoleted by the ARMv7 virtualization extensions and is RES0.
- *
- * For the SPSR layout seen from AArch32, see:
- * - ARM DDI 0406C.d, page B1-1148
- * - ARM DDI 0487E.a, page G8-6264
- *
- * For the SPSR_ELx layout for AArch32 seen from AArch64, see:
- * - ARM DDI 0487E.a, page C5-426
- *
- * Here we manipulate the fields in order of the AArch32 SPSR_ELx layout, from
- * MSB to LSB.
- */
-static unsigned long get_except32_cpsr(struct kvm_vcpu *vcpu, u32 mode)
-{
- u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR);
- unsigned long old, new;
-
- old = *vcpu_cpsr(vcpu);
- new = 0;
-
- new |= (old & PSR_AA32_N_BIT);
- new |= (old & PSR_AA32_Z_BIT);
- new |= (old & PSR_AA32_C_BIT);
- new |= (old & PSR_AA32_V_BIT);
- new |= (old & PSR_AA32_Q_BIT);
-
- // CPSR.IT[7:0] are set to zero upon any exception
- // See ARM DDI 0487E.a, section G1.12.3
- // See ARM DDI 0406C.d, section B1.8.3
-
- new |= (old & PSR_AA32_DIT_BIT);
-
- // CPSR.SSBS is set to SCTLR.DSSBS upon any exception
- // See ARM DDI 0487E.a, page G8-6244
- if (sctlr & BIT(31))
- new |= PSR_AA32_SSBS_BIT;
-
- // CPSR.PAN is unchanged unless SCTLR.SPAN == 0b0
- // SCTLR.SPAN is RES1 when ARMv8.1-PAN is not implemented
- // See ARM DDI 0487E.a, page G8-6246
- new |= (old & PSR_AA32_PAN_BIT);
- if (!(sctlr & BIT(23)))
- new |= PSR_AA32_PAN_BIT;
-
- // SS does not exist in AArch32, so ignore
-
- // CPSR.IL is set to zero upon any exception
- // See ARM DDI 0487E.a, page G1-5527
-
- new |= (old & PSR_AA32_GE_MASK);
-
- // CPSR.IT[7:0] are set to zero upon any exception
- // See prior comment above
-
- // CPSR.E is set to SCTLR.EE upon any exception
- // See ARM DDI 0487E.a, page G8-6245
- // See ARM DDI 0406C.d, page B4-1701
- if (sctlr & BIT(25))
- new |= PSR_AA32_E_BIT;
-
- // CPSR.A is unchanged upon an exception to Undefined, Supervisor
- // CPSR.A is set upon an exception to other modes
- // See ARM DDI 0487E.a, pages G1-5515 to G1-5516
- // See ARM DDI 0406C.d, page B1-1182
- new |= (old & PSR_AA32_A_BIT);
- if (mode != PSR_AA32_MODE_UND && mode != PSR_AA32_MODE_SVC)
- new |= PSR_AA32_A_BIT;
-
- // CPSR.I is set upon any exception
- // See ARM DDI 0487E.a, pages G1-5515 to G1-5516
- // See ARM DDI 0406C.d, page B1-1182
- new |= PSR_AA32_I_BIT;
-
- // CPSR.F is set upon an exception to FIQ
- // CPSR.F is unchanged upon an exception to other modes
- // See ARM DDI 0487E.a, pages G1-5515 to G1-5516
- // See ARM DDI 0406C.d, page B1-1182
- new |= (old & PSR_AA32_F_BIT);
- if (mode == PSR_AA32_MODE_FIQ)
- new |= PSR_AA32_F_BIT;
-
- // CPSR.T is set to SCTLR.TE upon any exception
- // See ARM DDI 0487E.a, page G8-5514
- // See ARM DDI 0406C.d, page B1-1181
- if (sctlr & BIT(30))
- new |= PSR_AA32_T_BIT;
-
- new |= mode;
-
- return new;
-}
-
-static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset)
-{
- unsigned long spsr = *vcpu_cpsr(vcpu);
- bool is_thumb = (spsr & PSR_AA32_T_BIT);
- u32 return_offset = return_offsets[vect_offset >> 2][is_thumb];
- u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR);
-
- *vcpu_cpsr(vcpu) = get_except32_cpsr(vcpu, mode);
-
- /* Note: These now point to the banked copies */
- vcpu_write_spsr(vcpu, host_spsr_to_spsr32(spsr));
- *vcpu_reg32(vcpu, 14) = *vcpu_pc(vcpu) + return_offset;
-
- /* Branch to exception vector */
- if (sctlr & (1 << 13))
- vect_offset += 0xffff0000;
- else /* always have security exceptions */
- vect_offset += vcpu_cp15(vcpu, c12_VBAR);
-
- *vcpu_pc(vcpu) = vect_offset;
-}
-
-void kvm_inject_undef32(struct kvm_vcpu *vcpu)
-{
- bool loaded = pre_fault_synchronize(vcpu);
-
- prepare_fault32(vcpu, PSR_AA32_MODE_UND, 4);
- post_fault_synchronize(vcpu, loaded);
-}
-
-/*
- * Modelled after TakeDataAbortException() and TakePrefetchAbortException
- * pseudocode.
- */
-static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt,
- unsigned long addr)
-{
- u32 vect_offset;
- u32 *far, *fsr;
- bool is_lpae;
- bool loaded;
-
- loaded = pre_fault_synchronize(vcpu);
-
- if (is_pabt) {
- vect_offset = 12;
- far = &vcpu_cp15(vcpu, c6_IFAR);
- fsr = &vcpu_cp15(vcpu, c5_IFSR);
- } else { /* !iabt */
- vect_offset = 16;
- far = &vcpu_cp15(vcpu, c6_DFAR);
- fsr = &vcpu_cp15(vcpu, c5_DFSR);
- }
-
- prepare_fault32(vcpu, PSR_AA32_MODE_ABT, vect_offset);
-
- *far = addr;
-
- /* Give the guest an IMPLEMENTATION DEFINED exception */
- is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31);
- if (is_lpae) {
- *fsr = DFSR_LPAE | DFSR_FSC_EXTABT_LPAE;
- } else {
- /* no need to shuffle FS[4] into DFSR[10] as its 0 */
- *fsr = DFSR_FSC_EXTABT_nLPAE;
- }
-
- post_fault_synchronize(vcpu, loaded);
-}
-
-void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr)
-{
- inject_abt32(vcpu, false, addr);
-}
-
-void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr)
-{
- inject_abt32(vcpu, true, addr);
-}
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index c0ffb019ca8b..6e637d2b4cfb 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -19,6 +19,7 @@
#include <linux/kvm_irqfd.h>
#include <linux/irqbypass.h>
#include <linux/sched/stat.h>
+#include <linux/psci.h>
#include <trace/events/kvm.h>
#define CREATE_TRACE_POINTS
@@ -35,7 +36,6 @@
#include <asm/kvm_asm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_emulate.h>
-#include <asm/kvm_coproc.h>
#include <asm/sections.h>
#include <kvm/arm_hypercalls.h>
@@ -46,10 +46,14 @@
__asm__(".arch_extension virt");
#endif
+static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT;
+DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized);
+
DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
+DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
/* The VMID used in the VTTBR */
static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
@@ -61,6 +65,10 @@ static bool vgic_present;
static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
+extern u64 kvm_nvhe_sym(__cpu_logical_map)[NR_CPUS];
+extern u32 kvm_nvhe_sym(kvm_host_psci_version);
+extern struct psci_0_1_function_ids kvm_nvhe_sym(kvm_host_psci_0_1_function_ids);
+
int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
{
return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
@@ -102,7 +110,7 @@ static int kvm_arm_default_max_vcpus(void)
return vgic_present ? kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS;
}
-static void set_default_csv2(struct kvm *kvm)
+static void set_default_spectre(struct kvm *kvm)
{
/*
* The default is to expose CSV2 == 1 if the HW isn't affected.
@@ -114,6 +122,8 @@ static void set_default_csv2(struct kvm *kvm)
*/
if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED)
kvm->arch.pfr0_csv2 = 1;
+ if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED)
+ kvm->arch.pfr0_csv3 = 1;
}
/**
@@ -141,7 +151,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
/* The maximum number of VCPUs is limited by the host's GIC model */
kvm->arch.max_vcpus = kvm_arm_default_max_vcpus();
- set_default_csv2(kvm);
+ set_default_spectre(kvm);
return ret;
out_free_stage2_pgd:
@@ -198,6 +208,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2:
case KVM_CAP_ARM_NISV_TO_USER:
case KVM_CAP_ARM_INJECT_EXT_DABT:
+ case KVM_CAP_SET_GUEST_DEBUG:
+ case KVM_CAP_VCPU_ATTRIBUTES:
r = 1;
break;
case KVM_CAP_ARM_SET_DEVICE_ADDR:
@@ -229,10 +241,35 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_STEAL_TIME:
r = kvm_arm_pvtime_supported();
break;
- default:
- r = kvm_arch_vm_ioctl_check_extension(kvm, ext);
+ case KVM_CAP_ARM_EL1_32BIT:
+ r = cpus_have_const_cap(ARM64_HAS_32BIT_EL1);
+ break;
+ case KVM_CAP_GUEST_DEBUG_HW_BPS:
+ r = get_num_brps();
+ break;
+ case KVM_CAP_GUEST_DEBUG_HW_WPS:
+ r = get_num_wrps();
+ break;
+ case KVM_CAP_ARM_PMU_V3:
+ r = kvm_arm_support_pmu_v3();
+ break;
+ case KVM_CAP_ARM_INJECT_SERROR_ESR:
+ r = cpus_have_const_cap(ARM64_HAS_RAS_EXTN);
break;
+ case KVM_CAP_ARM_VM_IPA_SIZE:
+ r = get_kvm_ipa_limit();
+ break;
+ case KVM_CAP_ARM_SVE:
+ r = system_supports_sve();
+ break;
+ case KVM_CAP_ARM_PTRAUTH_ADDRESS:
+ case KVM_CAP_ARM_PTRAUTH_GENERIC:
+ r = system_has_full_ptr_auth();
+ break;
+ default:
+ r = 0;
}
+
return r;
}
@@ -1311,47 +1348,52 @@ static unsigned long nvhe_percpu_order(void)
return size ? get_order(size) : 0;
}
-static int kvm_map_vectors(void)
+/* A lookup table holding the hypervisor VA for each vector slot */
+static void *hyp_spectre_vector_selector[BP_HARDEN_EL2_SLOTS];
+
+static int __kvm_vector_slot2idx(enum arm64_hyp_spectre_vector slot)
{
- /*
- * SV2 = ARM64_SPECTRE_V2
- * HEL2 = ARM64_HARDEN_EL2_VECTORS
- *
- * !SV2 + !HEL2 -> use direct vectors
- * SV2 + !HEL2 -> use hardened vectors in place
- * !SV2 + HEL2 -> allocate one vector slot and use exec mapping
- * SV2 + HEL2 -> use hardened vectors and use exec mapping
- */
- if (cpus_have_const_cap(ARM64_SPECTRE_V2)) {
- __kvm_bp_vect_base = kvm_ksym_ref(__bp_harden_hyp_vecs);
- __kvm_bp_vect_base = kern_hyp_va(__kvm_bp_vect_base);
- }
+ return slot - (slot != HYP_VECTOR_DIRECT);
+}
- if (cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS)) {
- phys_addr_t vect_pa = __pa_symbol(__bp_harden_hyp_vecs);
- unsigned long size = __BP_HARDEN_HYP_VECS_SZ;
+static void kvm_init_vector_slot(void *base, enum arm64_hyp_spectre_vector slot)
+{
+ int idx = __kvm_vector_slot2idx(slot);
- /*
- * Always allocate a spare vector slot, as we don't
- * know yet which CPUs have a BP hardening slot that
- * we can reuse.
- */
- __kvm_harden_el2_vector_slot = atomic_inc_return(&arm64_el2_vector_last_slot);
- BUG_ON(__kvm_harden_el2_vector_slot >= BP_HARDEN_EL2_SLOTS);
- return create_hyp_exec_mappings(vect_pa, size,
- &__kvm_bp_vect_base);
+ hyp_spectre_vector_selector[slot] = base + (idx * SZ_2K);
+}
+
+static int kvm_init_vector_slots(void)
+{
+ int err;
+ void *base;
+
+ base = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector));
+ kvm_init_vector_slot(base, HYP_VECTOR_DIRECT);
+
+ base = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs));
+ kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_DIRECT);
+
+ if (!cpus_have_const_cap(ARM64_SPECTRE_V3A))
+ return 0;
+
+ if (!has_vhe()) {
+ err = create_hyp_exec_mappings(__pa_symbol(__bp_harden_hyp_vecs),
+ __BP_HARDEN_HYP_VECS_SZ, &base);
+ if (err)
+ return err;
}
+ kvm_init_vector_slot(base, HYP_VECTOR_INDIRECT);
+ kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_INDIRECT);
return 0;
}
static void cpu_init_hyp_mode(void)
{
- phys_addr_t pgd_ptr;
- unsigned long hyp_stack_ptr;
- unsigned long vector_ptr;
- unsigned long tpidr_el2;
+ struct kvm_nvhe_init_params *params = this_cpu_ptr_nvhe_sym(kvm_init_params);
struct arm_smccc_res res;
+ unsigned long tcr;
/* Switch from the HYP stub to our own HYP init vector */
__hyp_set_vectors(kvm_get_idmap_vector());
@@ -1361,13 +1403,38 @@ static void cpu_init_hyp_mode(void)
* kernel's mapping to the linear mapping, and store it in tpidr_el2
* so that we can use adr_l to access per-cpu variables in EL2.
*/
- tpidr_el2 = (unsigned long)this_cpu_ptr_nvhe_sym(__per_cpu_start) -
- (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start));
+ params->tpidr_el2 = (unsigned long)this_cpu_ptr_nvhe_sym(__per_cpu_start) -
+ (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start));
+
+ params->mair_el2 = read_sysreg(mair_el1);
+
+ /*
+ * The ID map may be configured to use an extended virtual address
+ * range. This is only the case if system RAM is out of range for the
+ * currently configured page size and VA_BITS, in which case we will
+ * also need the extended virtual range for the HYP ID map, or we won't
+ * be able to enable the EL2 MMU.
+ *
+ * However, at EL2, there is only one TTBR register, and we can't switch
+ * between translation tables *and* update TCR_EL2.T0SZ at the same
+ * time. Bottom line: we need to use the extended range with *both* our
+ * translation tables.
+ *
+ * So use the same T0SZ value we use for the ID map.
+ */
+ tcr = (read_sysreg(tcr_el1) & TCR_EL2_MASK) | TCR_EL2_RES1;
+ tcr &= ~TCR_T0SZ_MASK;
+ tcr |= (idmap_t0sz & GENMASK(TCR_TxSZ_WIDTH - 1, 0)) << TCR_T0SZ_OFFSET;
+ params->tcr_el2 = tcr;
+
+ params->stack_hyp_va = kern_hyp_va(__this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE);
+ params->pgd_pa = kvm_mmu_get_httbr();
- pgd_ptr = kvm_mmu_get_httbr();
- hyp_stack_ptr = __this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE;
- hyp_stack_ptr = kern_hyp_va(hyp_stack_ptr);
- vector_ptr = (unsigned long)kern_hyp_va(kvm_ksym_ref(__kvm_hyp_host_vector));
+ /*
+ * Flush the init params from the data cache because the struct will
+ * be read while the MMU is off.
+ */
+ kvm_flush_dcache_to_poc(params, sizeof(*params));
/*
* Call initialization code, and switch to the full blown HYP code.
@@ -1376,8 +1443,7 @@ static void cpu_init_hyp_mode(void)
* cpus_have_const_cap() wrapper.
*/
BUG_ON(!system_capabilities_finalized());
- arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init),
- pgd_ptr, tpidr_el2, hyp_stack_ptr, vector_ptr, &res);
+ arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init), virt_to_phys(params), &res);
WARN_ON(res.a0 != SMCCC_RET_SUCCESS);
/*
@@ -1396,13 +1462,40 @@ static void cpu_hyp_reset(void)
__hyp_reset_vectors();
}
+/*
+ * EL2 vectors can be mapped and rerouted in a number of ways,
+ * depending on the kernel configuration and CPU present:
+ *
+ * - If the CPU is affected by Spectre-v2, the hardening sequence is
+ * placed in one of the vector slots, which is executed before jumping
+ * to the real vectors.
+ *
+ * - If the CPU also has the ARM64_SPECTRE_V3A cap, the slot
+ * containing the hardening sequence is mapped next to the idmap page,
+ * and executed before jumping to the real vectors.
+ *
+ * - If the CPU only has the ARM64_SPECTRE_V3A cap, then an
+ * empty slot is selected, mapped next to the idmap page, and
+ * executed before jumping to the real vectors.
+ *
+ * Note that ARM64_SPECTRE_V3A is somewhat incompatible with
+ * VHE, as we don't have hypervisor-specific mappings. If the system
+ * is VHE and yet selects this capability, it will be ignored.
+ */
+static void cpu_set_hyp_vector(void)
+{
+ struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data);
+ void *vector = hyp_spectre_vector_selector[data->slot];
+
+ *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vector;
+}
+
static void cpu_hyp_reinit(void)
{
kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt);
cpu_hyp_reset();
-
- *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)kvm_get_hyp_vector();
+ cpu_set_hyp_vector();
if (is_kernel_in_hyp_mode())
kvm_timer_init_vhe();
@@ -1439,7 +1532,8 @@ static void _kvm_arch_hardware_disable(void *discard)
void kvm_arch_hardware_disable(void)
{
- _kvm_arch_hardware_disable(NULL);
+ if (!is_protected_kvm_enabled())
+ _kvm_arch_hardware_disable(NULL);
}
#ifdef CONFIG_CPU_PM
@@ -1482,11 +1576,13 @@ static struct notifier_block hyp_init_cpu_pm_nb = {
static void __init hyp_cpu_pm_init(void)
{
- cpu_pm_register_notifier(&hyp_init_cpu_pm_nb);
+ if (!is_protected_kvm_enabled())
+ cpu_pm_register_notifier(&hyp_init_cpu_pm_nb);
}
static void __init hyp_cpu_pm_exit(void)
{
- cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb);
+ if (!is_protected_kvm_enabled())
+ cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb);
}
#else
static inline void hyp_cpu_pm_init(void)
@@ -1497,6 +1593,36 @@ static inline void hyp_cpu_pm_exit(void)
}
#endif
+static void init_cpu_logical_map(void)
+{
+ unsigned int cpu;
+
+ /*
+ * Copy the MPIDR <-> logical CPU ID mapping to hyp.
+ * Only copy the set of online CPUs whose features have been chacked
+ * against the finalized system capabilities. The hypervisor will not
+ * allow any other CPUs from the `possible` set to boot.
+ */
+ for_each_online_cpu(cpu)
+ kvm_nvhe_sym(__cpu_logical_map)[cpu] = cpu_logical_map(cpu);
+}
+
+static bool init_psci_relay(void)
+{
+ /*
+ * If PSCI has not been initialized, protected KVM cannot install
+ * itself on newly booted CPUs.
+ */
+ if (!psci_ops.get_version) {
+ kvm_err("Cannot initialize protected mode without PSCI\n");
+ return false;
+ }
+
+ kvm_nvhe_sym(kvm_host_psci_version) = psci_ops.get_version();
+ kvm_nvhe_sym(kvm_host_psci_0_1_function_ids) = get_psci_0_1_function_ids();
+ return true;
+}
+
static int init_common_resources(void)
{
return kvm_set_ipa_limit();
@@ -1541,10 +1667,11 @@ static int init_subsystems(void)
goto out;
kvm_perf_init();
- kvm_coproc_table_init();
+ kvm_sys_reg_table_init();
out:
- on_each_cpu(_kvm_arch_hardware_disable, NULL, 1);
+ if (err || !is_protected_kvm_enabled())
+ on_each_cpu(_kvm_arch_hardware_disable, NULL, 1);
return err;
}
@@ -1618,6 +1745,14 @@ static int init_hyp_mode(void)
goto out_err;
}
+ err = create_hyp_mappings(kvm_ksym_ref(__hyp_data_ro_after_init_start),
+ kvm_ksym_ref(__hyp_data_ro_after_init_end),
+ PAGE_HYP_RO);
+ if (err) {
+ kvm_err("Cannot map .hyp.data..ro_after_init section\n");
+ goto out_err;
+ }
+
err = create_hyp_mappings(kvm_ksym_ref(__start_rodata),
kvm_ksym_ref(__end_rodata), PAGE_HYP_RO);
if (err) {
@@ -1632,12 +1767,6 @@ static int init_hyp_mode(void)
goto out_err;
}
- err = kvm_map_vectors();
- if (err) {
- kvm_err("Cannot map vectors\n");
- goto out_err;
- }
-
/*
* Map the Hyp stack pages
*/
@@ -1667,6 +1796,13 @@ static int init_hyp_mode(void)
}
}
+ if (is_protected_kvm_enabled()) {
+ init_cpu_logical_map();
+
+ if (!init_psci_relay())
+ goto out_err;
+ }
+
return 0;
out_err:
@@ -1781,14 +1917,24 @@ int kvm_arch_init(void *opaque)
goto out_err;
}
+ err = kvm_init_vector_slots();
+ if (err) {
+ kvm_err("Cannot initialise vector slots\n");
+ goto out_err;
+ }
+
err = init_subsystems();
if (err)
goto out_hyp;
- if (in_hyp_mode)
+ if (is_protected_kvm_enabled()) {
+ static_branch_enable(&kvm_protected_mode_initialized);
+ kvm_info("Protected nVHE mode initialized successfully\n");
+ } else if (in_hyp_mode) {
kvm_info("VHE mode initialized successfully\n");
- else
+ } else {
kvm_info("Hyp mode initialized successfully\n");
+ }
return 0;
@@ -1806,6 +1952,25 @@ void kvm_arch_exit(void)
kvm_perf_teardown();
}
+static int __init early_kvm_mode_cfg(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (strcmp(arg, "protected") == 0) {
+ kvm_mode = KVM_MODE_PROTECTED;
+ return 0;
+ }
+
+ return -EINVAL;
+}
+early_param("kvm-arm.mode", early_kvm_mode_cfg);
+
+enum kvm_mode kvm_get_mode(void)
+{
+ return kvm_mode;
+}
+
static int arm_init(void)
{
int rc = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index dfb5218137ca..9bbd30e62799 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -24,7 +24,6 @@
#include <asm/fpsimd.h>
#include <asm/kvm.h>
#include <asm/kvm_emulate.h>
-#include <asm/kvm_coproc.h>
#include <asm/sigcontext.h>
#include "trace.h"
@@ -252,10 +251,32 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
memcpy(addr, valp, KVM_REG_SIZE(reg->id));
if (*vcpu_cpsr(vcpu) & PSR_MODE32_BIT) {
- int i;
+ int i, nr_reg;
+
+ switch (*vcpu_cpsr(vcpu)) {
+ /*
+ * Either we are dealing with user mode, and only the
+ * first 15 registers (+ PC) must be narrowed to 32bit.
+ * AArch32 r0-r14 conveniently map to AArch64 x0-x14.
+ */
+ case PSR_AA32_MODE_USR:
+ case PSR_AA32_MODE_SYS:
+ nr_reg = 15;
+ break;
+
+ /*
+ * Otherwide, this is a priviledged mode, and *all* the
+ * registers must be narrowed to 32bit.
+ */
+ default:
+ nr_reg = 31;
+ break;
+ }
+
+ for (i = 0; i < nr_reg; i++)
+ vcpu_set_reg(vcpu, i, (u32)vcpu_get_reg(vcpu, i));
- for (i = 0; i < 16; i++)
- *vcpu_reg32(vcpu, i) = (u32)*vcpu_reg32(vcpu, i);
+ *vcpu_pc(vcpu) = (u32)*vcpu_pc(vcpu);
}
out:
return err;
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index 5d690d60ccad..cebe39f3b1b6 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -14,7 +14,6 @@
#include <asm/esr.h>
#include <asm/exception.h>
#include <asm/kvm_asm.h>
-#include <asm/kvm_coproc.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_mmu.h>
#include <asm/debug-monitors.h>
@@ -61,7 +60,7 @@ static int handle_smc(struct kvm_vcpu *vcpu)
* otherwise return to the same address...
*/
vcpu_set_reg(vcpu, 0, ~0UL);
- kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
+ kvm_incr_pc(vcpu);
return 1;
}
@@ -100,7 +99,7 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu)
kvm_clear_request(KVM_REQ_UNHALT, vcpu);
}
- kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
+ kvm_incr_pc(vcpu);
return 1;
}
@@ -221,7 +220,7 @@ static int handle_trap_exceptions(struct kvm_vcpu *vcpu)
* that fail their condition code check"
*/
if (!kvm_condition_valid(vcpu)) {
- kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
+ kvm_incr_pc(vcpu);
handled = 1;
} else {
exit_handle_fn exit_handler;
@@ -241,23 +240,6 @@ int handle_exit(struct kvm_vcpu *vcpu, int exception_index)
{
struct kvm_run *run = vcpu->run;
- if (ARM_SERROR_PENDING(exception_index)) {
- u8 esr_ec = ESR_ELx_EC(kvm_vcpu_get_esr(vcpu));
-
- /*
- * HVC/SMC already have an adjusted PC, which we need
- * to correct in order to return to after having
- * injected the SError.
- */
- if (esr_ec == ESR_ELx_EC_HVC32 || esr_ec == ESR_ELx_EC_HVC64 ||
- esr_ec == ESR_ELx_EC_SMC32 || esr_ec == ESR_ELx_EC_SMC64) {
- u32 adj = kvm_vcpu_trap_il_is32bit(vcpu) ? 4 : 2;
- *vcpu_pc(vcpu) -= adj;
- }
-
- return 1;
- }
-
exception_index = ARM_EXCEPTION_CODE(exception_index);
switch (exception_index) {
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index 4a81eddabcd8..687598e41b21 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -10,4 +10,4 @@ subdir-ccflags-y := -I$(incdir) \
-DDISABLE_BRANCH_PROFILING \
$(DISABLE_STACKLEAK_PLUGIN)
-obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o smccc_wa.o
+obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o
diff --git a/arch/arm64/kvm/hyp/aarch32.c b/arch/arm64/kvm/hyp/aarch32.c
index ae56d8a4b382..f98cbe2626a1 100644
--- a/arch/arm64/kvm/hyp/aarch32.c
+++ b/arch/arm64/kvm/hyp/aarch32.c
@@ -123,13 +123,13 @@ static void kvm_adjust_itstate(struct kvm_vcpu *vcpu)
* kvm_skip_instr - skip a trapped instruction and proceed to the next
* @vcpu: The vcpu pointer
*/
-void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr)
+void kvm_skip_instr32(struct kvm_vcpu *vcpu)
{
u32 pc = *vcpu_pc(vcpu);
bool is_thumb;
is_thumb = !!(*vcpu_cpsr(vcpu) & PSR_AA32_T_BIT);
- if (is_thumb && !is_wide_instr)
+ if (is_thumb && !kvm_vcpu_trap_il_is32bit(vcpu))
pc += 2;
else
pc += 4;
diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c
new file mode 100644
index 000000000000..73629094f903
--- /dev/null
+++ b/arch/arm64/kvm/hyp/exception.c
@@ -0,0 +1,331 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Fault injection for both 32 and 64bit guests.
+ *
+ * Copyright (C) 2012,2013 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * Based on arch/arm/kvm/emulate.c
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Author: Christoffer Dall <c.dall@virtualopensystems.com>
+ */
+
+#include <hyp/adjust_pc.h>
+#include <linux/kvm_host.h>
+#include <asm/kvm_emulate.h>
+
+#if !defined (__KVM_NVHE_HYPERVISOR__) && !defined (__KVM_VHE_HYPERVISOR__)
+#error Hypervisor code only!
+#endif
+
+static inline u64 __vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
+{
+ u64 val;
+
+ if (__vcpu_read_sys_reg_from_cpu(reg, &val))
+ return val;
+
+ return __vcpu_sys_reg(vcpu, reg);
+}
+
+static inline void __vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
+{
+ if (__vcpu_write_sys_reg_to_cpu(val, reg))
+ return;
+
+ __vcpu_sys_reg(vcpu, reg) = val;
+}
+
+static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, u64 val)
+{
+ write_sysreg_el1(val, SYS_SPSR);
+}
+
+static void __vcpu_write_spsr_abt(struct kvm_vcpu *vcpu, u64 val)
+{
+ if (has_vhe())
+ write_sysreg(val, spsr_abt);
+ else
+ vcpu->arch.ctxt.spsr_abt = val;
+}
+
+static void __vcpu_write_spsr_und(struct kvm_vcpu *vcpu, u64 val)
+{
+ if (has_vhe())
+ write_sysreg(val, spsr_und);
+ else
+ vcpu->arch.ctxt.spsr_und = val;
+}
+
+/*
+ * This performs the exception entry at a given EL (@target_mode), stashing PC
+ * and PSTATE into ELR and SPSR respectively, and compute the new PC/PSTATE.
+ * The EL passed to this function *must* be a non-secure, privileged mode with
+ * bit 0 being set (PSTATE.SP == 1).
+ *
+ * When an exception is taken, most PSTATE fields are left unchanged in the
+ * handler. However, some are explicitly overridden (e.g. M[4:0]). Luckily all
+ * of the inherited bits have the same position in the AArch64/AArch32 SPSR_ELx
+ * layouts, so we don't need to shuffle these for exceptions from AArch32 EL0.
+ *
+ * For the SPSR_ELx layout for AArch64, see ARM DDI 0487E.a page C5-429.
+ * For the SPSR_ELx layout for AArch32, see ARM DDI 0487E.a page C5-426.
+ *
+ * Here we manipulate the fields in order of the AArch64 SPSR_ELx layout, from
+ * MSB to LSB.
+ */
+static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode,
+ enum exception_type type)
+{
+ unsigned long sctlr, vbar, old, new, mode;
+ u64 exc_offset;
+
+ mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT);
+
+ if (mode == target_mode)
+ exc_offset = CURRENT_EL_SP_ELx_VECTOR;
+ else if ((mode | PSR_MODE_THREAD_BIT) == target_mode)
+ exc_offset = CURRENT_EL_SP_EL0_VECTOR;
+ else if (!(mode & PSR_MODE32_BIT))
+ exc_offset = LOWER_EL_AArch64_VECTOR;
+ else
+ exc_offset = LOWER_EL_AArch32_VECTOR;
+
+ switch (target_mode) {
+ case PSR_MODE_EL1h:
+ vbar = __vcpu_read_sys_reg(vcpu, VBAR_EL1);
+ sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL1);
+ __vcpu_write_sys_reg(vcpu, *vcpu_pc(vcpu), ELR_EL1);
+ break;
+ default:
+ /* Don't do that */
+ BUG();
+ }
+
+ *vcpu_pc(vcpu) = vbar + exc_offset + type;
+
+ old = *vcpu_cpsr(vcpu);
+ new = 0;
+
+ new |= (old & PSR_N_BIT);
+ new |= (old & PSR_Z_BIT);
+ new |= (old & PSR_C_BIT);
+ new |= (old & PSR_V_BIT);
+
+ // TODO: TCO (if/when ARMv8.5-MemTag is exposed to guests)
+
+ new |= (old & PSR_DIT_BIT);
+
+ // PSTATE.UAO is set to zero upon any exception to AArch64
+ // See ARM DDI 0487E.a, page D5-2579.
+
+ // PSTATE.PAN is unchanged unless SCTLR_ELx.SPAN == 0b0
+ // SCTLR_ELx.SPAN is RES1 when ARMv8.1-PAN is not implemented
+ // See ARM DDI 0487E.a, page D5-2578.
+ new |= (old & PSR_PAN_BIT);
+ if (!(sctlr & SCTLR_EL1_SPAN))
+ new |= PSR_PAN_BIT;
+
+ // PSTATE.SS is set to zero upon any exception to AArch64
+ // See ARM DDI 0487E.a, page D2-2452.
+
+ // PSTATE.IL is set to zero upon any exception to AArch64
+ // See ARM DDI 0487E.a, page D1-2306.
+
+ // PSTATE.SSBS is set to SCTLR_ELx.DSSBS upon any exception to AArch64
+ // See ARM DDI 0487E.a, page D13-3258
+ if (sctlr & SCTLR_ELx_DSSBS)
+ new |= PSR_SSBS_BIT;
+
+ // PSTATE.BTYPE is set to zero upon any exception to AArch64
+ // See ARM DDI 0487E.a, pages D1-2293 to D1-2294.
+
+ new |= PSR_D_BIT;
+ new |= PSR_A_BIT;
+ new |= PSR_I_BIT;
+ new |= PSR_F_BIT;
+
+ new |= target_mode;
+
+ *vcpu_cpsr(vcpu) = new;
+ __vcpu_write_spsr(vcpu, old);
+}
+
+/*
+ * When an exception is taken, most CPSR fields are left unchanged in the
+ * handler. However, some are explicitly overridden (e.g. M[4:0]).
+ *
+ * The SPSR/SPSR_ELx layouts differ, and the below is intended to work with
+ * either format. Note: SPSR.J bit doesn't exist in SPSR_ELx, but this bit was
+ * obsoleted by the ARMv7 virtualization extensions and is RES0.
+ *
+ * For the SPSR layout seen from AArch32, see:
+ * - ARM DDI 0406C.d, page B1-1148
+ * - ARM DDI 0487E.a, page G8-6264
+ *
+ * For the SPSR_ELx layout for AArch32 seen from AArch64, see:
+ * - ARM DDI 0487E.a, page C5-426
+ *
+ * Here we manipulate the fields in order of the AArch32 SPSR_ELx layout, from
+ * MSB to LSB.
+ */
+static unsigned long get_except32_cpsr(struct kvm_vcpu *vcpu, u32 mode)
+{
+ u32 sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL1);
+ unsigned long old, new;
+
+ old = *vcpu_cpsr(vcpu);
+ new = 0;
+
+ new |= (old & PSR_AA32_N_BIT);
+ new |= (old & PSR_AA32_Z_BIT);
+ new |= (old & PSR_AA32_C_BIT);
+ new |= (old & PSR_AA32_V_BIT);
+ new |= (old & PSR_AA32_Q_BIT);
+
+ // CPSR.IT[7:0] are set to zero upon any exception
+ // See ARM DDI 0487E.a, section G1.12.3
+ // See ARM DDI 0406C.d, section B1.8.3
+
+ new |= (old & PSR_AA32_DIT_BIT);
+
+ // CPSR.SSBS is set to SCTLR.DSSBS upon any exception
+ // See ARM DDI 0487E.a, page G8-6244
+ if (sctlr & BIT(31))
+ new |= PSR_AA32_SSBS_BIT;
+
+ // CPSR.PAN is unchanged unless SCTLR.SPAN == 0b0
+ // SCTLR.SPAN is RES1 when ARMv8.1-PAN is not implemented
+ // See ARM DDI 0487E.a, page G8-6246
+ new |= (old & PSR_AA32_PAN_BIT);
+ if (!(sctlr & BIT(23)))
+ new |= PSR_AA32_PAN_BIT;
+
+ // SS does not exist in AArch32, so ignore
+
+ // CPSR.IL is set to zero upon any exception
+ // See ARM DDI 0487E.a, page G1-5527
+
+ new |= (old & PSR_AA32_GE_MASK);
+
+ // CPSR.IT[7:0] are set to zero upon any exception
+ // See prior comment above
+
+ // CPSR.E is set to SCTLR.EE upon any exception
+ // See ARM DDI 0487E.a, page G8-6245
+ // See ARM DDI 0406C.d, page B4-1701
+ if (sctlr & BIT(25))
+ new |= PSR_AA32_E_BIT;
+
+ // CPSR.A is unchanged upon an exception to Undefined, Supervisor
+ // CPSR.A is set upon an exception to other modes
+ // See ARM DDI 0487E.a, pages G1-5515 to G1-5516
+ // See ARM DDI 0406C.d, page B1-1182
+ new |= (old & PSR_AA32_A_BIT);
+ if (mode != PSR_AA32_MODE_UND && mode != PSR_AA32_MODE_SVC)
+ new |= PSR_AA32_A_BIT;
+
+ // CPSR.I is set upon any exception
+ // See ARM DDI 0487E.a, pages G1-5515 to G1-5516
+ // See ARM DDI 0406C.d, page B1-1182
+ new |= PSR_AA32_I_BIT;
+
+ // CPSR.F is set upon an exception to FIQ
+ // CPSR.F is unchanged upon an exception to other modes
+ // See ARM DDI 0487E.a, pages G1-5515 to G1-5516
+ // See ARM DDI 0406C.d, page B1-1182
+ new |= (old & PSR_AA32_F_BIT);
+ if (mode == PSR_AA32_MODE_FIQ)
+ new |= PSR_AA32_F_BIT;
+
+ // CPSR.T is set to SCTLR.TE upon any exception
+ // See ARM DDI 0487E.a, page G8-5514
+ // See ARM DDI 0406C.d, page B1-1181
+ if (sctlr & BIT(30))
+ new |= PSR_AA32_T_BIT;
+
+ new |= mode;
+
+ return new;
+}
+
+/*
+ * Table taken from ARMv8 ARM DDI0487B-B, table G1-10.
+ */
+static const u8 return_offsets[8][2] = {
+ [0] = { 0, 0 }, /* Reset, unused */
+ [1] = { 4, 2 }, /* Undefined */
+ [2] = { 0, 0 }, /* SVC, unused */
+ [3] = { 4, 4 }, /* Prefetch abort */
+ [4] = { 8, 8 }, /* Data abort */
+ [5] = { 0, 0 }, /* HVC, unused */
+ [6] = { 4, 4 }, /* IRQ, unused */
+ [7] = { 4, 4 }, /* FIQ, unused */
+};
+
+static void enter_exception32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset)
+{
+ unsigned long spsr = *vcpu_cpsr(vcpu);
+ bool is_thumb = (spsr & PSR_AA32_T_BIT);
+ u32 sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL1);
+ u32 return_address;
+
+ *vcpu_cpsr(vcpu) = get_except32_cpsr(vcpu, mode);
+ return_address = *vcpu_pc(vcpu);
+ return_address += return_offsets[vect_offset >> 2][is_thumb];
+
+ /* KVM only enters the ABT and UND modes, so only deal with those */
+ switch(mode) {
+ case PSR_AA32_MODE_ABT:
+ __vcpu_write_spsr_abt(vcpu, host_spsr_to_spsr32(spsr));
+ vcpu_gp_regs(vcpu)->compat_lr_abt = return_address;
+ break;
+
+ case PSR_AA32_MODE_UND:
+ __vcpu_write_spsr_und(vcpu, host_spsr_to_spsr32(spsr));
+ vcpu_gp_regs(vcpu)->compat_lr_und = return_address;
+ break;
+ }
+
+ /* Branch to exception vector */
+ if (sctlr & (1 << 13))
+ vect_offset += 0xffff0000;
+ else /* always have security exceptions */
+ vect_offset += __vcpu_read_sys_reg(vcpu, VBAR_EL1);
+
+ *vcpu_pc(vcpu) = vect_offset;
+}
+
+void kvm_inject_exception(struct kvm_vcpu *vcpu)
+{
+ if (vcpu_el1_is_32bit(vcpu)) {
+ switch (vcpu->arch.flags & KVM_ARM64_EXCEPT_MASK) {
+ case KVM_ARM64_EXCEPT_AA32_UND:
+ enter_exception32(vcpu, PSR_AA32_MODE_UND, 4);
+ break;
+ case KVM_ARM64_EXCEPT_AA32_IABT:
+ enter_exception32(vcpu, PSR_AA32_MODE_ABT, 12);
+ break;
+ case KVM_ARM64_EXCEPT_AA32_DABT:
+ enter_exception32(vcpu, PSR_AA32_MODE_ABT, 16);
+ break;
+ default:
+ /* Err... */
+ break;
+ }
+ } else {
+ switch (vcpu->arch.flags & KVM_ARM64_EXCEPT_MASK) {
+ case (KVM_ARM64_EXCEPT_AA64_ELx_SYNC |
+ KVM_ARM64_EXCEPT_AA64_EL1):
+ enter_exception64(vcpu, PSR_MODE_EL1h, except_type_sync);
+ break;
+ default:
+ /*
+ * Only EL1_SYNC makes sense so far, EL2_{SYNC,IRQ}
+ * will be implemented at some point. Everything
+ * else gets silently ignored.
+ */
+ break;
+ }
+ }
+}
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index 0a5b36eb54b3..d179056e1af8 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -13,6 +13,7 @@
#include <asm/kvm_arm.h>
#include <asm/kvm_asm.h>
#include <asm/mmu.h>
+#include <asm/spectre.h>
.macro save_caller_saved_regs_vect
/* x0 and x1 were saved in the vector entry */
@@ -187,52 +188,60 @@ SYM_CODE_START(__kvm_hyp_vector)
valid_vect el1_error // Error 32-bit EL1
SYM_CODE_END(__kvm_hyp_vector)
-.macro hyp_ventry
- .align 7
+.macro spectrev2_smccc_wa1_smc
+ sub sp, sp, #(8 * 4)
+ stp x2, x3, [sp, #(8 * 0)]
+ stp x0, x1, [sp, #(8 * 2)]
+ mov w0, #ARM_SMCCC_ARCH_WORKAROUND_1
+ smc #0
+ ldp x2, x3, [sp, #(8 * 0)]
+ add sp, sp, #(8 * 2)
+.endm
+
+.macro hyp_ventry indirect, spectrev2
+ .align 7
1: esb
- .rept 26
- nop
- .endr
-/*
- * The default sequence is to directly branch to the KVM vectors,
- * using the computed offset. This applies for VHE as well as
- * !ARM64_HARDEN_EL2_VECTORS. The first vector must always run the preamble.
- *
- * For ARM64_HARDEN_EL2_VECTORS configurations, this gets replaced
- * with:
- *
- * stp x0, x1, [sp, #-16]!
- * movz x0, #(addr & 0xffff)
- * movk x0, #((addr >> 16) & 0xffff), lsl #16
- * movk x0, #((addr >> 32) & 0xffff), lsl #32
- * br x0
- *
- * Where:
- * addr = kern_hyp_va(__kvm_hyp_vector) + vector-offset + KVM_VECTOR_PREAMBLE.
- * See kvm_patch_vector_branch for details.
- */
-alternative_cb kvm_patch_vector_branch
+ .if \spectrev2 != 0
+ spectrev2_smccc_wa1_smc
+ .else
stp x0, x1, [sp, #-16]!
- b __kvm_hyp_vector + (1b - 0b + KVM_VECTOR_PREAMBLE)
+ .endif
+ .if \indirect != 0
+ alternative_cb kvm_patch_vector_branch
+ /*
+ * For ARM64_SPECTRE_V3A configurations, these NOPs get replaced with:
+ *
+ * movz x0, #(addr & 0xffff)
+ * movk x0, #((addr >> 16) & 0xffff), lsl #16
+ * movk x0, #((addr >> 32) & 0xffff), lsl #32
+ * br x0
+ *
+ * Where:
+ * addr = kern_hyp_va(__kvm_hyp_vector) + vector-offset + KVM_VECTOR_PREAMBLE.
+ * See kvm_patch_vector_branch for details.
+ */
nop
nop
nop
-alternative_cb_end
+ nop
+ alternative_cb_end
+ .endif
+ b __kvm_hyp_vector + (1b - 0b + KVM_VECTOR_PREAMBLE)
.endm
-.macro generate_vectors
+.macro generate_vectors indirect, spectrev2
0:
.rept 16
- hyp_ventry
+ hyp_ventry \indirect, \spectrev2
.endr
.org 0b + SZ_2K // Safety measure
.endm
.align 11
SYM_CODE_START(__bp_harden_hyp_vecs)
- .rept BP_HARDEN_EL2_SLOTS
- generate_vectors
- .endr
+ generate_vectors indirect = 0, spectrev2 = 1 // HYP_VECTOR_SPECTRE_DIRECT
+ generate_vectors indirect = 1, spectrev2 = 0 // HYP_VECTOR_INDIRECT
+ generate_vectors indirect = 1, spectrev2 = 1 // HYP_VECTOR_SPECTRE_INDIRECT
1: .org __bp_harden_hyp_vecs + __BP_HARDEN_HYP_VECS_SZ
.org 1b
SYM_CODE_END(__bp_harden_hyp_vecs)
diff --git a/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h b/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h
new file mode 100644
index 000000000000..b1f60923a8fe
--- /dev/null
+++ b/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Guest PC manipulation helpers
+ *
+ * Copyright (C) 2012,2013 - ARM Ltd
+ * Copyright (C) 2020 - Google LLC
+ * Author: Marc Zyngier <maz@kernel.org>
+ */
+
+#ifndef __ARM64_KVM_HYP_ADJUST_PC_H__
+#define __ARM64_KVM_HYP_ADJUST_PC_H__
+
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_host.h>
+
+void kvm_inject_exception(struct kvm_vcpu *vcpu);
+
+static inline void kvm_skip_instr(struct kvm_vcpu *vcpu)
+{
+ if (vcpu_mode_is_32bit(vcpu)) {
+ kvm_skip_instr32(vcpu);
+ } else {
+ *vcpu_pc(vcpu) += 4;
+ *vcpu_cpsr(vcpu) &= ~PSR_BTYPE_MASK;
+ }
+
+ /* advance the singlestep state machine */
+ *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS;
+}
+
+/*
+ * Skip an instruction which has been emulated at hyp while most guest sysregs
+ * are live.
+ */
+static inline void __kvm_skip_instr(struct kvm_vcpu *vcpu)
+{
+ *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR);
+ vcpu_gp_regs(vcpu)->pstate = read_sysreg_el2(SYS_SPSR);
+
+ kvm_skip_instr(vcpu);
+
+ write_sysreg_el2(vcpu_gp_regs(vcpu)->pstate, SYS_SPSR);
+ write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR);
+}
+
+/*
+ * Adjust the guest PC on entry, depending on flags provided by EL1
+ * for the purpose of emulation (MMIO, sysreg) or exception injection.
+ */
+static inline void __adjust_pc(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.flags & KVM_ARM64_PENDING_EXCEPTION) {
+ kvm_inject_exception(vcpu);
+ vcpu->arch.flags &= ~(KVM_ARM64_PENDING_EXCEPTION |
+ KVM_ARM64_EXCEPT_MASK);
+ } else if (vcpu->arch.flags & KVM_ARM64_INCREMENT_PC) {
+ kvm_skip_instr(vcpu);
+ vcpu->arch.flags &= ~KVM_ARM64_INCREMENT_PC;
+ }
+}
+
+#endif
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 1f875a8f20c4..84473574c2e7 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -7,6 +7,8 @@
#ifndef __ARM64_KVM_HYP_SWITCH_H__
#define __ARM64_KVM_HYP_SWITCH_H__
+#include <hyp/adjust_pc.h>
+
#include <linux/arm-smccc.h>
#include <linux/kvm_host.h>
#include <linux/types.h>
@@ -409,6 +411,21 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ)
vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR);
+ if (ARM_SERROR_PENDING(*exit_code)) {
+ u8 esr_ec = kvm_vcpu_trap_get_class(vcpu);
+
+ /*
+ * HVC already have an adjusted PC, which we need to
+ * correct in order to return to after having injected
+ * the SError.
+ *
+ * SMC, on the other hand, is *trapped*, meaning its
+ * preferred return address is the SMC itself.
+ */
+ if (esr_ec == ESR_ELx_EC_HVC32 || esr_ec == ESR_ELx_EC_HVC64)
+ write_sysreg_el2(read_sysreg_el2(SYS_ELR) - 4, SYS_ELR);
+ }
+
/*
* We're using the raw exception code in order to only process
* the trap if no SError is pending. We will come back to the
diff --git a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h
new file mode 100644
index 000000000000..1e6d995968a1
--- /dev/null
+++ b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Trap handler helpers.
+ *
+ * Copyright (C) 2020 - Google LLC
+ * Author: Marc Zyngier <maz@kernel.org>
+ */
+
+#ifndef __ARM64_KVM_NVHE_TRAP_HANDLER_H__
+#define __ARM64_KVM_NVHE_TRAP_HANDLER_H__
+
+#include <asm/kvm_host.h>
+
+#define cpu_reg(ctxt, r) (ctxt)->regs.regs[r]
+#define DECLARE_REG(type, name, ctxt, reg) \
+ type name = (type)cpu_reg(ctxt, (reg))
+
+#endif /* __ARM64_KVM_NVHE_TRAP_HANDLER_H__ */
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index ddde15fe85f2..1f1e351c5fe2 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -6,9 +6,10 @@
asflags-y := -D__KVM_NVHE_HYPERVISOR__
ccflags-y := -D__KVM_NVHE_HYPERVISOR__
-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o hyp-main.o
+obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \
+ hyp-main.o hyp-smp.o psci-relay.o
obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
- ../fpsimd.o ../hyp-entry.o
+ ../fpsimd.o ../hyp-entry.o ../exception.o
##
## Build rules for compiling nVHE hyp code
diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
index ed27f06a31ba..a820dfdc9c25 100644
--- a/arch/arm64/kvm/hyp/nvhe/host.S
+++ b/arch/arm64/kvm/hyp/nvhe/host.S
@@ -13,8 +13,6 @@
.text
SYM_FUNC_START(__host_exit)
- stp x0, x1, [sp, #-16]!
-
get_host_ctxt x0, x1
/* Store the host regs x2 and x3 */
@@ -41,6 +39,7 @@ SYM_FUNC_START(__host_exit)
bl handle_trap
/* Restore host regs x0-x17 */
+__host_enter_restore_full:
ldp x0, x1, [x29, #CPU_XREG_OFFSET(0)]
ldp x2, x3, [x29, #CPU_XREG_OFFSET(2)]
ldp x4, x5, [x29, #CPU_XREG_OFFSET(4)]
@@ -64,6 +63,14 @@ __host_enter_without_restoring:
SYM_FUNC_END(__host_exit)
/*
+ * void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt);
+ */
+SYM_FUNC_START(__host_enter)
+ mov x29, x0
+ b __host_enter_restore_full
+SYM_FUNC_END(__host_enter)
+
+/*
* void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par);
*/
SYM_FUNC_START(__hyp_do_panic)
@@ -99,13 +106,15 @@ SYM_FUNC_END(__hyp_do_panic)
mrs x0, esr_el2
lsr x0, x0, #ESR_ELx_EC_SHIFT
cmp x0, #ESR_ELx_EC_HVC64
- ldp x0, x1, [sp], #16
b.ne __host_exit
+ ldp x0, x1, [sp] // Don't fixup the stack yet
+
/* Check for a stub HVC call */
cmp x0, #HVC_STUB_HCALL_NR
b.hs __host_exit
+ add sp, sp, #16
/*
* Compute the idmap address of __kvm_handle_stub_hvc and
* jump there. Since we use kimage_voffset, do not use the
@@ -115,10 +124,7 @@ SYM_FUNC_END(__hyp_do_panic)
* Preserve x0-x4, which may contain stub parameters.
*/
ldr x5, =__kvm_handle_stub_hvc
- ldr_l x6, kimage_voffset
-
- /* x5 = __pa(x5) */
- sub x5, x5, x6
+ kimg_pa x5, x6
br x5
.L__vect_end\@:
.if ((.L__vect_end\@ - .L__vect_start\@) > 0x80)
@@ -183,3 +189,41 @@ SYM_CODE_START(__kvm_hyp_host_vector)
invalid_host_el1_vect // FIQ 32-bit EL1
invalid_host_el1_vect // Error 32-bit EL1
SYM_CODE_END(__kvm_hyp_host_vector)
+
+/*
+ * Forward SMC with arguments in struct kvm_cpu_context, and
+ * store the result into the same struct. Assumes SMCCC 1.2 or older.
+ *
+ * x0: struct kvm_cpu_context*
+ */
+SYM_CODE_START(__kvm_hyp_host_forward_smc)
+ /*
+ * Use x18 to keep the pointer to the host context because
+ * x18 is callee-saved in SMCCC but not in AAPCS64.
+ */
+ mov x18, x0
+
+ ldp x0, x1, [x18, #CPU_XREG_OFFSET(0)]
+ ldp x2, x3, [x18, #CPU_XREG_OFFSET(2)]
+ ldp x4, x5, [x18, #CPU_XREG_OFFSET(4)]
+ ldp x6, x7, [x18, #CPU_XREG_OFFSET(6)]
+ ldp x8, x9, [x18, #CPU_XREG_OFFSET(8)]
+ ldp x10, x11, [x18, #CPU_XREG_OFFSET(10)]
+ ldp x12, x13, [x18, #CPU_XREG_OFFSET(12)]
+ ldp x14, x15, [x18, #CPU_XREG_OFFSET(14)]
+ ldp x16, x17, [x18, #CPU_XREG_OFFSET(16)]
+
+ smc #0
+
+ stp x0, x1, [x18, #CPU_XREG_OFFSET(0)]
+ stp x2, x3, [x18, #CPU_XREG_OFFSET(2)]
+ stp x4, x5, [x18, #CPU_XREG_OFFSET(4)]
+ stp x6, x7, [x18, #CPU_XREG_OFFSET(6)]
+ stp x8, x9, [x18, #CPU_XREG_OFFSET(8)]
+ stp x10, x11, [x18, #CPU_XREG_OFFSET(10)]
+ stp x12, x13, [x18, #CPU_XREG_OFFSET(12)]
+ stp x14, x15, [x18, #CPU_XREG_OFFSET(14)]
+ stp x16, x17, [x18, #CPU_XREG_OFFSET(16)]
+
+ ret
+SYM_CODE_END(__kvm_hyp_host_forward_smc)
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
index b11a9d7db677..31b060a44045 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -9,6 +9,7 @@
#include <asm/alternative.h>
#include <asm/assembler.h>
+#include <asm/el2_setup.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_mmu.h>
@@ -47,10 +48,7 @@ __invalid:
/*
* x0: SMCCC function ID
- * x1: HYP pgd
- * x2: per-CPU offset
- * x3: HYP stack
- * x4: HYP vectors
+ * x1: struct kvm_nvhe_init_params PA
*/
__do_hyp_init:
/* Check for a stub HVC call */
@@ -71,48 +69,53 @@ __do_hyp_init:
mov x0, #SMCCC_RET_NOT_SUPPORTED
eret
-1:
- /* Set tpidr_el2 for use by HYP to free a register */
- msr tpidr_el2, x2
+1: mov x0, x1
+ mov x4, lr
+ bl ___kvm_hyp_init
+ mov lr, x4
- phys_to_ttbr x0, x1
-alternative_if ARM64_HAS_CNP
- orr x0, x0, #TTBR_CNP_BIT
+ /* Hello, World! */
+ mov x0, #SMCCC_RET_SUCCESS
+ eret
+SYM_CODE_END(__kvm_hyp_init)
+
+/*
+ * Initialize the hypervisor in EL2.
+ *
+ * Only uses x0..x3 so as to not clobber callee-saved SMCCC registers
+ * and leave x4 for the caller.
+ *
+ * x0: struct kvm_nvhe_init_params PA
+ */
+SYM_CODE_START_LOCAL(___kvm_hyp_init)
+alternative_if ARM64_KVM_PROTECTED_MODE
+ mov_q x1, HCR_HOST_NVHE_PROTECTED_FLAGS
+ msr hcr_el2, x1
alternative_else_nop_endif
- msr ttbr0_el2, x0
- mrs x0, tcr_el1
- mov_q x1, TCR_EL2_MASK
- and x0, x0, x1
- mov x1, #TCR_EL2_RES1
- orr x0, x0, x1
+ ldr x1, [x0, #NVHE_INIT_TPIDR_EL2]
+ msr tpidr_el2, x1
- /*
- * The ID map may be configured to use an extended virtual address
- * range. This is only the case if system RAM is out of range for the
- * currently configured page size and VA_BITS, in which case we will
- * also need the extended virtual range for the HYP ID map, or we won't
- * be able to enable the EL2 MMU.
- *
- * However, at EL2, there is only one TTBR register, and we can't switch
- * between translation tables *and* update TCR_EL2.T0SZ at the same
- * time. Bottom line: we need to use the extended range with *both* our
- * translation tables.
- *
- * So use the same T0SZ value we use for the ID map.
- */
- ldr_l x1, idmap_t0sz
- bfi x0, x1, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH
+ ldr x1, [x0, #NVHE_INIT_STACK_HYP_VA]
+ mov sp, x1
+
+ ldr x1, [x0, #NVHE_INIT_MAIR_EL2]
+ msr mair_el2, x1
+
+ ldr x1, [x0, #NVHE_INIT_PGD_PA]
+ phys_to_ttbr x2, x1
+alternative_if ARM64_HAS_CNP
+ orr x2, x2, #TTBR_CNP_BIT
+alternative_else_nop_endif
+ msr ttbr0_el2, x2
/*
* Set the PS bits in TCR_EL2.
*/
- tcr_compute_pa_size x0, #TCR_EL2_PS_SHIFT, x1, x2
+ ldr x1, [x0, #NVHE_INIT_TCR_EL2]
+ tcr_compute_pa_size x1, #TCR_EL2_PS_SHIFT, x2, x3
+ msr tcr_el2, x1
- msr tcr_el2, x0
-
- mrs x0, mair_el1
- msr mair_el2, x0
isb
/* Invalidate the stale TLBs from Bootloader */
@@ -134,14 +137,70 @@ alternative_else_nop_endif
msr sctlr_el2, x0
isb
- /* Set the stack and new vectors */
- mov sp, x3
- msr vbar_el2, x4
+ /* Set the host vector */
+ ldr x0, =__kvm_hyp_host_vector
+ kimg_hyp_va x0, x1
+ msr vbar_el2, x0
- /* Hello, World! */
- mov x0, #SMCCC_RET_SUCCESS
- eret
-SYM_CODE_END(__kvm_hyp_init)
+ ret
+SYM_CODE_END(___kvm_hyp_init)
+
+/*
+ * PSCI CPU_ON entry point
+ *
+ * x0: struct kvm_nvhe_init_params PA
+ */
+SYM_CODE_START(kvm_hyp_cpu_entry)
+ mov x1, #1 // is_cpu_on = true
+ b __kvm_hyp_init_cpu
+SYM_CODE_END(kvm_hyp_cpu_entry)
+
+/*
+ * PSCI CPU_SUSPEND / SYSTEM_SUSPEND entry point
+ *
+ * x0: struct kvm_nvhe_init_params PA
+ */
+SYM_CODE_START(kvm_hyp_cpu_resume)
+ mov x1, #0 // is_cpu_on = false
+ b __kvm_hyp_init_cpu
+SYM_CODE_END(kvm_hyp_cpu_resume)
+
+/*
+ * Common code for CPU entry points. Initializes EL2 state and
+ * installs the hypervisor before handing over to a C handler.
+ *
+ * x0: struct kvm_nvhe_init_params PA
+ * x1: bool is_cpu_on
+ */
+SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu)
+ mov x28, x0 // Stash arguments
+ mov x29, x1
+
+ /* Check that the core was booted in EL2. */
+ mrs x0, CurrentEL
+ cmp x0, #CurrentEL_EL2
+ b.eq 2f
+
+ /* The core booted in EL1. KVM cannot be initialized on it. */
+1: wfe
+ wfi
+ b 1b
+
+2: msr SPsel, #1 // We want to use SP_EL{1,2}
+
+ /* Initialize EL2 CPU state to sane values. */
+ init_el2_state nvhe // Clobbers x0..x2
+
+ /* Enable MMU, set vectors and stack. */
+ mov x0, x28
+ bl ___kvm_hyp_init // Clobbers x0..x3
+
+ /* Leave idmap. */
+ mov x0, x29
+ ldr x1, =kvm_host_psci_cpu_entry
+ kimg_hyp_va x1, x2
+ br x1
+SYM_CODE_END(__kvm_hyp_init_cpu)
SYM_CODE_START(__kvm_handle_stub_hvc)
cmp x0, #HVC_SOFT_RESTART
@@ -176,6 +235,11 @@ reset:
msr sctlr_el2, x5
isb
+alternative_if ARM64_KVM_PROTECTED_MODE
+ mov_q x5, HCR_HOST_NVHE_FLAGS
+ msr hcr_el2, x5
+alternative_else_nop_endif
+
/* Install stub vectors */
adr_l x5, __hyp_stub_vectors
msr vbar_el2, x5
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index e2eafe2c93af..bde658d51404 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -12,106 +12,183 @@
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
-#include <kvm/arm_hypercalls.h>
+#include <nvhe/trap_handler.h>
-static void handle_host_hcall(unsigned long func_id,
- struct kvm_cpu_context *host_ctxt)
+DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
+
+void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt);
+
+static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt)
{
- unsigned long ret = 0;
+ DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1);
- switch (func_id) {
- case KVM_HOST_SMCCC_FUNC(__kvm_vcpu_run): {
- unsigned long r1 = host_ctxt->regs.regs[1];
- struct kvm_vcpu *vcpu = (struct kvm_vcpu *)r1;
+ cpu_reg(host_ctxt, 1) = __kvm_vcpu_run(kern_hyp_va(vcpu));
+}
- ret = __kvm_vcpu_run(kern_hyp_va(vcpu));
- break;
- }
- case KVM_HOST_SMCCC_FUNC(__kvm_flush_vm_context):
- __kvm_flush_vm_context();
- break;
- case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_vmid_ipa): {
- unsigned long r1 = host_ctxt->regs.regs[1];
- struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1;
- phys_addr_t ipa = host_ctxt->regs.regs[2];
- int level = host_ctxt->regs.regs[3];
+static void handle___kvm_flush_vm_context(struct kvm_cpu_context *host_ctxt)
+{
+ __kvm_flush_vm_context();
+}
- __kvm_tlb_flush_vmid_ipa(kern_hyp_va(mmu), ipa, level);
- break;
- }
- case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_vmid): {
- unsigned long r1 = host_ctxt->regs.regs[1];
- struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1;
+static void handle___kvm_tlb_flush_vmid_ipa(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1);
+ DECLARE_REG(phys_addr_t, ipa, host_ctxt, 2);
+ DECLARE_REG(int, level, host_ctxt, 3);
- __kvm_tlb_flush_vmid(kern_hyp_va(mmu));
- break;
- }
- case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_local_vmid): {
- unsigned long r1 = host_ctxt->regs.regs[1];
- struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1;
+ __kvm_tlb_flush_vmid_ipa(kern_hyp_va(mmu), ipa, level);
+}
- __kvm_tlb_flush_local_vmid(kern_hyp_va(mmu));
- break;
- }
- case KVM_HOST_SMCCC_FUNC(__kvm_timer_set_cntvoff): {
- u64 cntvoff = host_ctxt->regs.regs[1];
+static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1);
- __kvm_timer_set_cntvoff(cntvoff);
- break;
- }
- case KVM_HOST_SMCCC_FUNC(__kvm_enable_ssbs):
- __kvm_enable_ssbs();
- break;
- case KVM_HOST_SMCCC_FUNC(__vgic_v3_get_ich_vtr_el2):
- ret = __vgic_v3_get_ich_vtr_el2();
- break;
- case KVM_HOST_SMCCC_FUNC(__vgic_v3_read_vmcr):
- ret = __vgic_v3_read_vmcr();
- break;
- case KVM_HOST_SMCCC_FUNC(__vgic_v3_write_vmcr): {
- u32 vmcr = host_ctxt->regs.regs[1];
+ __kvm_tlb_flush_vmid(kern_hyp_va(mmu));
+}
- __vgic_v3_write_vmcr(vmcr);
- break;
- }
- case KVM_HOST_SMCCC_FUNC(__vgic_v3_init_lrs):
- __vgic_v3_init_lrs();
- break;
- case KVM_HOST_SMCCC_FUNC(__kvm_get_mdcr_el2):
- ret = __kvm_get_mdcr_el2();
- break;
- case KVM_HOST_SMCCC_FUNC(__vgic_v3_save_aprs): {
- unsigned long r1 = host_ctxt->regs.regs[1];
- struct vgic_v3_cpu_if *cpu_if = (struct vgic_v3_cpu_if *)r1;
+static void handle___kvm_tlb_flush_local_vmid(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1);
- __vgic_v3_save_aprs(kern_hyp_va(cpu_if));
- break;
- }
- case KVM_HOST_SMCCC_FUNC(__vgic_v3_restore_aprs): {
- unsigned long r1 = host_ctxt->regs.regs[1];
- struct vgic_v3_cpu_if *cpu_if = (struct vgic_v3_cpu_if *)r1;
+ __kvm_tlb_flush_local_vmid(kern_hyp_va(mmu));
+}
- __vgic_v3_restore_aprs(kern_hyp_va(cpu_if));
- break;
- }
- default:
- /* Invalid host HVC. */
- host_ctxt->regs.regs[0] = SMCCC_RET_NOT_SUPPORTED;
- return;
- }
+static void handle___kvm_timer_set_cntvoff(struct kvm_cpu_context *host_ctxt)
+{
+ __kvm_timer_set_cntvoff(cpu_reg(host_ctxt, 1));
+}
+
+static void handle___kvm_enable_ssbs(struct kvm_cpu_context *host_ctxt)
+{
+ u64 tmp;
- host_ctxt->regs.regs[0] = SMCCC_RET_SUCCESS;
- host_ctxt->regs.regs[1] = ret;
+ tmp = read_sysreg_el2(SYS_SCTLR);
+ tmp |= SCTLR_ELx_DSSBS;
+ write_sysreg_el2(tmp, SYS_SCTLR);
+}
+
+static void handle___vgic_v3_get_ich_vtr_el2(struct kvm_cpu_context *host_ctxt)
+{
+ cpu_reg(host_ctxt, 1) = __vgic_v3_get_ich_vtr_el2();
+}
+
+static void handle___vgic_v3_read_vmcr(struct kvm_cpu_context *host_ctxt)
+{
+ cpu_reg(host_ctxt, 1) = __vgic_v3_read_vmcr();
+}
+
+static void handle___vgic_v3_write_vmcr(struct kvm_cpu_context *host_ctxt)
+{
+ __vgic_v3_write_vmcr(cpu_reg(host_ctxt, 1));
+}
+
+static void handle___vgic_v3_init_lrs(struct kvm_cpu_context *host_ctxt)
+{
+ __vgic_v3_init_lrs();
+}
+
+static void handle___kvm_get_mdcr_el2(struct kvm_cpu_context *host_ctxt)
+{
+ cpu_reg(host_ctxt, 1) = __kvm_get_mdcr_el2();
+}
+
+static void handle___vgic_v3_save_aprs(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1);
+
+ __vgic_v3_save_aprs(kern_hyp_va(cpu_if));
+}
+
+static void handle___vgic_v3_restore_aprs(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1);
+
+ __vgic_v3_restore_aprs(kern_hyp_va(cpu_if));
+}
+
+typedef void (*hcall_t)(struct kvm_cpu_context *);
+
+#define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = kimg_fn_ptr(handle_##x)
+
+static const hcall_t *host_hcall[] = {
+ HANDLE_FUNC(__kvm_vcpu_run),
+ HANDLE_FUNC(__kvm_flush_vm_context),
+ HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa),
+ HANDLE_FUNC(__kvm_tlb_flush_vmid),
+ HANDLE_FUNC(__kvm_tlb_flush_local_vmid),
+ HANDLE_FUNC(__kvm_timer_set_cntvoff),
+ HANDLE_FUNC(__kvm_enable_ssbs),
+ HANDLE_FUNC(__vgic_v3_get_ich_vtr_el2),
+ HANDLE_FUNC(__vgic_v3_read_vmcr),
+ HANDLE_FUNC(__vgic_v3_write_vmcr),
+ HANDLE_FUNC(__vgic_v3_init_lrs),
+ HANDLE_FUNC(__kvm_get_mdcr_el2),
+ HANDLE_FUNC(__vgic_v3_save_aprs),
+ HANDLE_FUNC(__vgic_v3_restore_aprs),
+};
+
+static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(unsigned long, id, host_ctxt, 0);
+ const hcall_t *kfn;
+ hcall_t hfn;
+
+ id -= KVM_HOST_SMCCC_ID(0);
+
+ if (unlikely(id >= ARRAY_SIZE(host_hcall)))
+ goto inval;
+
+ kfn = host_hcall[id];
+ if (unlikely(!kfn))
+ goto inval;
+
+ cpu_reg(host_ctxt, 0) = SMCCC_RET_SUCCESS;
+
+ hfn = kimg_fn_hyp_va(kfn);
+ hfn(host_ctxt);
+
+ return;
+inval:
+ cpu_reg(host_ctxt, 0) = SMCCC_RET_NOT_SUPPORTED;
+}
+
+static void default_host_smc_handler(struct kvm_cpu_context *host_ctxt)
+{
+ __kvm_hyp_host_forward_smc(host_ctxt);
+}
+
+static void skip_host_instruction(void)
+{
+ write_sysreg_el2(read_sysreg_el2(SYS_ELR) + 4, SYS_ELR);
+}
+
+static void handle_host_smc(struct kvm_cpu_context *host_ctxt)
+{
+ bool handled;
+
+ handled = kvm_host_psci_handler(host_ctxt);
+ if (!handled)
+ default_host_smc_handler(host_ctxt);
+
+ /*
+ * Unlike HVC, the return address of an SMC is the instruction's PC.
+ * Move the return address past the instruction.
+ */
+ skip_host_instruction();
}
void handle_trap(struct kvm_cpu_context *host_ctxt)
{
u64 esr = read_sysreg_el2(SYS_ESR);
- unsigned long func_id;
- if (ESR_ELx_EC(esr) != ESR_ELx_EC_HVC64)
+ switch (ESR_ELx_EC(esr)) {
+ case ESR_ELx_EC_HVC64:
+ handle_host_hcall(host_ctxt);
+ break;
+ case ESR_ELx_EC_SMC64:
+ handle_host_smc(host_ctxt);
+ break;
+ default:
hyp_panic();
-
- func_id = host_ctxt->regs.regs[0];
- handle_host_hcall(func_id, host_ctxt);
+ }
}
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c
new file mode 100644
index 000000000000..cbab0c6246e2
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 - Google LLC
+ * Author: David Brazdil <dbrazdil@google.com>
+ */
+
+#include <asm/kvm_asm.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+
+/*
+ * nVHE copy of data structures tracking available CPU cores.
+ * Only entries for CPUs that were online at KVM init are populated.
+ * Other CPUs should not be allowed to boot because their features were
+ * not checked against the finalized system capabilities.
+ */
+u64 __ro_after_init __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID };
+
+u64 cpu_logical_map(unsigned int cpu)
+{
+ if (cpu >= ARRAY_SIZE(__cpu_logical_map))
+ hyp_panic();
+
+ return __cpu_logical_map[cpu];
+}
+
+unsigned long __hyp_per_cpu_offset(unsigned int cpu)
+{
+ unsigned long *cpu_base_array;
+ unsigned long this_cpu_base;
+ unsigned long elf_base;
+
+ if (cpu >= ARRAY_SIZE(kvm_arm_hyp_percpu_base))
+ hyp_panic();
+
+ cpu_base_array = (unsigned long *)hyp_symbol_addr(kvm_arm_hyp_percpu_base);
+ this_cpu_base = kern_hyp_va(cpu_base_array[cpu]);
+ elf_base = (unsigned long)hyp_symbol_addr(__per_cpu_start);
+ return this_cpu_base - elf_base;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
index a797abace13f..1206d0d754d5 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
@@ -21,4 +21,5 @@ SECTIONS {
HYP_SECTION_NAME(.data..percpu) : {
PERCPU_INPUT(L1_CACHE_BYTES)
}
+ HYP_SECTION(.data..ro_after_init)
}
diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
new file mode 100644
index 000000000000..08dc9de69314
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
@@ -0,0 +1,324 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 - Google LLC
+ * Author: David Brazdil <dbrazdil@google.com>
+ */
+
+#include <asm/kvm_asm.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+#include <kvm/arm_hypercalls.h>
+#include <linux/arm-smccc.h>
+#include <linux/kvm_host.h>
+#include <linux/psci.h>
+#include <kvm/arm_psci.h>
+#include <uapi/linux/psci.h>
+
+#include <nvhe/trap_handler.h>
+
+void kvm_hyp_cpu_entry(unsigned long r0);
+void kvm_hyp_cpu_resume(unsigned long r0);
+
+void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt);
+
+/* Config options set by the host. */
+__ro_after_init u32 kvm_host_psci_version;
+__ro_after_init struct psci_0_1_function_ids kvm_host_psci_0_1_function_ids;
+__ro_after_init s64 hyp_physvirt_offset;
+
+#define __hyp_pa(x) ((phys_addr_t)((x)) + hyp_physvirt_offset)
+
+#define INVALID_CPU_ID UINT_MAX
+
+struct psci_boot_args {
+ atomic_t lock;
+ unsigned long pc;
+ unsigned long r0;
+};
+
+#define PSCI_BOOT_ARGS_UNLOCKED 0
+#define PSCI_BOOT_ARGS_LOCKED 1
+
+#define PSCI_BOOT_ARGS_INIT \
+ ((struct psci_boot_args){ \
+ .lock = ATOMIC_INIT(PSCI_BOOT_ARGS_UNLOCKED), \
+ })
+
+static DEFINE_PER_CPU(struct psci_boot_args, cpu_on_args) = PSCI_BOOT_ARGS_INIT;
+static DEFINE_PER_CPU(struct psci_boot_args, suspend_args) = PSCI_BOOT_ARGS_INIT;
+
+static u64 get_psci_func_id(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(u64, func_id, host_ctxt, 0);
+
+ return func_id;
+}
+
+static bool is_psci_0_1_call(u64 func_id)
+{
+ return (func_id == kvm_host_psci_0_1_function_ids.cpu_suspend) ||
+ (func_id == kvm_host_psci_0_1_function_ids.cpu_on) ||
+ (func_id == kvm_host_psci_0_1_function_ids.cpu_off) ||
+ (func_id == kvm_host_psci_0_1_function_ids.migrate);
+}
+
+static bool is_psci_0_2_call(u64 func_id)
+{
+ /* SMCCC reserves IDs 0x00-1F with the given 32/64-bit base for PSCI. */
+ return (PSCI_0_2_FN(0) <= func_id && func_id <= PSCI_0_2_FN(31)) ||
+ (PSCI_0_2_FN64(0) <= func_id && func_id <= PSCI_0_2_FN64(31));
+}
+
+static bool is_psci_call(u64 func_id)
+{
+ switch (kvm_host_psci_version) {
+ case PSCI_VERSION(0, 1):
+ return is_psci_0_1_call(func_id);
+ default:
+ return is_psci_0_2_call(func_id);
+ }
+}
+
+static unsigned long psci_call(unsigned long fn, unsigned long arg0,
+ unsigned long arg1, unsigned long arg2)
+{
+ struct arm_smccc_res res;
+
+ arm_smccc_1_1_smc(fn, arg0, arg1, arg2, &res);
+ return res.a0;
+}
+
+static unsigned long psci_forward(struct kvm_cpu_context *host_ctxt)
+{
+ return psci_call(cpu_reg(host_ctxt, 0), cpu_reg(host_ctxt, 1),
+ cpu_reg(host_ctxt, 2), cpu_reg(host_ctxt, 3));
+}
+
+static __noreturn unsigned long psci_forward_noreturn(struct kvm_cpu_context *host_ctxt)
+{
+ psci_forward(host_ctxt);
+ hyp_panic(); /* unreachable */
+}
+
+static unsigned int find_cpu_id(u64 mpidr)
+{
+ unsigned int i;
+
+ /* Reject invalid MPIDRs */
+ if (mpidr & ~MPIDR_HWID_BITMASK)
+ return INVALID_CPU_ID;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ if (cpu_logical_map(i) == mpidr)
+ return i;
+ }
+
+ return INVALID_CPU_ID;
+}
+
+static __always_inline bool try_acquire_boot_args(struct psci_boot_args *args)
+{
+ return atomic_cmpxchg_acquire(&args->lock,
+ PSCI_BOOT_ARGS_UNLOCKED,
+ PSCI_BOOT_ARGS_LOCKED) ==
+ PSCI_BOOT_ARGS_UNLOCKED;
+}
+
+static __always_inline void release_boot_args(struct psci_boot_args *args)
+{
+ atomic_set_release(&args->lock, PSCI_BOOT_ARGS_UNLOCKED);
+}
+
+static int psci_cpu_on(u64 func_id, struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(u64, mpidr, host_ctxt, 1);
+ DECLARE_REG(unsigned long, pc, host_ctxt, 2);
+ DECLARE_REG(unsigned long, r0, host_ctxt, 3);
+
+ unsigned int cpu_id;
+ struct psci_boot_args *boot_args;
+ struct kvm_nvhe_init_params *init_params;
+ int ret;
+
+ /*
+ * Find the logical CPU ID for the given MPIDR. The search set is
+ * the set of CPUs that were online at the point of KVM initialization.
+ * Booting other CPUs is rejected because their cpufeatures were not
+ * checked against the finalized capabilities. This could be relaxed
+ * by doing the feature checks in hyp.
+ */
+ cpu_id = find_cpu_id(mpidr);
+ if (cpu_id == INVALID_CPU_ID)
+ return PSCI_RET_INVALID_PARAMS;
+
+ boot_args = per_cpu_ptr(hyp_symbol_addr(cpu_on_args), cpu_id);
+ init_params = per_cpu_ptr(hyp_symbol_addr(kvm_init_params), cpu_id);
+
+ /* Check if the target CPU is already being booted. */
+ if (!try_acquire_boot_args(boot_args))
+ return PSCI_RET_ALREADY_ON;
+
+ boot_args->pc = pc;
+ boot_args->r0 = r0;
+ wmb();
+
+ ret = psci_call(func_id, mpidr,
+ __hyp_pa(hyp_symbol_addr(kvm_hyp_cpu_entry)),
+ __hyp_pa(init_params));
+
+ /* If successful, the lock will be released by the target CPU. */
+ if (ret != PSCI_RET_SUCCESS)
+ release_boot_args(boot_args);
+
+ return ret;
+}
+
+static int psci_cpu_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(u64, power_state, host_ctxt, 1);
+ DECLARE_REG(unsigned long, pc, host_ctxt, 2);
+ DECLARE_REG(unsigned long, r0, host_ctxt, 3);
+
+ struct psci_boot_args *boot_args;
+ struct kvm_nvhe_init_params *init_params;
+
+ boot_args = this_cpu_ptr(hyp_symbol_addr(suspend_args));
+ init_params = this_cpu_ptr(hyp_symbol_addr(kvm_init_params));
+
+ /*
+ * No need to acquire a lock before writing to boot_args because a core
+ * can only suspend itself. Racy CPU_ON calls use a separate struct.
+ */
+ boot_args->pc = pc;
+ boot_args->r0 = r0;
+
+ /*
+ * Will either return if shallow sleep state, or wake up into the entry
+ * point if it is a deep sleep state.
+ */
+ return psci_call(func_id, power_state,
+ __hyp_pa(hyp_symbol_addr(kvm_hyp_cpu_resume)),
+ __hyp_pa(init_params));
+}
+
+static int psci_system_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(unsigned long, pc, host_ctxt, 1);
+ DECLARE_REG(unsigned long, r0, host_ctxt, 2);
+
+ struct psci_boot_args *boot_args;
+ struct kvm_nvhe_init_params *init_params;
+
+ boot_args = this_cpu_ptr(hyp_symbol_addr(suspend_args));
+ init_params = this_cpu_ptr(hyp_symbol_addr(kvm_init_params));
+
+ /*
+ * No need to acquire a lock before writing to boot_args because a core
+ * can only suspend itself. Racy CPU_ON calls use a separate struct.
+ */
+ boot_args->pc = pc;
+ boot_args->r0 = r0;
+
+ /* Will only return on error. */
+ return psci_call(func_id,
+ __hyp_pa(hyp_symbol_addr(kvm_hyp_cpu_resume)),
+ __hyp_pa(init_params), 0);
+}
+
+asmlinkage void __noreturn kvm_host_psci_cpu_entry(bool is_cpu_on)
+{
+ struct psci_boot_args *boot_args;
+ struct kvm_cpu_context *host_ctxt;
+
+ host_ctxt = &this_cpu_ptr(hyp_symbol_addr(kvm_host_data))->host_ctxt;
+
+ if (is_cpu_on)
+ boot_args = this_cpu_ptr(hyp_symbol_addr(cpu_on_args));
+ else
+ boot_args = this_cpu_ptr(hyp_symbol_addr(suspend_args));
+
+ cpu_reg(host_ctxt, 0) = boot_args->r0;
+ write_sysreg_el2(boot_args->pc, SYS_ELR);
+
+ if (is_cpu_on)
+ release_boot_args(boot_args);
+
+ __host_enter(host_ctxt);
+}
+
+static unsigned long psci_0_1_handler(u64 func_id, struct kvm_cpu_context *host_ctxt)
+{
+ if ((func_id == kvm_host_psci_0_1_function_ids.cpu_off) ||
+ (func_id == kvm_host_psci_0_1_function_ids.migrate))
+ return psci_forward(host_ctxt);
+ else if (func_id == kvm_host_psci_0_1_function_ids.cpu_on)
+ return psci_cpu_on(func_id, host_ctxt);
+ else if (func_id == kvm_host_psci_0_1_function_ids.cpu_suspend)
+ return psci_cpu_suspend(func_id, host_ctxt);
+ else
+ return PSCI_RET_NOT_SUPPORTED;
+}
+
+static unsigned long psci_0_2_handler(u64 func_id, struct kvm_cpu_context *host_ctxt)
+{
+ switch (func_id) {
+ case PSCI_0_2_FN_PSCI_VERSION:
+ case PSCI_0_2_FN_CPU_OFF:
+ case PSCI_0_2_FN64_AFFINITY_INFO:
+ case PSCI_0_2_FN64_MIGRATE:
+ case PSCI_0_2_FN_MIGRATE_INFO_TYPE:
+ case PSCI_0_2_FN64_MIGRATE_INFO_UP_CPU:
+ return psci_forward(host_ctxt);
+ case PSCI_0_2_FN_SYSTEM_OFF:
+ case PSCI_0_2_FN_SYSTEM_RESET:
+ psci_forward_noreturn(host_ctxt);
+ unreachable();
+ case PSCI_0_2_FN64_CPU_SUSPEND:
+ return psci_cpu_suspend(func_id, host_ctxt);
+ case PSCI_0_2_FN64_CPU_ON:
+ return psci_cpu_on(func_id, host_ctxt);
+ default:
+ return PSCI_RET_NOT_SUPPORTED;
+ }
+}
+
+static unsigned long psci_1_0_handler(u64 func_id, struct kvm_cpu_context *host_ctxt)
+{
+ switch (func_id) {
+ case PSCI_1_0_FN_PSCI_FEATURES:
+ case PSCI_1_0_FN_SET_SUSPEND_MODE:
+ case PSCI_1_1_FN64_SYSTEM_RESET2:
+ return psci_forward(host_ctxt);
+ case PSCI_1_0_FN64_SYSTEM_SUSPEND:
+ return psci_system_suspend(func_id, host_ctxt);
+ default:
+ return psci_0_2_handler(func_id, host_ctxt);
+ }
+}
+
+bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt)
+{
+ u64 func_id = get_psci_func_id(host_ctxt);
+ unsigned long ret;
+
+ if (!is_psci_call(func_id))
+ return false;
+
+ switch (kvm_host_psci_version) {
+ case PSCI_VERSION(0, 1):
+ ret = psci_0_1_handler(func_id, host_ctxt);
+ break;
+ case PSCI_VERSION(0, 2):
+ ret = psci_0_2_handler(func_id, host_ctxt);
+ break;
+ default:
+ ret = psci_1_0_handler(func_id, host_ctxt);
+ break;
+ }
+
+ cpu_reg(host_ctxt, 0) = ret;
+ cpu_reg(host_ctxt, 1) = 0;
+ cpu_reg(host_ctxt, 2) = 0;
+ cpu_reg(host_ctxt, 3) = 0;
+ return true;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 8ae8160bc93a..f3d0e9eca56c 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -4,6 +4,7 @@
* Author: Marc Zyngier <marc.zyngier@arm.com>
*/
+#include <hyp/adjust_pc.h>
#include <hyp/switch.h>
#include <hyp/sysreg-sr.h>
@@ -96,7 +97,10 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
mdcr_el2 |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT;
write_sysreg(mdcr_el2, mdcr_el2);
- write_sysreg(HCR_HOST_NVHE_FLAGS, hcr_el2);
+ if (is_protected_kvm_enabled())
+ write_sysreg(HCR_HOST_NVHE_PROTECTED_FLAGS, hcr_el2);
+ else
+ write_sysreg(HCR_HOST_NVHE_FLAGS, hcr_el2);
write_sysreg(CPTR_EL2_DEFAULT, cptr_el2);
write_sysreg(__kvm_hyp_host_vector, vbar_el2);
}
@@ -189,6 +193,8 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
__sysreg_save_state_nvhe(host_ctxt);
+ __adjust_pc(vcpu);
+
/*
* We must restore the 32-bit state before the sysregs, thanks
* to erratum #852523 (Cortex-A57) or #853709 (Cortex-A72).
diff --git a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c
index 88a25fc8fcd3..29305022bc04 100644
--- a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c
@@ -33,14 +33,3 @@ void __sysreg_restore_state_nvhe(struct kvm_cpu_context *ctxt)
__sysreg_restore_user_state(ctxt);
__sysreg_restore_el2_return_state(ctxt);
}
-
-void __kvm_enable_ssbs(void)
-{
- u64 tmp;
-
- asm volatile(
- "mrs %0, sctlr_el2\n"
- "orr %0, %0, %1\n"
- "msr sctlr_el2, %0"
- : "=&r" (tmp) : "L" (SCTLR_ELx_DSSBS));
-}
diff --git a/arch/arm64/kvm/hyp/smccc_wa.S b/arch/arm64/kvm/hyp/smccc_wa.S
deleted file mode 100644
index b0441dbdf68b..000000000000
--- a/arch/arm64/kvm/hyp/smccc_wa.S
+++ /dev/null
@@ -1,32 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2015-2018 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- */
-
-#include <linux/arm-smccc.h>
-#include <linux/linkage.h>
-
-#include <asm/kvm_asm.h>
-#include <asm/kvm_mmu.h>
-
- /*
- * This is not executed directly and is instead copied into the vectors
- * by install_bp_hardening_cb().
- */
- .data
- .pushsection .rodata
- .global __smccc_workaround_1_smc
-SYM_DATA_START(__smccc_workaround_1_smc)
- esb
- sub sp, sp, #(8 * 4)
- stp x2, x3, [sp, #(8 * 0)]
- stp x0, x1, [sp, #(8 * 2)]
- mov w0, #ARM_SMCCC_ARCH_WORKAROUND_1
- smc #0
- ldp x2, x3, [sp, #(8 * 0)]
- ldp x0, x1, [sp, #(8 * 2)]
- add sp, sp, #(8 * 4)
-1: .org __smccc_workaround_1_smc + __SMCCC_WORKAROUND_1_SMC_SZ
- .org 1b
-SYM_DATA_END(__smccc_workaround_1_smc)
diff --git a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
index bd1bab551d48..8f0585640241 100644
--- a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
+++ b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
@@ -4,6 +4,8 @@
* Author: Marc Zyngier <marc.zyngier@arm.com>
*/
+#include <hyp/adjust_pc.h>
+
#include <linux/compiler.h>
#include <linux/irqchip/arm-gic.h>
#include <linux/kvm_host.h>
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c
index 452f4cacd674..80406f463c28 100644
--- a/arch/arm64/kvm/hyp/vgic-v3-sr.c
+++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c
@@ -4,6 +4,8 @@
* Author: Marc Zyngier <marc.zyngier@arm.com>
*/
+#include <hyp/adjust_pc.h>
+
#include <linux/compiler.h>
#include <linux/irqchip/arm-gic-v3.h>
#include <linux/kvm_host.h>
diff --git a/arch/arm64/kvm/hyp/vhe/Makefile b/arch/arm64/kvm/hyp/vhe/Makefile
index 461e97c375cc..96bec0ecf9dd 100644
--- a/arch/arm64/kvm/hyp/vhe/Makefile
+++ b/arch/arm64/kvm/hyp/vhe/Makefile
@@ -8,4 +8,4 @@ ccflags-y := -D__KVM_VHE_HYPERVISOR__
obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o
obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
- ../fpsimd.o ../hyp-entry.o
+ ../fpsimd.o ../hyp-entry.o ../exception.o
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index 62546e20b251..af8e940d0f03 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -4,6 +4,7 @@
* Author: Marc Zyngier <marc.zyngier@arm.com>
*/
+#include <hyp/adjust_pc.h>
#include <hyp/switch.h>
#include <linux/arm-smccc.h>
@@ -133,6 +134,8 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
__load_guest_stage2(vcpu->arch.hw_mmu);
__activate_traps(vcpu);
+ __adjust_pc(vcpu);
+
sysreg_restore_guest_state_vhe(guest_ctxt);
__debug_switch_to_guest(vcpu);
diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c
index 34a96ab244fa..b47df73e98d7 100644
--- a/arch/arm64/kvm/inject_fault.c
+++ b/arch/arm64/kvm/inject_fault.c
@@ -14,119 +14,15 @@
#include <asm/kvm_emulate.h>
#include <asm/esr.h>
-#define CURRENT_EL_SP_EL0_VECTOR 0x0
-#define CURRENT_EL_SP_ELx_VECTOR 0x200
-#define LOWER_EL_AArch64_VECTOR 0x400
-#define LOWER_EL_AArch32_VECTOR 0x600
-
-enum exception_type {
- except_type_sync = 0,
- except_type_irq = 0x80,
- except_type_fiq = 0x100,
- except_type_serror = 0x180,
-};
-
-/*
- * This performs the exception entry at a given EL (@target_mode), stashing PC
- * and PSTATE into ELR and SPSR respectively, and compute the new PC/PSTATE.
- * The EL passed to this function *must* be a non-secure, privileged mode with
- * bit 0 being set (PSTATE.SP == 1).
- *
- * When an exception is taken, most PSTATE fields are left unchanged in the
- * handler. However, some are explicitly overridden (e.g. M[4:0]). Luckily all
- * of the inherited bits have the same position in the AArch64/AArch32 SPSR_ELx
- * layouts, so we don't need to shuffle these for exceptions from AArch32 EL0.
- *
- * For the SPSR_ELx layout for AArch64, see ARM DDI 0487E.a page C5-429.
- * For the SPSR_ELx layout for AArch32, see ARM DDI 0487E.a page C5-426.
- *
- * Here we manipulate the fields in order of the AArch64 SPSR_ELx layout, from
- * MSB to LSB.
- */
-static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode,
- enum exception_type type)
-{
- unsigned long sctlr, vbar, old, new, mode;
- u64 exc_offset;
-
- mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT);
-
- if (mode == target_mode)
- exc_offset = CURRENT_EL_SP_ELx_VECTOR;
- else if ((mode | PSR_MODE_THREAD_BIT) == target_mode)
- exc_offset = CURRENT_EL_SP_EL0_VECTOR;
- else if (!(mode & PSR_MODE32_BIT))
- exc_offset = LOWER_EL_AArch64_VECTOR;
- else
- exc_offset = LOWER_EL_AArch32_VECTOR;
-
- switch (target_mode) {
- case PSR_MODE_EL1h:
- vbar = vcpu_read_sys_reg(vcpu, VBAR_EL1);
- sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
- vcpu_write_sys_reg(vcpu, *vcpu_pc(vcpu), ELR_EL1);
- break;
- default:
- /* Don't do that */
- BUG();
- }
-
- *vcpu_pc(vcpu) = vbar + exc_offset + type;
-
- old = *vcpu_cpsr(vcpu);
- new = 0;
-
- new |= (old & PSR_N_BIT);
- new |= (old & PSR_Z_BIT);
- new |= (old & PSR_C_BIT);
- new |= (old & PSR_V_BIT);
-
- // TODO: TCO (if/when ARMv8.5-MemTag is exposed to guests)
-
- new |= (old & PSR_DIT_BIT);
-
- // PSTATE.UAO is set to zero upon any exception to AArch64
- // See ARM DDI 0487E.a, page D5-2579.
-
- // PSTATE.PAN is unchanged unless SCTLR_ELx.SPAN == 0b0
- // SCTLR_ELx.SPAN is RES1 when ARMv8.1-PAN is not implemented
- // See ARM DDI 0487E.a, page D5-2578.
- new |= (old & PSR_PAN_BIT);
- if (!(sctlr & SCTLR_EL1_SPAN))
- new |= PSR_PAN_BIT;
-
- // PSTATE.SS is set to zero upon any exception to AArch64
- // See ARM DDI 0487E.a, page D2-2452.
-
- // PSTATE.IL is set to zero upon any exception to AArch64
- // See ARM DDI 0487E.a, page D1-2306.
-
- // PSTATE.SSBS is set to SCTLR_ELx.DSSBS upon any exception to AArch64
- // See ARM DDI 0487E.a, page D13-3258
- if (sctlr & SCTLR_ELx_DSSBS)
- new |= PSR_SSBS_BIT;
-
- // PSTATE.BTYPE is set to zero upon any exception to AArch64
- // See ARM DDI 0487E.a, pages D1-2293 to D1-2294.
-
- new |= PSR_D_BIT;
- new |= PSR_A_BIT;
- new |= PSR_I_BIT;
- new |= PSR_F_BIT;
-
- new |= target_mode;
-
- *vcpu_cpsr(vcpu) = new;
- vcpu_write_spsr(vcpu, old);
-}
-
static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr)
{
unsigned long cpsr = *vcpu_cpsr(vcpu);
bool is_aarch32 = vcpu_mode_is_32bit(vcpu);
u32 esr = 0;
- enter_exception64(vcpu, PSR_MODE_EL1h, except_type_sync);
+ vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA64_EL1 |
+ KVM_ARM64_EXCEPT_AA64_ELx_SYNC |
+ KVM_ARM64_PENDING_EXCEPTION);
vcpu_write_sys_reg(vcpu, addr, FAR_EL1);
@@ -156,7 +52,9 @@ static void inject_undef64(struct kvm_vcpu *vcpu)
{
u32 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT);
- enter_exception64(vcpu, PSR_MODE_EL1h, except_type_sync);
+ vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA64_EL1 |
+ KVM_ARM64_EXCEPT_AA64_ELx_SYNC |
+ KVM_ARM64_PENDING_EXCEPTION);
/*
* Build an unknown exception, depending on the instruction
@@ -168,6 +66,53 @@ static void inject_undef64(struct kvm_vcpu *vcpu)
vcpu_write_sys_reg(vcpu, esr, ESR_EL1);
}
+#define DFSR_FSC_EXTABT_LPAE 0x10
+#define DFSR_FSC_EXTABT_nLPAE 0x08
+#define DFSR_LPAE BIT(9)
+#define TTBCR_EAE BIT(31)
+
+static void inject_undef32(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA32_UND |
+ KVM_ARM64_PENDING_EXCEPTION);
+}
+
+/*
+ * Modelled after TakeDataAbortException() and TakePrefetchAbortException
+ * pseudocode.
+ */
+static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, u32 addr)
+{
+ u64 far;
+ u32 fsr;
+
+ /* Give the guest an IMPLEMENTATION DEFINED exception */
+ if (vcpu_read_sys_reg(vcpu, TCR_EL1) & TTBCR_EAE) {
+ fsr = DFSR_LPAE | DFSR_FSC_EXTABT_LPAE;
+ } else {
+ /* no need to shuffle FS[4] into DFSR[10] as its 0 */
+ fsr = DFSR_FSC_EXTABT_nLPAE;
+ }
+
+ far = vcpu_read_sys_reg(vcpu, FAR_EL1);
+
+ if (is_pabt) {
+ vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA32_IABT |
+ KVM_ARM64_PENDING_EXCEPTION);
+ far &= GENMASK(31, 0);
+ far |= (u64)addr << 32;
+ vcpu_write_sys_reg(vcpu, fsr, IFSR32_EL2);
+ } else { /* !iabt */
+ vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA32_DABT |
+ KVM_ARM64_PENDING_EXCEPTION);
+ far &= GENMASK(63, 32);
+ far |= addr;
+ vcpu_write_sys_reg(vcpu, fsr, ESR_EL1);
+ }
+
+ vcpu_write_sys_reg(vcpu, far, FAR_EL1);
+}
+
/**
* kvm_inject_dabt - inject a data abort into the guest
* @vcpu: The VCPU to receive the data abort
@@ -179,7 +124,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu)
void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr)
{
if (vcpu_el1_is_32bit(vcpu))
- kvm_inject_dabt32(vcpu, addr);
+ inject_abt32(vcpu, false, addr);
else
inject_abt64(vcpu, false, addr);
}
@@ -195,7 +140,7 @@ void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr)
void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr)
{
if (vcpu_el1_is_32bit(vcpu))
- kvm_inject_pabt32(vcpu, addr);
+ inject_abt32(vcpu, true, addr);
else
inject_abt64(vcpu, true, addr);
}
@@ -210,7 +155,7 @@ void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr)
void kvm_inject_undefined(struct kvm_vcpu *vcpu)
{
if (vcpu_el1_is_32bit(vcpu))
- kvm_inject_undef32(vcpu);
+ inject_undef32(vcpu);
else
inject_undef64(vcpu);
}
diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c
index 6a2826f1bf5e..3e2d8ba11a02 100644
--- a/arch/arm64/kvm/mmio.c
+++ b/arch/arm64/kvm/mmio.c
@@ -115,7 +115,7 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu)
* The MMIO instruction is emulated and should not be re-executed
* in the guest.
*/
- kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
+ kvm_incr_pc(vcpu);
return 0;
}
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 75814a02d189..7d2257cc5438 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1023,7 +1023,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
* cautious, and skip the instruction.
*/
if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
- kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
+ kvm_incr_pc(vcpu);
ret = 1;
goto out_unlock;
}
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index 2ed5ef8f274b..398f6df1bbe4 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -384,7 +384,7 @@ static void kvm_pmu_update_state(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = &vcpu->arch.pmu;
bool overflow;
- if (!kvm_arm_pmu_v3_ready(vcpu))
+ if (!kvm_vcpu_has_pmu(vcpu))
return;
overflow = !!kvm_pmu_overflow_status(vcpu);
@@ -825,9 +825,12 @@ bool kvm_arm_support_pmu_v3(void)
int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
{
- if (!vcpu->arch.pmu.created)
+ if (!kvm_vcpu_has_pmu(vcpu))
return 0;
+ if (!vcpu->arch.pmu.created)
+ return -EINVAL;
+
/*
* A valid interrupt configuration for the PMU is either to have a
* properly configured interrupt number and using an in-kernel
@@ -835,9 +838,6 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
*/
if (irqchip_in_kernel(vcpu->kvm)) {
int irq = vcpu->arch.pmu.irq_num;
- if (!kvm_arm_pmu_irq_initialized(vcpu))
- return -EINVAL;
-
/*
* If we are using an in-kernel vgic, at this point we know
* the vgic will be initialized, so we can check the PMU irq
@@ -851,7 +851,6 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
}
kvm_pmu_vcpu_reset(vcpu);
- vcpu->arch.pmu.ready = true;
return 0;
}
@@ -913,8 +912,7 @@ static bool pmu_irq_is_valid(struct kvm *kvm, int irq)
int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
{
- if (!kvm_arm_support_pmu_v3() ||
- !test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
+ if (!kvm_vcpu_has_pmu(vcpu))
return -ENODEV;
if (vcpu->arch.pmu.created)
@@ -1015,7 +1013,7 @@ int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
if (!irqchip_in_kernel(vcpu->kvm))
return -EINVAL;
- if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
+ if (!kvm_vcpu_has_pmu(vcpu))
return -ENODEV;
if (!kvm_arm_pmu_irq_initialized(vcpu))
@@ -1035,8 +1033,7 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
case KVM_ARM_VCPU_PMU_V3_IRQ:
case KVM_ARM_VCPU_PMU_V3_INIT:
case KVM_ARM_VCPU_PMU_V3_FILTER:
- if (kvm_arm_support_pmu_v3() &&
- test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
+ if (kvm_vcpu_has_pmu(vcpu))
return 0;
}
diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c
index 920ac43077ad..78a09f7a6637 100644
--- a/arch/arm64/kvm/pvtime.c
+++ b/arch/arm64/kvm/pvtime.c
@@ -53,7 +53,6 @@ gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu)
struct pvclock_vcpu_stolen_time init_values = {};
struct kvm *kvm = vcpu->kvm;
u64 base = vcpu->arch.steal.base;
- int idx;
if (base == GPA_INVALID)
return base;
@@ -63,10 +62,7 @@ gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu)
* the feature enabled.
*/
vcpu->arch.steal.last_steal = current->sched_info.run_delay;
-
- idx = srcu_read_lock(&kvm->srcu);
- kvm_write_guest(kvm, base, &init_values, sizeof(init_values));
- srcu_read_unlock(&kvm->srcu, idx);
+ kvm_write_guest_lock(kvm, base, &init_values, sizeof(init_values));
return base;
}
diff --git a/arch/arm64/kvm/regmap.c b/arch/arm64/kvm/regmap.c
deleted file mode 100644
index accc1d5fba61..000000000000
--- a/arch/arm64/kvm/regmap.c
+++ /dev/null
@@ -1,224 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2012,2013 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * Derived from arch/arm/kvm/emulate.c:
- * Copyright (C) 2012 - Virtual Open Systems and Columbia University
- * Author: Christoffer Dall <c.dall@virtualopensystems.com>
- */
-
-#include <linux/mm.h>
-#include <linux/kvm_host.h>
-#include <asm/kvm_emulate.h>
-#include <asm/ptrace.h>
-
-#define VCPU_NR_MODES 6
-#define REG_OFFSET(_reg) \
- (offsetof(struct user_pt_regs, _reg) / sizeof(unsigned long))
-
-#define USR_REG_OFFSET(R) REG_OFFSET(compat_usr(R))
-
-static const unsigned long vcpu_reg_offsets[VCPU_NR_MODES][16] = {
- /* USR Registers */
- {
- USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2),
- USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5),
- USR_REG_OFFSET(6), USR_REG_OFFSET(7), USR_REG_OFFSET(8),
- USR_REG_OFFSET(9), USR_REG_OFFSET(10), USR_REG_OFFSET(11),
- USR_REG_OFFSET(12), USR_REG_OFFSET(13), USR_REG_OFFSET(14),
- REG_OFFSET(pc)
- },
-
- /* FIQ Registers */
- {
- USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2),
- USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5),
- USR_REG_OFFSET(6), USR_REG_OFFSET(7),
- REG_OFFSET(compat_r8_fiq), /* r8 */
- REG_OFFSET(compat_r9_fiq), /* r9 */
- REG_OFFSET(compat_r10_fiq), /* r10 */
- REG_OFFSET(compat_r11_fiq), /* r11 */
- REG_OFFSET(compat_r12_fiq), /* r12 */
- REG_OFFSET(compat_sp_fiq), /* r13 */
- REG_OFFSET(compat_lr_fiq), /* r14 */
- REG_OFFSET(pc)
- },
-
- /* IRQ Registers */
- {
- USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2),
- USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5),
- USR_REG_OFFSET(6), USR_REG_OFFSET(7), USR_REG_OFFSET(8),
- USR_REG_OFFSET(9), USR_REG_OFFSET(10), USR_REG_OFFSET(11),
- USR_REG_OFFSET(12),
- REG_OFFSET(compat_sp_irq), /* r13 */
- REG_OFFSET(compat_lr_irq), /* r14 */
- REG_OFFSET(pc)
- },
-
- /* SVC Registers */
- {
- USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2),
- USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5),
- USR_REG_OFFSET(6), USR_REG_OFFSET(7), USR_REG_OFFSET(8),
- USR_REG_OFFSET(9), USR_REG_OFFSET(10), USR_REG_OFFSET(11),
- USR_REG_OFFSET(12),
- REG_OFFSET(compat_sp_svc), /* r13 */
- REG_OFFSET(compat_lr_svc), /* r14 */
- REG_OFFSET(pc)
- },
-
- /* ABT Registers */
- {
- USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2),
- USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5),
- USR_REG_OFFSET(6), USR_REG_OFFSET(7), USR_REG_OFFSET(8),
- USR_REG_OFFSET(9), USR_REG_OFFSET(10), USR_REG_OFFSET(11),
- USR_REG_OFFSET(12),
- REG_OFFSET(compat_sp_abt), /* r13 */
- REG_OFFSET(compat_lr_abt), /* r14 */
- REG_OFFSET(pc)
- },
-
- /* UND Registers */
- {
- USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2),
- USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5),
- USR_REG_OFFSET(6), USR_REG_OFFSET(7), USR_REG_OFFSET(8),
- USR_REG_OFFSET(9), USR_REG_OFFSET(10), USR_REG_OFFSET(11),
- USR_REG_OFFSET(12),
- REG_OFFSET(compat_sp_und), /* r13 */
- REG_OFFSET(compat_lr_und), /* r14 */
- REG_OFFSET(pc)
- },
-};
-
-/*
- * Return a pointer to the register number valid in the current mode of
- * the virtual CPU.
- */
-unsigned long *vcpu_reg32(const struct kvm_vcpu *vcpu, u8 reg_num)
-{
- unsigned long *reg_array = (unsigned long *)&vcpu->arch.ctxt.regs;
- unsigned long mode = *vcpu_cpsr(vcpu) & PSR_AA32_MODE_MASK;
-
- switch (mode) {
- case PSR_AA32_MODE_USR ... PSR_AA32_MODE_SVC:
- mode &= ~PSR_MODE32_BIT; /* 0 ... 3 */
- break;
-
- case PSR_AA32_MODE_ABT:
- mode = 4;
- break;
-
- case PSR_AA32_MODE_UND:
- mode = 5;
- break;
-
- case PSR_AA32_MODE_SYS:
- mode = 0; /* SYS maps to USR */
- break;
-
- default:
- BUG();
- }
-
- return reg_array + vcpu_reg_offsets[mode][reg_num];
-}
-
-/*
- * Return the SPSR for the current mode of the virtual CPU.
- */
-static int vcpu_spsr32_mode(const struct kvm_vcpu *vcpu)
-{
- unsigned long mode = *vcpu_cpsr(vcpu) & PSR_AA32_MODE_MASK;
- switch (mode) {
- case PSR_AA32_MODE_SVC: return KVM_SPSR_SVC;
- case PSR_AA32_MODE_ABT: return KVM_SPSR_ABT;
- case PSR_AA32_MODE_UND: return KVM_SPSR_UND;
- case PSR_AA32_MODE_IRQ: return KVM_SPSR_IRQ;
- case PSR_AA32_MODE_FIQ: return KVM_SPSR_FIQ;
- default: BUG();
- }
-}
-
-unsigned long vcpu_read_spsr32(const struct kvm_vcpu *vcpu)
-{
- int spsr_idx = vcpu_spsr32_mode(vcpu);
-
- if (!vcpu->arch.sysregs_loaded_on_cpu) {
- switch (spsr_idx) {
- case KVM_SPSR_SVC:
- return __vcpu_sys_reg(vcpu, SPSR_EL1);
- case KVM_SPSR_ABT:
- return vcpu->arch.ctxt.spsr_abt;
- case KVM_SPSR_UND:
- return vcpu->arch.ctxt.spsr_und;
- case KVM_SPSR_IRQ:
- return vcpu->arch.ctxt.spsr_irq;
- case KVM_SPSR_FIQ:
- return vcpu->arch.ctxt.spsr_fiq;
- }
- }
-
- switch (spsr_idx) {
- case KVM_SPSR_SVC:
- return read_sysreg_el1(SYS_SPSR);
- case KVM_SPSR_ABT:
- return read_sysreg(spsr_abt);
- case KVM_SPSR_UND:
- return read_sysreg(spsr_und);
- case KVM_SPSR_IRQ:
- return read_sysreg(spsr_irq);
- case KVM_SPSR_FIQ:
- return read_sysreg(spsr_fiq);
- default:
- BUG();
- }
-}
-
-void vcpu_write_spsr32(struct kvm_vcpu *vcpu, unsigned long v)
-{
- int spsr_idx = vcpu_spsr32_mode(vcpu);
-
- if (!vcpu->arch.sysregs_loaded_on_cpu) {
- switch (spsr_idx) {
- case KVM_SPSR_SVC:
- __vcpu_sys_reg(vcpu, SPSR_EL1) = v;
- break;
- case KVM_SPSR_ABT:
- vcpu->arch.ctxt.spsr_abt = v;
- break;
- case KVM_SPSR_UND:
- vcpu->arch.ctxt.spsr_und = v;
- break;
- case KVM_SPSR_IRQ:
- vcpu->arch.ctxt.spsr_irq = v;
- break;
- case KVM_SPSR_FIQ:
- vcpu->arch.ctxt.spsr_fiq = v;
- break;
- }
-
- return;
- }
-
- switch (spsr_idx) {
- case KVM_SPSR_SVC:
- write_sysreg_el1(v, SYS_SPSR);
- break;
- case KVM_SPSR_ABT:
- write_sysreg(v, spsr_abt);
- break;
- case KVM_SPSR_UND:
- write_sysreg(v, spsr_und);
- break;
- case KVM_SPSR_IRQ:
- write_sysreg(v, spsr_irq);
- break;
- case KVM_SPSR_FIQ:
- write_sysreg(v, spsr_fiq);
- break;
- }
-}
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index f32490229a4c..47f3f035f3ea 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -25,7 +25,6 @@
#include <asm/ptrace.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_asm.h>
-#include <asm/kvm_coproc.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_mmu.h>
#include <asm/virt.h>
@@ -42,58 +41,6 @@ static u32 kvm_ipa_limit;
#define VCPU_RESET_PSTATE_SVC (PSR_AA32_MODE_SVC | PSR_AA32_A_BIT | \
PSR_AA32_I_BIT | PSR_AA32_F_BIT)
-static bool system_has_full_ptr_auth(void)
-{
- return system_supports_address_auth() && system_supports_generic_auth();
-}
-
-/**
- * kvm_arch_vm_ioctl_check_extension
- *
- * We currently assume that the number of HW registers is uniform
- * across all CPUs (see cpuinfo_sanity_check).
- */
-int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext)
-{
- int r;
-
- switch (ext) {
- case KVM_CAP_ARM_EL1_32BIT:
- r = cpus_have_const_cap(ARM64_HAS_32BIT_EL1);
- break;
- case KVM_CAP_GUEST_DEBUG_HW_BPS:
- r = get_num_brps();
- break;
- case KVM_CAP_GUEST_DEBUG_HW_WPS:
- r = get_num_wrps();
- break;
- case KVM_CAP_ARM_PMU_V3:
- r = kvm_arm_support_pmu_v3();
- break;
- case KVM_CAP_ARM_INJECT_SERROR_ESR:
- r = cpus_have_const_cap(ARM64_HAS_RAS_EXTN);
- break;
- case KVM_CAP_SET_GUEST_DEBUG:
- case KVM_CAP_VCPU_ATTRIBUTES:
- r = 1;
- break;
- case KVM_CAP_ARM_VM_IPA_SIZE:
- r = kvm_ipa_limit;
- break;
- case KVM_CAP_ARM_SVE:
- r = system_supports_sve();
- break;
- case KVM_CAP_ARM_PTRAUTH_ADDRESS:
- case KVM_CAP_ARM_PTRAUTH_GENERIC:
- r = system_has_full_ptr_auth();
- break;
- default:
- r = 0;
- }
-
- return r;
-}
-
unsigned int kvm_sve_max_vl;
int kvm_arm_init_sve(void)
@@ -286,6 +233,10 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
pstate = VCPU_RESET_PSTATE_EL1;
}
+ if (kvm_vcpu_has_pmu(vcpu) && !kvm_arm_support_pmu_v3()) {
+ ret = -EINVAL;
+ goto out;
+ }
break;
}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index c1fac9836af1..3313dedfa505 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -20,7 +20,6 @@
#include <asm/debug-monitors.h>
#include <asm/esr.h>
#include <asm/kvm_arm.h>
-#include <asm/kvm_coproc.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
@@ -64,87 +63,6 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu,
return false;
}
-static bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val)
-{
- /*
- * System registers listed in the switch are not saved on every
- * exit from the guest but are only saved on vcpu_put.
- *
- * Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but
- * should never be listed below, because the guest cannot modify its
- * own MPIDR_EL1 and MPIDR_EL1 is accessed for VCPU A from VCPU B's
- * thread when emulating cross-VCPU communication.
- */
- switch (reg) {
- case CSSELR_EL1: *val = read_sysreg_s(SYS_CSSELR_EL1); break;
- case SCTLR_EL1: *val = read_sysreg_s(SYS_SCTLR_EL12); break;
- case CPACR_EL1: *val = read_sysreg_s(SYS_CPACR_EL12); break;
- case TTBR0_EL1: *val = read_sysreg_s(SYS_TTBR0_EL12); break;
- case TTBR1_EL1: *val = read_sysreg_s(SYS_TTBR1_EL12); break;
- case TCR_EL1: *val = read_sysreg_s(SYS_TCR_EL12); break;
- case ESR_EL1: *val = read_sysreg_s(SYS_ESR_EL12); break;
- case AFSR0_EL1: *val = read_sysreg_s(SYS_AFSR0_EL12); break;
- case AFSR1_EL1: *val = read_sysreg_s(SYS_AFSR1_EL12); break;
- case FAR_EL1: *val = read_sysreg_s(SYS_FAR_EL12); break;
- case MAIR_EL1: *val = read_sysreg_s(SYS_MAIR_EL12); break;
- case VBAR_EL1: *val = read_sysreg_s(SYS_VBAR_EL12); break;
- case CONTEXTIDR_EL1: *val = read_sysreg_s(SYS_CONTEXTIDR_EL12);break;
- case TPIDR_EL0: *val = read_sysreg_s(SYS_TPIDR_EL0); break;
- case TPIDRRO_EL0: *val = read_sysreg_s(SYS_TPIDRRO_EL0); break;
- case TPIDR_EL1: *val = read_sysreg_s(SYS_TPIDR_EL1); break;
- case AMAIR_EL1: *val = read_sysreg_s(SYS_AMAIR_EL12); break;
- case CNTKCTL_EL1: *val = read_sysreg_s(SYS_CNTKCTL_EL12); break;
- case ELR_EL1: *val = read_sysreg_s(SYS_ELR_EL12); break;
- case PAR_EL1: *val = read_sysreg_par(); break;
- case DACR32_EL2: *val = read_sysreg_s(SYS_DACR32_EL2); break;
- case IFSR32_EL2: *val = read_sysreg_s(SYS_IFSR32_EL2); break;
- case DBGVCR32_EL2: *val = read_sysreg_s(SYS_DBGVCR32_EL2); break;
- default: return false;
- }
-
- return true;
-}
-
-static bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg)
-{
- /*
- * System registers listed in the switch are not restored on every
- * entry to the guest but are only restored on vcpu_load.
- *
- * Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but
- * should never be listed below, because the MPIDR should only be set
- * once, before running the VCPU, and never changed later.
- */
- switch (reg) {
- case CSSELR_EL1: write_sysreg_s(val, SYS_CSSELR_EL1); break;
- case SCTLR_EL1: write_sysreg_s(val, SYS_SCTLR_EL12); break;
- case CPACR_EL1: write_sysreg_s(val, SYS_CPACR_EL12); break;
- case TTBR0_EL1: write_sysreg_s(val, SYS_TTBR0_EL12); break;
- case TTBR1_EL1: write_sysreg_s(val, SYS_TTBR1_EL12); break;
- case TCR_EL1: write_sysreg_s(val, SYS_TCR_EL12); break;
- case ESR_EL1: write_sysreg_s(val, SYS_ESR_EL12); break;
- case AFSR0_EL1: write_sysreg_s(val, SYS_AFSR0_EL12); break;
- case AFSR1_EL1: write_sysreg_s(val, SYS_AFSR1_EL12); break;
- case FAR_EL1: write_sysreg_s(val, SYS_FAR_EL12); break;
- case MAIR_EL1: write_sysreg_s(val, SYS_MAIR_EL12); break;
- case VBAR_EL1: write_sysreg_s(val, SYS_VBAR_EL12); break;
- case CONTEXTIDR_EL1: write_sysreg_s(val, SYS_CONTEXTIDR_EL12);break;
- case TPIDR_EL0: write_sysreg_s(val, SYS_TPIDR_EL0); break;
- case TPIDRRO_EL0: write_sysreg_s(val, SYS_TPIDRRO_EL0); break;
- case TPIDR_EL1: write_sysreg_s(val, SYS_TPIDR_EL1); break;
- case AMAIR_EL1: write_sysreg_s(val, SYS_AMAIR_EL12); break;
- case CNTKCTL_EL1: write_sysreg_s(val, SYS_CNTKCTL_EL12); break;
- case ELR_EL1: write_sysreg_s(val, SYS_ELR_EL12); break;
- case PAR_EL1: write_sysreg_s(val, SYS_PAR_EL1); break;
- case DACR32_EL2: write_sysreg_s(val, SYS_DACR32_EL2); break;
- case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); break;
- case DBGVCR32_EL2: write_sysreg_s(val, SYS_DBGVCR32_EL2); break;
- default: return false;
- }
-
- return true;
-}
-
u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
{
u64 val = 0x8badf00d8badf00d;
@@ -169,7 +87,7 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
static u32 cache_levels;
/* CSSELR values; used to index KVM_REG_ARM_DEMUX_ID_CCSIDR */
-#define CSSELR_MAX 12
+#define CSSELR_MAX 14
/* Which cache CCSIDR represents depends on CSSELR value. */
static u32 get_ccsidr(u32 csselr)
@@ -209,6 +127,24 @@ static bool access_dcsw(struct kvm_vcpu *vcpu,
return true;
}
+static void get_access_mask(const struct sys_reg_desc *r, u64 *mask, u64 *shift)
+{
+ switch (r->aarch32_map) {
+ case AA32_LO:
+ *mask = GENMASK_ULL(31, 0);
+ *shift = 0;
+ break;
+ case AA32_HI:
+ *mask = GENMASK_ULL(63, 32);
+ *shift = 32;
+ break;
+ default:
+ *mask = GENMASK_ULL(63, 0);
+ *shift = 0;
+ break;
+ }
+}
+
/*
* Generic accessor for VM registers. Only called as long as HCR_TVM
* is set. If the guest enables the MMU, we stop trapping the VM
@@ -219,26 +155,21 @@ static bool access_vm_reg(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *r)
{
bool was_enabled = vcpu_has_cache_enabled(vcpu);
- u64 val;
- int reg = r->reg;
+ u64 val, mask, shift;
BUG_ON(!p->is_write);
- /* See the 32bit mapping in kvm_host.h */
- if (p->is_aarch32)
- reg = r->reg / 2;
+ get_access_mask(r, &mask, &shift);
- if (!p->is_aarch32 || !p->is_32bit) {
- val = p->regval;
+ if (~mask) {
+ val = vcpu_read_sys_reg(vcpu, r->reg);
+ val &= ~mask;
} else {
- val = vcpu_read_sys_reg(vcpu, reg);
- if (r->reg % 2)
- val = (p->regval << 32) | (u64)lower_32_bits(val);
- else
- val = ((u64)upper_32_bits(val) << 32) |
- lower_32_bits(p->regval);
+ val = 0;
}
- vcpu_write_sys_reg(vcpu, val, reg);
+
+ val |= (p->regval & (mask >> shift)) << shift;
+ vcpu_write_sys_reg(vcpu, val, r->reg);
kvm_toggle_cache(vcpu, was_enabled);
return true;
@@ -248,17 +179,13 @@ static bool access_actlr(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
+ u64 mask, shift;
+
if (p->is_write)
return ignore_write(vcpu, p);
- p->regval = vcpu_read_sys_reg(vcpu, ACTLR_EL1);
-
- if (p->is_aarch32) {
- if (r->Op2 & 2)
- p->regval = upper_32_bits(p->regval);
- else
- p->regval = lower_32_bits(p->regval);
- }
+ get_access_mask(r, &mask, &shift);
+ p->regval = (vcpu_read_sys_reg(vcpu, r->reg) & mask) >> shift;
return true;
}
@@ -285,7 +212,7 @@ static bool access_gic_sgi(struct kvm_vcpu *vcpu,
* equivalent to ICC_SGI0R_EL1, as there is no "alternative" secure
* group.
*/
- if (p->is_aarch32) {
+ if (p->Op0 == 0) { /* AArch32 */
switch (p->Op1) {
default: /* Keep GCC quiet */
case 0: /* ICC_SGI1R */
@@ -296,7 +223,7 @@ static bool access_gic_sgi(struct kvm_vcpu *vcpu,
g1 = false;
break;
}
- } else {
+ } else { /* AArch64 */
switch (p->Op2) {
default: /* Keep GCC quiet */
case 5: /* ICC_SGI1R_EL1 */
@@ -438,26 +365,30 @@ static bool trap_debug_regs(struct kvm_vcpu *vcpu,
*/
static void reg_to_dbg(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
+ const struct sys_reg_desc *rd,
u64 *dbg_reg)
{
- u64 val = p->regval;
+ u64 mask, shift, val;
- if (p->is_32bit) {
- val &= 0xffffffffUL;
- val |= ((*dbg_reg >> 32) << 32);
- }
+ get_access_mask(rd, &mask, &shift);
+ val = *dbg_reg;
+ val &= ~mask;
+ val |= (p->regval & (mask >> shift)) << shift;
*dbg_reg = val;
+
vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY;
}
static void dbg_to_reg(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
+ const struct sys_reg_desc *rd,
u64 *dbg_reg)
{
- p->regval = *dbg_reg;
- if (p->is_32bit)
- p->regval &= 0xffffffffUL;
+ u64 mask, shift;
+
+ get_access_mask(rd, &mask, &shift);
+ p->regval = (*dbg_reg & mask) >> shift;
}
static bool trap_bvr(struct kvm_vcpu *vcpu,
@@ -467,9 +398,9 @@ static bool trap_bvr(struct kvm_vcpu *vcpu,
u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg];
if (p->is_write)
- reg_to_dbg(vcpu, p, dbg_reg);
+ reg_to_dbg(vcpu, p, rd, dbg_reg);
else
- dbg_to_reg(vcpu, p, dbg_reg);
+ dbg_to_reg(vcpu, p, rd, dbg_reg);
trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg);
@@ -509,9 +440,9 @@ static bool trap_bcr(struct kvm_vcpu *vcpu,
u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg];
if (p->is_write)
- reg_to_dbg(vcpu, p, dbg_reg);
+ reg_to_dbg(vcpu, p, rd, dbg_reg);
else
- dbg_to_reg(vcpu, p, dbg_reg);
+ dbg_to_reg(vcpu, p, rd, dbg_reg);
trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg);
@@ -552,9 +483,9 @@ static bool trap_wvr(struct kvm_vcpu *vcpu,
u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg];
if (p->is_write)
- reg_to_dbg(vcpu, p, dbg_reg);
+ reg_to_dbg(vcpu, p, rd, dbg_reg);
else
- dbg_to_reg(vcpu, p, dbg_reg);
+ dbg_to_reg(vcpu, p, rd, dbg_reg);
trace_trap_reg(__func__, rd->reg, p->is_write,
vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg]);
@@ -595,9 +526,9 @@ static bool trap_wcr(struct kvm_vcpu *vcpu,
u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg];
if (p->is_write)
- reg_to_dbg(vcpu, p, dbg_reg);
+ reg_to_dbg(vcpu, p, rd, dbg_reg);
else
- dbg_to_reg(vcpu, p, dbg_reg);
+ dbg_to_reg(vcpu, p, rd, dbg_reg);
trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg);
@@ -678,8 +609,9 @@ static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
static bool check_pmu_access_disabled(struct kvm_vcpu *vcpu, u64 flags)
{
u64 reg = __vcpu_sys_reg(vcpu, PMUSERENR_EL0);
- bool enabled = (reg & flags) || vcpu_mode_priv(vcpu);
+ bool enabled = kvm_vcpu_has_pmu(vcpu);
+ enabled &= (reg & flags) || vcpu_mode_priv(vcpu);
if (!enabled)
kvm_inject_undefined(vcpu);
@@ -711,9 +643,6 @@ static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
{
u64 val;
- if (!kvm_arm_pmu_v3_ready(vcpu))
- return trap_raz_wi(vcpu, p, r);
-
if (pmu_access_el0_disabled(vcpu))
return false;
@@ -740,9 +669,6 @@ static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
static bool access_pmselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
- if (!kvm_arm_pmu_v3_ready(vcpu))
- return trap_raz_wi(vcpu, p, r);
-
if (pmu_access_event_counter_el0_disabled(vcpu))
return false;
@@ -761,9 +687,6 @@ static bool access_pmceid(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
{
u64 pmceid;
- if (!kvm_arm_pmu_v3_ready(vcpu))
- return trap_raz_wi(vcpu, p, r);
-
BUG_ON(p->is_write);
if (pmu_access_el0_disabled(vcpu))
@@ -794,10 +717,7 @@ static bool access_pmu_evcntr(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
- u64 idx;
-
- if (!kvm_arm_pmu_v3_ready(vcpu))
- return trap_raz_wi(vcpu, p, r);
+ u64 idx = ~0UL;
if (r->CRn == 9 && r->CRm == 13) {
if (r->Op2 == 2) {
@@ -813,8 +733,6 @@ static bool access_pmu_evcntr(struct kvm_vcpu *vcpu,
return false;
idx = ARMV8_PMU_CYCLE_IDX;
- } else {
- return false;
}
} else if (r->CRn == 0 && r->CRm == 9) {
/* PMCCNTR */
@@ -828,10 +746,11 @@ static bool access_pmu_evcntr(struct kvm_vcpu *vcpu,
return false;
idx = ((r->CRm & 3) << 3) | (r->Op2 & 7);
- } else {
- return false;
}
+ /* Catch any decoding mistake */
+ WARN_ON(idx == ~0UL);
+
if (!pmu_counter_idx_valid(vcpu, idx))
return false;
@@ -852,9 +771,6 @@ static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
{
u64 idx, reg;
- if (!kvm_arm_pmu_v3_ready(vcpu))
- return trap_raz_wi(vcpu, p, r);
-
if (pmu_access_el0_disabled(vcpu))
return false;
@@ -892,9 +808,6 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
{
u64 val, mask;
- if (!kvm_arm_pmu_v3_ready(vcpu))
- return trap_raz_wi(vcpu, p, r);
-
if (pmu_access_el0_disabled(vcpu))
return false;
@@ -923,13 +836,8 @@ static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
{
u64 mask = kvm_pmu_valid_counter_mask(vcpu);
- if (!kvm_arm_pmu_v3_ready(vcpu))
- return trap_raz_wi(vcpu, p, r);
-
- if (!vcpu_mode_priv(vcpu)) {
- kvm_inject_undefined(vcpu);
+ if (check_pmu_access_disabled(vcpu, 0))
return false;
- }
if (p->is_write) {
u64 val = p->regval & mask;
@@ -952,9 +860,6 @@ static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
{
u64 mask = kvm_pmu_valid_counter_mask(vcpu);
- if (!kvm_arm_pmu_v3_ready(vcpu))
- return trap_raz_wi(vcpu, p, r);
-
if (pmu_access_el0_disabled(vcpu))
return false;
@@ -977,9 +882,6 @@ static bool access_pmswinc(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
{
u64 mask;
- if (!kvm_arm_pmu_v3_ready(vcpu))
- return trap_raz_wi(vcpu, p, r);
-
if (!p->is_write)
return read_from_write_only(vcpu, p, r);
@@ -994,8 +896,10 @@ static bool access_pmswinc(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
- if (!kvm_arm_pmu_v3_ready(vcpu))
- return trap_raz_wi(vcpu, p, r);
+ if (!kvm_vcpu_has_pmu(vcpu)) {
+ kvm_inject_undefined(vcpu);
+ return false;
+ }
if (p->is_write) {
if (!vcpu_mode_priv(vcpu)) {
@@ -1122,6 +1026,8 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu,
val &= ~(0xfUL << ID_AA64PFR0_AMU_SHIFT);
val &= ~(0xfUL << ID_AA64PFR0_CSV2_SHIFT);
val |= ((u64)vcpu->kvm->arch.pfr0_csv2 << ID_AA64PFR0_CSV2_SHIFT);
+ val &= ~(0xfUL << ID_AA64PFR0_CSV3_SHIFT);
+ val |= ((u64)vcpu->kvm->arch.pfr0_csv3 << ID_AA64PFR0_CSV3_SHIFT);
} else if (id == SYS_ID_AA64PFR1_EL1) {
val &= ~(0xfUL << ID_AA64PFR1_MTE_SHIFT);
} else if (id == SYS_ID_AA64ISAR1_EL1 && !vcpu_has_ptrauth(vcpu)) {
@@ -1130,10 +1036,15 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu,
(0xfUL << ID_AA64ISAR1_GPA_SHIFT) |
(0xfUL << ID_AA64ISAR1_GPI_SHIFT));
} else if (id == SYS_ID_AA64DFR0_EL1) {
+ u64 cap = 0;
+
/* Limit guests to PMUv3 for ARMv8.1 */
+ if (kvm_vcpu_has_pmu(vcpu))
+ cap = ID_AA64DFR0_PMUVER_8_1;
+
val = cpuid_feature_cap_perfmon_field(val,
ID_AA64DFR0_PMUVER_SHIFT,
- ID_AA64DFR0_PMUVER_8_1);
+ cap);
} else if (id == SYS_ID_DFR0_EL1) {
/* Limit guests to PMUv3 for ARMv8.1 */
val = cpuid_feature_cap_perfmon_field(val,
@@ -1209,9 +1120,9 @@ static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu,
const struct kvm_one_reg *reg, void __user *uaddr)
{
const u64 id = sys_reg_to_index(rd);
+ u8 csv2, csv3;
int err;
u64 val;
- u8 csv2;
err = reg_from_user(&val, uaddr, id);
if (err)
@@ -1227,13 +1138,21 @@ static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu,
(csv2 && arm64_get_spectre_v2_state() != SPECTRE_UNAFFECTED))
return -EINVAL;
- /* We can only differ with CSV2, and anything else is an error */
+ /* Same thing for CSV3 */
+ csv3 = cpuid_feature_extract_unsigned_field(val, ID_AA64PFR0_CSV3_SHIFT);
+ if (csv3 > 1 ||
+ (csv3 && arm64_get_meltdown_state() != SPECTRE_UNAFFECTED))
+ return -EINVAL;
+
+ /* We can only differ with CSV[23], and anything else is an error */
val ^= read_id_reg(vcpu, rd, false);
- val &= ~(0xFUL << ID_AA64PFR0_CSV2_SHIFT);
+ val &= ~((0xFUL << ID_AA64PFR0_CSV2_SHIFT) |
+ (0xFUL << ID_AA64PFR0_CSV3_SHIFT));
if (val)
return -EINVAL;
vcpu->kvm->arch.pfr0_csv2 = csv2;
+ vcpu->kvm->arch.pfr0_csv3 = csv3 ;
return 0;
}
@@ -1327,10 +1246,6 @@ static bool access_csselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
{
int reg = r->reg;
- /* See the 32bit mapping in kvm_host.h */
- if (p->is_aarch32)
- reg = r->reg / 2;
-
if (p->is_write)
vcpu_write_sys_reg(vcpu, p->regval, reg);
else
@@ -1801,66 +1716,27 @@ static bool trap_dbgidr(struct kvm_vcpu *vcpu,
}
}
-static bool trap_debug32(struct kvm_vcpu *vcpu,
- struct sys_reg_params *p,
- const struct sys_reg_desc *r)
-{
- if (p->is_write) {
- vcpu_cp14(vcpu, r->reg) = p->regval;
- vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY;
- } else {
- p->regval = vcpu_cp14(vcpu, r->reg);
- }
-
- return true;
-}
-
-/* AArch32 debug register mappings
+/*
+ * AArch32 debug register mappings
*
* AArch32 DBGBVRn is mapped to DBGBVRn_EL1[31:0]
* AArch32 DBGBXVRn is mapped to DBGBVRn_EL1[63:32]
*
- * All control registers and watchpoint value registers are mapped to
- * the lower 32 bits of their AArch64 equivalents. We share the trap
- * handlers with the above AArch64 code which checks what mode the
- * system is in.
+ * None of the other registers share their location, so treat them as
+ * if they were 64bit.
*/
-
-static bool trap_xvr(struct kvm_vcpu *vcpu,
- struct sys_reg_params *p,
- const struct sys_reg_desc *rd)
-{
- u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg];
-
- if (p->is_write) {
- u64 val = *dbg_reg;
-
- val &= 0xffffffffUL;
- val |= p->regval << 32;
- *dbg_reg = val;
-
- vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY;
- } else {
- p->regval = *dbg_reg >> 32;
- }
-
- trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg);
-
- return true;
-}
-
-#define DBG_BCR_BVR_WCR_WVR(n) \
- /* DBGBVRn */ \
- { Op1( 0), CRn( 0), CRm((n)), Op2( 4), trap_bvr, NULL, n }, \
- /* DBGBCRn */ \
- { Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_bcr, NULL, n }, \
- /* DBGWVRn */ \
- { Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_wvr, NULL, n }, \
- /* DBGWCRn */ \
+#define DBG_BCR_BVR_WCR_WVR(n) \
+ /* DBGBVRn */ \
+ { AA32(LO), Op1( 0), CRn( 0), CRm((n)), Op2( 4), trap_bvr, NULL, n }, \
+ /* DBGBCRn */ \
+ { Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_bcr, NULL, n }, \
+ /* DBGWVRn */ \
+ { Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_wvr, NULL, n }, \
+ /* DBGWCRn */ \
{ Op1( 0), CRn( 0), CRm((n)), Op2( 7), trap_wcr, NULL, n }
-#define DBGBXVR(n) \
- { Op1( 0), CRn( 1), CRm((n)), Op2( 1), trap_xvr, NULL, n }
+#define DBGBXVR(n) \
+ { AA32(HI), Op1( 0), CRn( 1), CRm((n)), Op2( 1), trap_bvr, NULL, n }
/*
* Trapped cp14 registers. We generally ignore most of the external
@@ -1878,9 +1754,9 @@ static const struct sys_reg_desc cp14_regs[] = {
{ Op1( 0), CRn( 0), CRm( 1), Op2( 0), trap_raz_wi },
DBG_BCR_BVR_WCR_WVR(1),
/* DBGDCCINT */
- { Op1( 0), CRn( 0), CRm( 2), Op2( 0), trap_debug32, NULL, cp14_DBGDCCINT },
+ { Op1( 0), CRn( 0), CRm( 2), Op2( 0), trap_debug_regs, NULL, MDCCINT_EL1 },
/* DBGDSCRext */
- { Op1( 0), CRn( 0), CRm( 2), Op2( 2), trap_debug32, NULL, cp14_DBGDSCRext },
+ { Op1( 0), CRn( 0), CRm( 2), Op2( 2), trap_debug_regs, NULL, MDSCR_EL1 },
DBG_BCR_BVR_WCR_WVR(2),
/* DBGDTR[RT]Xint */
{ Op1( 0), CRn( 0), CRm( 3), Op2( 0), trap_raz_wi },
@@ -1895,7 +1771,7 @@ static const struct sys_reg_desc cp14_regs[] = {
{ Op1( 0), CRn( 0), CRm( 6), Op2( 2), trap_raz_wi },
DBG_BCR_BVR_WCR_WVR(6),
/* DBGVCR */
- { Op1( 0), CRn( 0), CRm( 7), Op2( 0), trap_debug32, NULL, cp14_DBGVCR },
+ { Op1( 0), CRn( 0), CRm( 7), Op2( 0), trap_debug_regs, NULL, DBGVCR32_EL2 },
DBG_BCR_BVR_WCR_WVR(7),
DBG_BCR_BVR_WCR_WVR(8),
DBG_BCR_BVR_WCR_WVR(9),
@@ -1981,19 +1857,29 @@ static const struct sys_reg_desc cp14_64_regs[] = {
*/
static const struct sys_reg_desc cp15_regs[] = {
{ Op1( 0), CRn( 0), CRm( 0), Op2( 1), access_ctr },
- { Op1( 0), CRn( 1), CRm( 0), Op2( 0), access_vm_reg, NULL, c1_SCTLR },
- { Op1( 0), CRn( 1), CRm( 0), Op2( 1), access_actlr },
- { Op1( 0), CRn( 1), CRm( 0), Op2( 3), access_actlr },
- { Op1( 0), CRn( 2), CRm( 0), Op2( 0), access_vm_reg, NULL, c2_TTBR0 },
- { Op1( 0), CRn( 2), CRm( 0), Op2( 1), access_vm_reg, NULL, c2_TTBR1 },
- { Op1( 0), CRn( 2), CRm( 0), Op2( 2), access_vm_reg, NULL, c2_TTBCR },
- { Op1( 0), CRn( 3), CRm( 0), Op2( 0), access_vm_reg, NULL, c3_DACR },
- { Op1( 0), CRn( 5), CRm( 0), Op2( 0), access_vm_reg, NULL, c5_DFSR },
- { Op1( 0), CRn( 5), CRm( 0), Op2( 1), access_vm_reg, NULL, c5_IFSR },
- { Op1( 0), CRn( 5), CRm( 1), Op2( 0), access_vm_reg, NULL, c5_ADFSR },
- { Op1( 0), CRn( 5), CRm( 1), Op2( 1), access_vm_reg, NULL, c5_AIFSR },
- { Op1( 0), CRn( 6), CRm( 0), Op2( 0), access_vm_reg, NULL, c6_DFAR },
- { Op1( 0), CRn( 6), CRm( 0), Op2( 2), access_vm_reg, NULL, c6_IFAR },
+ { Op1( 0), CRn( 1), CRm( 0), Op2( 0), access_vm_reg, NULL, SCTLR_EL1 },
+ /* ACTLR */
+ { AA32(LO), Op1( 0), CRn( 1), CRm( 0), Op2( 1), access_actlr, NULL, ACTLR_EL1 },
+ /* ACTLR2 */
+ { AA32(HI), Op1( 0), CRn( 1), CRm( 0), Op2( 3), access_actlr, NULL, ACTLR_EL1 },
+ { Op1( 0), CRn( 2), CRm( 0), Op2( 0), access_vm_reg, NULL, TTBR0_EL1 },
+ { Op1( 0), CRn( 2), CRm( 0), Op2( 1), access_vm_reg, NULL, TTBR1_EL1 },
+ /* TTBCR */
+ { AA32(LO), Op1( 0), CRn( 2), CRm( 0), Op2( 2), access_vm_reg, NULL, TCR_EL1 },
+ /* TTBCR2 */
+ { AA32(HI), Op1( 0), CRn( 2), CRm( 0), Op2( 3), access_vm_reg, NULL, TCR_EL1 },
+ { Op1( 0), CRn( 3), CRm( 0), Op2( 0), access_vm_reg, NULL, DACR32_EL2 },
+ /* DFSR */
+ { Op1( 0), CRn( 5), CRm( 0), Op2( 0), access_vm_reg, NULL, ESR_EL1 },
+ { Op1( 0), CRn( 5), CRm( 0), Op2( 1), access_vm_reg, NULL, IFSR32_EL2 },
+ /* ADFSR */
+ { Op1( 0), CRn( 5), CRm( 1), Op2( 0), access_vm_reg, NULL, AFSR0_EL1 },
+ /* AIFSR */
+ { Op1( 0), CRn( 5), CRm( 1), Op2( 1), access_vm_reg, NULL, AFSR1_EL1 },
+ /* DFAR */
+ { AA32(LO), Op1( 0), CRn( 6), CRm( 0), Op2( 0), access_vm_reg, NULL, FAR_EL1 },
+ /* IFAR */
+ { AA32(HI), Op1( 0), CRn( 6), CRm( 0), Op2( 2), access_vm_reg, NULL, FAR_EL1 },
/*
* DC{C,I,CI}SW operations:
@@ -2019,15 +1905,19 @@ static const struct sys_reg_desc cp15_regs[] = {
{ Op1( 0), CRn( 9), CRm(14), Op2( 2), access_pminten },
{ Op1( 0), CRn( 9), CRm(14), Op2( 3), access_pmovs },
- { Op1( 0), CRn(10), CRm( 2), Op2( 0), access_vm_reg, NULL, c10_PRRR },
- { Op1( 0), CRn(10), CRm( 2), Op2( 1), access_vm_reg, NULL, c10_NMRR },
- { Op1( 0), CRn(10), CRm( 3), Op2( 0), access_vm_reg, NULL, c10_AMAIR0 },
- { Op1( 0), CRn(10), CRm( 3), Op2( 1), access_vm_reg, NULL, c10_AMAIR1 },
+ /* PRRR/MAIR0 */
+ { AA32(LO), Op1( 0), CRn(10), CRm( 2), Op2( 0), access_vm_reg, NULL, MAIR_EL1 },
+ /* NMRR/MAIR1 */
+ { AA32(HI), Op1( 0), CRn(10), CRm( 2), Op2( 1), access_vm_reg, NULL, MAIR_EL1 },
+ /* AMAIR0 */
+ { AA32(LO), Op1( 0), CRn(10), CRm( 3), Op2( 0), access_vm_reg, NULL, AMAIR_EL1 },
+ /* AMAIR1 */
+ { AA32(HI), Op1( 0), CRn(10), CRm( 3), Op2( 1), access_vm_reg, NULL, AMAIR_EL1 },
/* ICC_SRE */
{ Op1( 0), CRn(12), CRm(12), Op2( 5), access_gic_sre },
- { Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, c13_CID },
+ { Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, CONTEXTIDR_EL1 },
/* Arch Tmers */
{ SYS_DESC(SYS_AARCH32_CNTP_TVAL), access_arch_timer },
@@ -2102,14 +1992,14 @@ static const struct sys_reg_desc cp15_regs[] = {
{ Op1(1), CRn( 0), CRm( 0), Op2(0), access_ccsidr },
{ Op1(1), CRn( 0), CRm( 0), Op2(1), access_clidr },
- { Op1(2), CRn( 0), CRm( 0), Op2(0), access_csselr, NULL, c0_CSSELR },
+ { Op1(2), CRn( 0), CRm( 0), Op2(0), access_csselr, NULL, CSSELR_EL1 },
};
static const struct sys_reg_desc cp15_64_regs[] = {
- { Op1( 0), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR0 },
+ { Op1( 0), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR0_EL1 },
{ Op1( 0), CRn( 0), CRm( 9), Op2( 0), access_pmu_evcntr },
{ Op1( 0), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI1R */
- { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR1 },
+ { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR1_EL1 },
{ Op1( 1), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_ASGI1R */
{ Op1( 2), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI0R */
{ SYS_DESC(SYS_AARCH32_CNTP_CVAL), access_arch_timer },
@@ -2180,7 +2070,7 @@ static void perform_access(struct kvm_vcpu *vcpu,
/* Skip instruction if instructed so */
if (likely(r->access(vcpu, params, r)))
- kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
+ kvm_incr_pc(vcpu);
}
/*
@@ -2253,8 +2143,6 @@ static int kvm_handle_cp_64(struct kvm_vcpu *vcpu,
int Rt = kvm_vcpu_sys_get_rt(vcpu);
int Rt2 = (esr >> 10) & 0x1f;
- params.is_aarch32 = true;
- params.is_32bit = false;
params.CRm = (esr >> 1) & 0xf;
params.is_write = ((esr & 1) == 0);
@@ -2304,8 +2192,6 @@ static int kvm_handle_cp_32(struct kvm_vcpu *vcpu,
u32 esr = kvm_vcpu_get_esr(vcpu);
int Rt = kvm_vcpu_sys_get_rt(vcpu);
- params.is_aarch32 = true;
- params.is_32bit = true;
params.CRm = (esr >> 1) & 0xf;
params.regval = vcpu_get_reg(vcpu, Rt);
params.is_write = ((esr & 1) == 0);
@@ -2399,8 +2285,6 @@ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu)
trace_kvm_handle_sys_reg(esr);
- params.is_aarch32 = false;
- params.is_32bit = false;
params.Op0 = (esr >> 20) & 3;
params.Op1 = (esr >> 14) & 0x7;
params.CRn = (esr >> 10) & 0xf;
diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h
index 0f95964339b1..9d0621417c2a 100644
--- a/arch/arm64/kvm/sys_regs.h
+++ b/arch/arm64/kvm/sys_regs.h
@@ -19,14 +19,18 @@ struct sys_reg_params {
u8 Op2;
u64 regval;
bool is_write;
- bool is_aarch32;
- bool is_32bit; /* Only valid if is_aarch32 is true */
};
struct sys_reg_desc {
/* Sysreg string for debug */
const char *name;
+ enum {
+ AA32_ZEROHIGH,
+ AA32_LO,
+ AA32_HI,
+ } aarch32_map;
+
/* MRS/MSR instruction which accesses it. */
u8 Op0;
u8 Op1;
@@ -153,6 +157,7 @@ const struct sys_reg_desc *find_reg_by_id(u64 id,
const struct sys_reg_desc table[],
unsigned int num);
+#define AA32(_x) .aarch32_map = AA32_##_x
#define Op0(_x) .Op0 = _x
#define Op1(_x) .Op1 = _x
#define CRn(_x) .CRn = _x
diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c
index e0404bcab019..d8cc51bd60bf 100644
--- a/arch/arm64/kvm/va_layout.c
+++ b/arch/arm64/kvm/va_layout.c
@@ -11,6 +11,7 @@
#include <asm/debug-monitors.h>
#include <asm/insn.h>
#include <asm/kvm_mmu.h>
+#include <asm/memory.h>
/*
* The LSB of the HYP VA tag
@@ -23,6 +24,30 @@ static u64 tag_val;
static u64 va_mask;
/*
+ * Compute HYP VA by using the same computation as kern_hyp_va().
+ */
+static u64 __early_kern_hyp_va(u64 addr)
+{
+ addr &= va_mask;
+ addr |= tag_val << tag_lsb;
+ return addr;
+}
+
+/*
+ * Store a hyp VA <-> PA offset into a hyp-owned variable.
+ */
+static void init_hyp_physvirt_offset(void)
+{
+ extern s64 kvm_nvhe_sym(hyp_physvirt_offset);
+ u64 kern_va, hyp_va;
+
+ /* Compute the offset from the hyp VA and PA of a random symbol. */
+ kern_va = (u64)kvm_ksym_ref(__hyp_text_start);
+ hyp_va = __early_kern_hyp_va(kern_va);
+ CHOOSE_NVHE_SYM(hyp_physvirt_offset) = (s64)__pa(kern_va) - (s64)hyp_va;
+}
+
+/*
* We want to generate a hyp VA with the following format (with V ==
* vabits_actual):
*
@@ -53,6 +78,8 @@ __init void kvm_compute_layout(void)
tag_val |= get_random_long() & GENMASK_ULL(vabits_actual - 2, tag_lsb);
}
tag_val >>= tag_lsb;
+
+ init_hyp_physvirt_offset();
}
static u32 compute_instruction(int n, u32 rd, u32 rn)
@@ -131,28 +158,21 @@ void __init kvm_update_va_mask(struct alt_instr *alt,
}
}
-void *__kvm_bp_vect_base;
-int __kvm_harden_el2_vector_slot;
-
void kvm_patch_vector_branch(struct alt_instr *alt,
__le32 *origptr, __le32 *updptr, int nr_inst)
{
u64 addr;
u32 insn;
- BUG_ON(nr_inst != 5);
+ BUG_ON(nr_inst != 4);
- if (has_vhe() || !cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS)) {
- WARN_ON_ONCE(cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS));
+ if (!cpus_have_const_cap(ARM64_SPECTRE_V3A) || WARN_ON_ONCE(has_vhe()))
return;
- }
/*
* Compute HYP VA by using the same computation as kern_hyp_va()
*/
- addr = (uintptr_t)kvm_ksym_ref(__kvm_hyp_vector);
- addr &= va_mask;
- addr |= tag_val << tag_lsb;
+ addr = __early_kern_hyp_va((u64)kvm_ksym_ref(__kvm_hyp_vector));
/* Use PC[10:7] to branch to the same vector in KVM */
addr |= ((u64)origptr & GENMASK_ULL(10, 7));
@@ -163,15 +183,6 @@ void kvm_patch_vector_branch(struct alt_instr *alt,
*/
addr += KVM_VECTOR_PREAMBLE;
- /* stp x0, x1, [sp, #-16]! */
- insn = aarch64_insn_gen_load_store_pair(AARCH64_INSN_REG_0,
- AARCH64_INSN_REG_1,
- AARCH64_INSN_REG_SP,
- -16,
- AARCH64_INSN_VARIANT_64BIT,
- AARCH64_INSN_LDST_STORE_PAIR_PRE_INDEX);
- *updptr++ = cpu_to_le32(insn);
-
/* movz x0, #(addr & 0xffff) */
insn = aarch64_insn_gen_movewide(AARCH64_INSN_REG_0,
(u16)addr,
@@ -201,3 +212,58 @@ void kvm_patch_vector_branch(struct alt_instr *alt,
AARCH64_INSN_BRANCH_NOLINK);
*updptr++ = cpu_to_le32(insn);
}
+
+static void generate_mov_q(u64 val, __le32 *origptr, __le32 *updptr, int nr_inst)
+{
+ u32 insn, oinsn, rd;
+
+ BUG_ON(nr_inst != 4);
+
+ /* Compute target register */
+ oinsn = le32_to_cpu(*origptr);
+ rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, oinsn);
+
+ /* movz rd, #(val & 0xffff) */
+ insn = aarch64_insn_gen_movewide(rd,
+ (u16)val,
+ 0,
+ AARCH64_INSN_VARIANT_64BIT,
+ AARCH64_INSN_MOVEWIDE_ZERO);
+ *updptr++ = cpu_to_le32(insn);
+
+ /* movk rd, #((val >> 16) & 0xffff), lsl #16 */
+ insn = aarch64_insn_gen_movewide(rd,
+ (u16)(val >> 16),
+ 16,
+ AARCH64_INSN_VARIANT_64BIT,
+ AARCH64_INSN_MOVEWIDE_KEEP);
+ *updptr++ = cpu_to_le32(insn);
+
+ /* movk rd, #((val >> 32) & 0xffff), lsl #32 */
+ insn = aarch64_insn_gen_movewide(rd,
+ (u16)(val >> 32),
+ 32,
+ AARCH64_INSN_VARIANT_64BIT,
+ AARCH64_INSN_MOVEWIDE_KEEP);
+ *updptr++ = cpu_to_le32(insn);
+
+ /* movk rd, #((val >> 48) & 0xffff), lsl #48 */
+ insn = aarch64_insn_gen_movewide(rd,
+ (u16)(val >> 48),
+ 48,
+ AARCH64_INSN_VARIANT_64BIT,
+ AARCH64_INSN_MOVEWIDE_KEEP);
+ *updptr++ = cpu_to_le32(insn);
+}
+
+void kvm_update_kimg_phys_offset(struct alt_instr *alt,
+ __le32 *origptr, __le32 *updptr, int nr_inst)
+{
+ generate_mov_q(kimage_voffset + PHYS_OFFSET, origptr, updptr, nr_inst);
+}
+
+void kvm_get_kimage_voffset(struct alt_instr *alt,
+ __le32 *origptr, __le32 *updptr, int nr_inst)
+{
+ generate_mov_q(kimage_voffset, origptr, updptr, nr_inst);
+}
diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c
index 2f92bdcb1188..07d5271e9f05 100644
--- a/arch/arm64/kvm/vgic-sys-reg-v3.c
+++ b/arch/arm64/kvm/vgic-sys-reg-v3.c
@@ -268,8 +268,6 @@ int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id,
params.regval = *reg;
params.is_write = is_write;
- params.is_aarch32 = false;
- params.is_32bit = false;
if (find_reg_by_id(sysreg, &params, gic_v3_icc_reg_descs,
ARRAY_SIZE(gic_v3_icc_reg_descs)))
@@ -288,8 +286,6 @@ int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write, u64 id,
if (is_write)
params.regval = *reg;
params.is_write = is_write;
- params.is_aarch32 = false;
- params.is_32bit = false;
r = find_reg_by_id(sysreg, &params, gic_v3_icc_reg_descs,
ARRAY_SIZE(gic_v3_icc_reg_descs));
diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index b5fa73c9fd35..66508b03094f 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -353,6 +353,18 @@ int vgic_v4_load(struct kvm_vcpu *vcpu)
return err;
}
+void vgic_v4_commit(struct kvm_vcpu *vcpu)
+{
+ struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+
+ /*
+ * No need to wait for the vPE to be ready across a shallow guest
+ * exit, as only a vcpu_put will invalidate it.
+ */
+ if (!vpe->ready)
+ its_commit_vpe(vpe);
+}
+
static struct vgic_its *vgic_get_its(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *irq_entry)
{
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index c3643b7f101b..1c597c9885fa 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -915,6 +915,9 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
if (can_access_vgic_from_kernel())
vgic_restore_state(vcpu);
+
+ if (vgic_supports_direct_msis(vcpu->kvm))
+ vgic_v4_commit(vcpu);
}
void kvm_vgic_load(struct kvm_vcpu *vcpu)
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 463c24e26000..74f9a036bab2 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -459,6 +459,7 @@ struct kvm_vcpu_stat {
u64 diagnose_308;
u64 diagnose_500;
u64 diagnose_other;
+ u64 pfault_sync;
};
#define PGM_OPERATION 0x01
diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c
index 394a5f53805b..3765c4223bf9 100644
--- a/arch/s390/kvm/guestdbg.c
+++ b/arch/s390/kvm/guestdbg.c
@@ -184,7 +184,7 @@ static int __import_wp_info(struct kvm_vcpu *vcpu,
if (wp_info->len < 0 || wp_info->len > MAX_WP_SIZE)
return -EINVAL;
- wp_info->old_data = kmalloc(bp_data->len, GFP_KERNEL);
+ wp_info->old_data = kmalloc(bp_data->len, GFP_KERNEL_ACCOUNT);
if (!wp_info->old_data)
return -ENOMEM;
/* try to backup the original value */
@@ -234,7 +234,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
if (nr_wp > 0) {
wp_info = kmalloc_array(nr_wp,
sizeof(*wp_info),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!wp_info) {
ret = -ENOMEM;
goto error;
@@ -243,7 +243,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
if (nr_bp > 0) {
bp_info = kmalloc_array(nr_bp,
sizeof(*bp_info),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!bp_info) {
ret = -ENOMEM;
goto error;
@@ -349,7 +349,7 @@ static struct kvm_hw_wp_info_arch *any_wp_changed(struct kvm_vcpu *vcpu)
if (!wp_info || !wp_info->old_data || wp_info->len <= 0)
continue;
- temp = kmalloc(wp_info->len, GFP_KERNEL);
+ temp = kmalloc(wp_info->len, GFP_KERNEL_ACCOUNT);
if (!temp)
continue;
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index e7a7c499a73f..72b25b7cc6ae 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -398,7 +398,7 @@ int handle_sthyi(struct kvm_vcpu *vcpu)
if (!kvm_s390_pv_cpu_is_protected(vcpu) && (addr & ~PAGE_MASK))
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
- sctns = (void *)get_zeroed_page(GFP_KERNEL);
+ sctns = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!sctns)
return -ENOMEM;
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 2f177298c663..e3183bd05910 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -1792,7 +1792,7 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
goto out;
}
gisa_out:
- tmp_inti = kzalloc(sizeof(*inti), GFP_KERNEL);
+ tmp_inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT);
if (tmp_inti) {
tmp_inti->type = KVM_S390_INT_IO(1, 0, 0, 0);
tmp_inti->io.io_int_word = isc_to_int_word(isc);
@@ -2015,7 +2015,7 @@ int kvm_s390_inject_vm(struct kvm *kvm,
struct kvm_s390_interrupt_info *inti;
int rc;
- inti = kzalloc(sizeof(*inti), GFP_KERNEL);
+ inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT);
if (!inti)
return -ENOMEM;
@@ -2414,7 +2414,7 @@ static int enqueue_floating_irq(struct kvm_device *dev,
return -EINVAL;
while (len >= sizeof(struct kvm_s390_irq)) {
- inti = kzalloc(sizeof(*inti), GFP_KERNEL);
+ inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT);
if (!inti)
return -ENOMEM;
@@ -2462,7 +2462,7 @@ static int register_io_adapter(struct kvm_device *dev,
if (dev->kvm->arch.adapters[adapter_info.id] != NULL)
return -EINVAL;
- adapter = kzalloc(sizeof(*adapter), GFP_KERNEL);
+ adapter = kzalloc(sizeof(*adapter), GFP_KERNEL_ACCOUNT);
if (!adapter)
return -ENOMEM;
@@ -3290,7 +3290,7 @@ int kvm_s390_gib_init(u8 nisc)
goto out;
}
- gib = (struct kvm_s390_gib *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
+ gib = (struct kvm_s390_gib *)get_zeroed_page(GFP_KERNEL_ACCOUNT | GFP_DMA);
if (!gib) {
rc = -ENOMEM;
goto out;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 425d3d75320b..dbafd057ca6a 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -60,6 +60,7 @@
struct kvm_stats_debugfs_item debugfs_entries[] = {
VCPU_STAT("userspace_handled", exit_userspace),
VCPU_STAT("exit_null", exit_null),
+ VCPU_STAT("pfault_sync", pfault_sync),
VCPU_STAT("exit_validity", exit_validity),
VCPU_STAT("exit_stop_request", exit_stop_request),
VCPU_STAT("exit_external_request", exit_external_request),
@@ -1254,7 +1255,7 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr)
ret = -EBUSY;
goto out;
}
- proc = kzalloc(sizeof(*proc), GFP_KERNEL);
+ proc = kzalloc(sizeof(*proc), GFP_KERNEL_ACCOUNT);
if (!proc) {
ret = -ENOMEM;
goto out;
@@ -1416,7 +1417,7 @@ static int kvm_s390_get_processor(struct kvm *kvm, struct kvm_device_attr *attr)
struct kvm_s390_vm_cpu_processor *proc;
int ret = 0;
- proc = kzalloc(sizeof(*proc), GFP_KERNEL);
+ proc = kzalloc(sizeof(*proc), GFP_KERNEL_ACCOUNT);
if (!proc) {
ret = -ENOMEM;
goto out;
@@ -1444,7 +1445,7 @@ static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr)
struct kvm_s390_vm_cpu_machine *mach;
int ret = 0;
- mach = kzalloc(sizeof(*mach), GFP_KERNEL);
+ mach = kzalloc(sizeof(*mach), GFP_KERNEL_ACCOUNT);
if (!mach) {
ret = -ENOMEM;
goto out;
@@ -1812,7 +1813,7 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX)
return -EINVAL;
- keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL);
+ keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT);
if (!keys)
return -ENOMEM;
@@ -1857,7 +1858,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX)
return -EINVAL;
- keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL);
+ keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT);
if (!keys)
return -ENOMEM;
@@ -2625,7 +2626,7 @@ static void sca_dispose(struct kvm *kvm)
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
- gfp_t alloc_flags = GFP_KERNEL;
+ gfp_t alloc_flags = GFP_KERNEL_ACCOUNT;
int i, rc;
char debug_name[16];
static unsigned long sca_offset;
@@ -2670,7 +2671,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
BUILD_BUG_ON(sizeof(struct sie_page2) != 4096);
kvm->arch.sie_page2 =
- (struct sie_page2 *) get_zeroed_page(GFP_KERNEL | GFP_DMA);
+ (struct sie_page2 *) get_zeroed_page(GFP_KERNEL_ACCOUNT | GFP_DMA);
if (!kvm->arch.sie_page2)
goto out_err;
@@ -2900,7 +2901,7 @@ static int sca_switch_to_extended(struct kvm *kvm)
if (kvm->arch.use_esca)
return 0;
- new_sca = alloc_pages_exact(sizeof(*new_sca), GFP_KERNEL|__GFP_ZERO);
+ new_sca = alloc_pages_exact(sizeof(*new_sca), GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!new_sca)
return -ENOMEM;
@@ -3133,7 +3134,7 @@ void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu)
int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu)
{
- vcpu->arch.sie_block->cbrlo = get_zeroed_page(GFP_KERNEL);
+ vcpu->arch.sie_block->cbrlo = get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!vcpu->arch.sie_block->cbrlo)
return -ENOMEM;
return 0;
@@ -3243,7 +3244,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
int rc;
BUILD_BUG_ON(sizeof(struct sie_page) != 4096);
- sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL);
+ sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!sie_page)
return -ENOMEM;
@@ -4109,6 +4110,7 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
current->thread.gmap_pfault = 0;
if (kvm_arch_setup_async_pf(vcpu))
return 0;
+ vcpu->stat.pfault_sync++;
return kvm_arch_fault_in_page(vcpu, current->thread.gmap_addr, 1);
}
return vcpu_post_run_fault_in_sie(vcpu);
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index cd74989ce0b0..9928f785c677 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -879,7 +879,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
switch (fc) {
case 1: /* same handling for 1 and 2 */
case 2:
- mem = get_zeroed_page(GFP_KERNEL);
+ mem = get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!mem)
goto out_no_data;
if (stsi((void *) mem, fc, sel1, sel2))
@@ -888,7 +888,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
case 3:
if (sel1 != 2 || sel2 != 2)
goto out_no_data;
- mem = get_zeroed_page(GFP_KERNEL);
+ mem = get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!mem)
goto out_no_data;
handle_stsi_3_2_2(vcpu, (void *) mem);
diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
index f5847f9dec7c..813b6e93dc83 100644
--- a/arch/s390/kvm/pv.c
+++ b/arch/s390/kvm/pv.c
@@ -60,7 +60,7 @@ int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
if (kvm_s390_pv_cpu_get_handle(vcpu))
return -EINVAL;
- vcpu->arch.pv.stor_base = __get_free_pages(GFP_KERNEL,
+ vcpu->arch.pv.stor_base = __get_free_pages(GFP_KERNEL_ACCOUNT,
get_order(uv_info.guest_cpu_stor_len));
if (!vcpu->arch.pv.stor_base)
return -ENOMEM;
@@ -72,7 +72,7 @@ int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
uvcb.stor_origin = (u64)vcpu->arch.pv.stor_base;
/* Alloc Secure Instruction Data Area Designation */
- vcpu->arch.sie_block->sidad = __get_free_page(GFP_KERNEL | __GFP_ZERO);
+ vcpu->arch.sie_block->sidad = __get_free_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!vcpu->arch.sie_block->sidad) {
free_pages(vcpu->arch.pv.stor_base,
get_order(uv_info.guest_cpu_stor_len));
@@ -120,7 +120,7 @@ static int kvm_s390_pv_alloc_vm(struct kvm *kvm)
struct kvm_memory_slot *memslot;
kvm->arch.pv.stor_var = NULL;
- kvm->arch.pv.stor_base = __get_free_pages(GFP_KERNEL, get_order(base));
+ kvm->arch.pv.stor_base = __get_free_pages(GFP_KERNEL_ACCOUNT, get_order(base));
if (!kvm->arch.pv.stor_base)
return -ENOMEM;
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 4f3cbf6003a9..c5d0a58b2c29 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -1234,7 +1234,7 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
mutex_lock(&kvm->arch.vsie.mutex);
if (kvm->arch.vsie.page_count < nr_vcpus) {
- page = alloc_page(GFP_KERNEL | __GFP_ZERO | GFP_DMA);
+ page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO | GFP_DMA);
if (!page) {
mutex_unlock(&kvm->arch.vsie.mutex);
return ERR_PTR(-ENOMEM);
@@ -1336,7 +1336,7 @@ out_put:
void kvm_s390_vsie_init(struct kvm *kvm)
{
mutex_init(&kvm->arch.vsie.mutex);
- INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL);
+ INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL_ACCOUNT);
}
/* Destroy the vsie data structures. To be called when a vm is destroyed. */
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 64795d034926..9bb2c7512cd5 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -2,7 +2,7 @@
/*
* KVM guest address space mapping code
*
- * Copyright IBM Corp. 2007, 2016, 2018
+ * Copyright IBM Corp. 2007, 2020
* Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
* David Hildenbrand <david@redhat.com>
* Janosch Frank <frankja@linux.vnet.ibm.com>
@@ -56,19 +56,19 @@ static struct gmap *gmap_alloc(unsigned long limit)
atype = _ASCE_TYPE_REGION1;
etype = _REGION1_ENTRY_EMPTY;
}
- gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
+ gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT);
if (!gmap)
goto out;
INIT_LIST_HEAD(&gmap->crst_list);
INIT_LIST_HEAD(&gmap->children);
INIT_LIST_HEAD(&gmap->pt_list);
- INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
- INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
- INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC);
+ INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT);
+ INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT);
+ INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT);
spin_lock_init(&gmap->guest_table_lock);
spin_lock_init(&gmap->shadow_lock);
refcount_set(&gmap->ref_count, 1);
- page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
if (!page)
goto out_free;
page->index = 0;
@@ -309,7 +309,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
unsigned long *new;
/* since we dont free the gmap table until gmap_free we can unlock */
- page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
if (!page)
return -ENOMEM;
new = (unsigned long *) page_to_phys(page);
@@ -594,7 +594,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
if (pmd_large(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m)
return -EFAULT;
/* Link gmap segment table entry location to page table. */
- rc = radix_tree_preload(GFP_KERNEL);
+ rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
if (rc)
return rc;
ptl = pmd_lock(mm, pmd);
@@ -1218,11 +1218,11 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
vmaddr = __gmap_translate(parent, paddr);
if (IS_ERR_VALUE(vmaddr))
return vmaddr;
- rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+ rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
if (!rmap)
return -ENOMEM;
rmap->raddr = raddr;
- rc = radix_tree_preload(GFP_KERNEL);
+ rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
if (rc) {
kfree(rmap);
return rc;
@@ -1741,7 +1741,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
BUG_ON(!gmap_is_shadow(sg));
/* Allocate a shadow region second table */
- page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
if (!page)
return -ENOMEM;
page->index = r2t & _REGION_ENTRY_ORIGIN;
@@ -1825,7 +1825,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
BUG_ON(!gmap_is_shadow(sg));
/* Allocate a shadow region second table */
- page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
if (!page)
return -ENOMEM;
page->index = r3t & _REGION_ENTRY_ORIGIN;
@@ -1909,7 +1909,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
/* Allocate a shadow segment table */
- page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
if (!page)
return -ENOMEM;
page->index = sgt & _REGION_ENTRY_ORIGIN;
@@ -2116,7 +2116,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
parent = sg->parent;
prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE;
- rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+ rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
if (!rmap)
return -ENOMEM;
rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
@@ -2128,7 +2128,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
rc = vmaddr;
break;
}
- rc = radix_tree_preload(GFP_KERNEL);
+ rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
if (rc)
break;
rc = -EAGAIN;
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index f5ef2d5b9231..84b887825f12 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -237,6 +237,7 @@
#define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */
#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */
#define X86_FEATURE_SEV_ES ( 8*32+20) /* AMD Secure Encrypted Virtualization - Encrypted State */
+#define X86_FEATURE_VM_PAGE_FLUSH ( 8*32+21) /* "" VM Page Flush MSR is supported */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
@@ -376,6 +377,7 @@
#define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */
#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */
#define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */
+#define X86_FEATURE_AVX512_FP16 (18*32+23) /* AVX512 FP16 */
#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */
#define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7e5f33a0d0e2..3ab7b46087b7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -614,6 +614,7 @@ struct kvm_vcpu_arch {
struct kvm_pio_request pio;
void *pio_data;
+ void *guest_ins_data;
u8 event_exit_inst_len;
@@ -805,6 +806,9 @@ struct kvm_vcpu_arch {
*/
bool enforce;
} pv_cpuid;
+
+ /* Protected Guests */
+ bool guest_state_protected;
};
struct kvm_lpage_info {
@@ -1088,7 +1092,7 @@ struct kvm_x86_ops {
void (*hardware_disable)(void);
void (*hardware_unsetup)(void);
bool (*cpu_has_accelerated_tpr)(void);
- bool (*has_emulated_msr)(u32 index);
+ bool (*has_emulated_msr)(struct kvm *kvm, u32 index);
void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu);
unsigned int vm_size;
@@ -1115,7 +1119,8 @@ struct kvm_x86_ops {
struct kvm_segment *var, int seg);
void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
- int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
+ bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr0);
+ void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
int (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
@@ -1231,6 +1236,7 @@ struct kvm_x86_ops {
void (*enable_log_dirty_pt_masked)(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t offset, unsigned long mask);
+ int (*cpu_dirty_log_size)(void);
/* pmu operations of sub-arch */
const struct kvm_pmu_ops *pmu_ops;
@@ -1280,6 +1286,7 @@ struct kvm_x86_ops {
void (*migrate_timers)(struct kvm_vcpu *vcpu);
void (*msr_filter_changed)(struct kvm_vcpu *vcpu);
+ int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err);
};
struct kvm_x86_nested_ops {
@@ -1470,6 +1477,10 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
int reason, bool has_error_code, u32 error_code);
+void kvm_free_guest_fpu(struct kvm_vcpu *vcpu);
+
+void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0);
+void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4);
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
@@ -1696,7 +1707,8 @@ void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu);
int kvm_is_in_guest(void);
-int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size);
+void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
+ u32 size);
bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu);
bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
@@ -1743,4 +1755,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
#define GET_SMSTATE(type, buf, offset) \
(*(type *)((buf) + (offset) - 0x7e00))
+int kvm_cpu_dirty_log_size(void);
+
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 2b5fc9accec4..546d6ecf0a35 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -472,6 +472,7 @@
#define MSR_AMD64_ICIBSEXTDCTL 0xc001103c
#define MSR_AMD64_IBSOPDATA4 0xc001103d
#define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */
+#define MSR_AMD64_VM_PAGE_FLUSH 0xc001011e
#define MSR_AMD64_SEV_ES_GHCB 0xc0010130
#define MSR_AMD64_SEV 0xc0010131
#define MSR_AMD64_SEV_ENABLED_BIT 0
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 71d630bb5e08..1c561945b426 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -98,6 +98,16 @@ enum {
INTERCEPT_MWAIT_COND,
INTERCEPT_XSETBV,
INTERCEPT_RDPRU,
+ TRAP_EFER_WRITE,
+ TRAP_CR0_WRITE,
+ TRAP_CR1_WRITE,
+ TRAP_CR2_WRITE,
+ TRAP_CR3_WRITE,
+ TRAP_CR4_WRITE,
+ TRAP_CR5_WRITE,
+ TRAP_CR6_WRITE,
+ TRAP_CR7_WRITE,
+ TRAP_CR8_WRITE,
/* Byte offset 014h (word 5) */
INTERCEPT_INVLPGB = 160,
INTERCEPT_INVLPGB_ILLEGAL,
@@ -130,7 +140,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u32 exit_int_info_err;
u64 nested_ctl;
u64 avic_vapic_bar;
- u8 reserved_4[8];
+ u64 ghcb_gpa;
u32 event_inj;
u32 event_inj_err;
u64 nested_cr3;
@@ -144,6 +154,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u8 reserved_6[8]; /* Offset 0xe8 */
u64 avic_logical_id; /* Offset 0xf0 */
u64 avic_physical_id; /* Offset 0xf8 */
+ u8 reserved_7[8];
+ u64 vmsa_pa; /* Used for an SEV-ES guest */
};
@@ -178,7 +190,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define LBR_CTL_ENABLE_MASK BIT_ULL(0)
#define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1)
-#define SVM_INTERRUPT_SHADOW_MASK 1
+#define SVM_INTERRUPT_SHADOW_MASK BIT_ULL(0)
+#define SVM_GUEST_INTERRUPT_MASK BIT_ULL(1)
#define SVM_IOIO_STR_SHIFT 2
#define SVM_IOIO_REP_SHIFT 3
@@ -197,6 +210,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define SVM_NESTED_CTL_NP_ENABLE BIT(0)
#define SVM_NESTED_CTL_SEV_ENABLE BIT(1)
+#define SVM_NESTED_CTL_SEV_ES_ENABLE BIT(2)
struct vmcb_seg {
u16 selector;
@@ -220,7 +234,8 @@ struct vmcb_save_area {
u8 cpl;
u8 reserved_2[4];
u64 efer;
- u8 reserved_3[112];
+ u8 reserved_3[104];
+ u64 xss; /* Valid for SEV-ES only */
u64 cr4;
u64 cr3;
u64 cr0;
@@ -251,9 +266,12 @@ struct vmcb_save_area {
/*
* The following part of the save area is valid only for
- * SEV-ES guests when referenced through the GHCB.
+ * SEV-ES guests when referenced through the GHCB or for
+ * saving to the host save area.
*/
- u8 reserved_7[104];
+ u8 reserved_7[80];
+ u32 pkru;
+ u8 reserved_7a[20];
u64 reserved_8; /* rax already available at 0x01f8 */
u64 rcx;
u64 rdx;
@@ -294,7 +312,7 @@ struct ghcb {
#define EXPECTED_VMCB_SAVE_AREA_SIZE 1032
-#define EXPECTED_VMCB_CONTROL_AREA_SIZE 256
+#define EXPECTED_VMCB_CONTROL_AREA_SIZE 272
#define EXPECTED_GHCB_SIZE PAGE_SIZE
static inline void __unused_size_checks(void)
@@ -379,6 +397,16 @@ struct vmcb {
(unsigned long *)&ghcb->save.valid_bitmap); \
} \
\
+ static inline u64 ghcb_get_##field(struct ghcb *ghcb) \
+ { \
+ return ghcb->save.field; \
+ } \
+ \
+ static inline u64 ghcb_get_##field##_if_valid(struct ghcb *ghcb) \
+ { \
+ return ghcb_##field##_is_valid(ghcb) ? ghcb->save.field : 0; \
+ } \
+ \
static inline void ghcb_set_##field(struct ghcb *ghcb, u64 value) \
{ \
__set_bit(GHCB_BITMAP_IDX(field), \
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index f8ba5289ecb0..38ca445a8429 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -113,6 +113,7 @@
#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f
#define VMX_MISC_SAVE_EFER_LMA 0x00000020
#define VMX_MISC_ACTIVITY_HLT 0x00000040
+#define VMX_MISC_ACTIVITY_WAIT_SIPI 0x00000100
#define VMX_MISC_ZERO_LEN_INS 0x40000000
#define VMX_MISC_MSR_LIST_MULTIPLIER 512
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 89e5f3d1bba8..8e76d3701db3 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -12,6 +12,7 @@
#define KVM_PIO_PAGE_OFFSET 1
#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
+#define KVM_DIRTY_LOG_PAGE_OFFSET 64
#define DE_VECTOR 0
#define DB_VECTOR 1
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index f1d8307454e0..554f75fe013c 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -77,10 +77,28 @@
#define SVM_EXIT_MWAIT_COND 0x08c
#define SVM_EXIT_XSETBV 0x08d
#define SVM_EXIT_RDPRU 0x08e
+#define SVM_EXIT_EFER_WRITE_TRAP 0x08f
+#define SVM_EXIT_CR0_WRITE_TRAP 0x090
+#define SVM_EXIT_CR1_WRITE_TRAP 0x091
+#define SVM_EXIT_CR2_WRITE_TRAP 0x092
+#define SVM_EXIT_CR3_WRITE_TRAP 0x093
+#define SVM_EXIT_CR4_WRITE_TRAP 0x094
+#define SVM_EXIT_CR5_WRITE_TRAP 0x095
+#define SVM_EXIT_CR6_WRITE_TRAP 0x096
+#define SVM_EXIT_CR7_WRITE_TRAP 0x097
+#define SVM_EXIT_CR8_WRITE_TRAP 0x098
+#define SVM_EXIT_CR9_WRITE_TRAP 0x099
+#define SVM_EXIT_CR10_WRITE_TRAP 0x09a
+#define SVM_EXIT_CR11_WRITE_TRAP 0x09b
+#define SVM_EXIT_CR12_WRITE_TRAP 0x09c
+#define SVM_EXIT_CR13_WRITE_TRAP 0x09d
+#define SVM_EXIT_CR14_WRITE_TRAP 0x09e
+#define SVM_EXIT_CR15_WRITE_TRAP 0x09f
#define SVM_EXIT_INVPCID 0x0a2
#define SVM_EXIT_NPF 0x400
#define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401
#define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402
+#define SVM_EXIT_VMGEXIT 0x403
/* SEV-ES software-defined VMGEXIT events */
#define SVM_VMGEXIT_MMIO_READ 0x80000001
@@ -183,10 +201,20 @@
{ SVM_EXIT_MONITOR, "monitor" }, \
{ SVM_EXIT_MWAIT, "mwait" }, \
{ SVM_EXIT_XSETBV, "xsetbv" }, \
+ { SVM_EXIT_EFER_WRITE_TRAP, "write_efer_trap" }, \
+ { SVM_EXIT_CR0_WRITE_TRAP, "write_cr0_trap" }, \
+ { SVM_EXIT_CR4_WRITE_TRAP, "write_cr4_trap" }, \
+ { SVM_EXIT_CR8_WRITE_TRAP, "write_cr8_trap" }, \
{ SVM_EXIT_INVPCID, "invpcid" }, \
{ SVM_EXIT_NPF, "npf" }, \
{ SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \
{ SVM_EXIT_AVIC_UNACCELERATED_ACCESS, "avic_unaccelerated_access" }, \
+ { SVM_EXIT_VMGEXIT, "vmgexit" }, \
+ { SVM_VMGEXIT_MMIO_READ, "vmgexit_mmio_read" }, \
+ { SVM_VMGEXIT_MMIO_WRITE, "vmgexit_mmio_write" }, \
+ { SVM_VMGEXIT_NMI_COMPLETE, "vmgexit_nmi_complete" }, \
+ { SVM_VMGEXIT_AP_HLT_LOOP, "vmgexit_ap_hlt_loop" }, \
+ { SVM_VMGEXIT_AP_JUMP_TABLE, "vmgexit_ap_jump_table" }, \
{ SVM_EXIT_ERR, "invalid_guest_state" }
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index b8ff9e8ac0d5..ada955c5ebb6 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -32,6 +32,7 @@
#define EXIT_REASON_EXTERNAL_INTERRUPT 1
#define EXIT_REASON_TRIPLE_FAULT 2
#define EXIT_REASON_INIT_SIGNAL 3
+#define EXIT_REASON_SIPI_SIGNAL 4
#define EXIT_REASON_INTERRUPT_WINDOW 7
#define EXIT_REASON_NMI_WINDOW 8
@@ -94,6 +95,7 @@
{ EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \
{ EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \
{ EXIT_REASON_INIT_SIGNAL, "INIT_SIGNAL" }, \
+ { EXIT_REASON_SIPI_SIGNAL, "SIPI_SIGNAL" }, \
{ EXIT_REASON_INTERRUPT_WINDOW, "INTERRUPT_WINDOW" }, \
{ EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \
{ EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index d502241995a3..42af31b64c2c 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -69,6 +69,7 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC },
{ X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC },
{ X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW },
{ X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES },
{ X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA },
{}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 866c9a9bcdee..236924930bf0 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -44,6 +44,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 },
{ X86_FEATURE_SEV_ES, CPUID_EAX, 3, 0x8000001f, 0 },
{ X86_FEATURE_SME_COHERENT, CPUID_EAX, 10, 0x8000001f, 0 },
+ { X86_FEATURE_VM_PAGE_FLUSH, CPUID_EAX, 2, 0x8000001f, 0 },
{ 0, 0, 0, 0, 0 }
};
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 924571fe5864..c6ede3b3d302 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -501,12 +501,12 @@ static bool vmware_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
ghcb_rbp_is_valid(ghcb)))
return false;
- regs->bx = ghcb->save.rbx;
- regs->cx = ghcb->save.rcx;
- regs->dx = ghcb->save.rdx;
- regs->si = ghcb->save.rsi;
- regs->di = ghcb->save.rdi;
- regs->bp = ghcb->save.rbp;
+ regs->bx = ghcb_get_rbx(ghcb);
+ regs->cx = ghcb_get_rcx(ghcb);
+ regs->dx = ghcb_get_rdx(ghcb);
+ regs->si = ghcb_get_rsi(ghcb);
+ regs->di = ghcb_get_rdi(ghcb);
+ regs->bp = ghcb_get_rbp(ghcb);
return true;
}
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 34b18f6eeb2c..aa593743acf6 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -44,7 +44,6 @@ static int __init parse_no_kvmclock_vsyscall(char *arg)
early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
/* Aligned to page sizes to match whats mapped via vsyscalls to userspace */
-#define HV_CLOCK_SIZE (sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS)
#define HVC_BOOT_ARRAY_SIZE \
(PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index f92dfd8ef10d..7ac592664c52 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -100,7 +100,8 @@ config KVM_AMD_SEV
depends on KVM_AMD && X86_64
depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
help
- Provides support for launching Encrypted VMs on AMD processors.
+ Provides support for launching Encrypted VMs (SEV) and Encrypted VMs
+ with Encrypted State (SEV-ES) on AMD processors.
config KVM_MMU_AUDIT
bool "Audit KVM MMU"
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index b804444e16d4..4bd14ab01323 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -10,7 +10,8 @@ endif
KVM := ../../../virt/kvm
kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
- $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
+ $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o \
+ $(KVM)/dirty_ring.o
kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 83637a2ff605..13036cf0b912 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -146,6 +146,7 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
MSR_IA32_MISC_ENABLE_MWAIT);
}
}
+EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime);
static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
@@ -418,7 +419,7 @@ void kvm_set_cpu_caps(void)
F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) |
- F(SERIALIZE) | F(TSXLDTRK)
+ F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16)
);
/* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index f7a6e8f83783..dc921d76e42e 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -264,6 +264,20 @@ static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu)
return x86_stepping(best->eax);
}
+static inline bool guest_has_spec_ctrl_msr(struct kvm_vcpu *vcpu)
+{
+ return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD));
+}
+
+static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu)
+{
+ return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB));
+}
+
static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu)
{
return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT;
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 5c7c4060b45c..922c69dcca4d 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1951,8 +1951,8 @@ int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args)
return kvm_hv_eventfd_assign(kvm, args->conn_id, args->fd);
}
-int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
- struct kvm_cpuid_entry2 __user *entries)
+int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
{
uint16_t evmcs_ver = 0;
struct kvm_cpuid_entry2 cpuid_entries[] = {
@@ -2037,7 +2037,7 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
* Direct Synthetic timers only make sense with in-kernel
* LAPIC
*/
- if (lapic_in_kernel(vcpu))
+ if (!vcpu || lapic_in_kernel(vcpu))
ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE;
break;
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index e68c6c2e9649..6d7def2b0aad 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -126,7 +126,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
void kvm_hv_init_vm(struct kvm *kvm);
void kvm_hv_destroy_vm(struct kvm *kvm);
int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args);
-int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
- struct kvm_cpuid_entry2 __user *entries);
+int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries);
#endif
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index a889563ad02d..f15bc16de07c 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -9,6 +9,31 @@
(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
| X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE)
+static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
+}
+
+static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
+}
+
#define BUILD_KVM_GPR_ACCESSORS(lname, uname) \
static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\
{ \
@@ -18,6 +43,7 @@ static __always_inline void kvm_##lname##_write(struct kvm_vcpu *vcpu, \
unsigned long val) \
{ \
vcpu->arch.regs[VCPU_REGS_##uname] = val; \
+ kvm_register_mark_dirty(vcpu, VCPU_REGS_##uname); \
}
BUILD_KVM_GPR_ACCESSORS(rax, RAX)
BUILD_KVM_GPR_ACCESSORS(rbx, RBX)
@@ -37,31 +63,6 @@ BUILD_KVM_GPR_ACCESSORS(r14, R14)
BUILD_KVM_GPR_ACCESSORS(r15, R15)
#endif
-static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
- enum kvm_reg reg)
-{
- return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
-}
-
-static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu,
- enum kvm_reg reg)
-{
- return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
-}
-
-static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu,
- enum kvm_reg reg)
-{
- __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
-}
-
-static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
- enum kvm_reg reg)
-{
- __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
- __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
-}
-
static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg)
{
if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS))
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 86c33d53c90a..3136e05831cf 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2843,14 +2843,35 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
u8 sipi_vector;
+ int r;
unsigned long pe;
- if (!lapic_in_kernel(vcpu) || !apic->pending_events)
+ if (!lapic_in_kernel(vcpu))
return;
/*
+ * Read pending events before calling the check_events
+ * callback.
+ */
+ pe = smp_load_acquire(&apic->pending_events);
+ if (!pe)
+ return;
+
+ if (is_guest_mode(vcpu)) {
+ r = kvm_x86_ops.nested_ops->check_events(vcpu);
+ if (r < 0)
+ return;
+ /*
+ * If an event has happened and caused a vmexit,
+ * we know INITs are latched and therefore
+ * we will not incorrectly deliver an APIC
+ * event instead of a vmexit.
+ */
+ }
+
+ /*
* INITs are latched while CPU is in specific states
- * (SMM, VMX non-root mode, SVM with GIF=0).
+ * (SMM, VMX root mode, SVM with GIF=0).
* Because a CPU cannot be in these states immediately
* after it has processed an INIT signal (and thus in
* KVM_MP_STATE_INIT_RECEIVED state), just eat SIPIs
@@ -2858,26 +2879,28 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
*/
if (kvm_vcpu_latch_init(vcpu)) {
WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED);
- if (test_bit(KVM_APIC_SIPI, &apic->pending_events))
+ if (test_bit(KVM_APIC_SIPI, &pe))
clear_bit(KVM_APIC_SIPI, &apic->pending_events);
return;
}
- pe = xchg(&apic->pending_events, 0);
if (test_bit(KVM_APIC_INIT, &pe)) {
+ clear_bit(KVM_APIC_INIT, &apic->pending_events);
kvm_vcpu_reset(vcpu, true);
if (kvm_vcpu_is_bsp(apic->vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
}
- if (test_bit(KVM_APIC_SIPI, &pe) &&
- vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
- /* evaluate pending_events before reading the vector */
- smp_rmb();
- sipi_vector = apic->sipi_vector;
- kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector);
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ if (test_bit(KVM_APIC_SIPI, &pe)) {
+ clear_bit(KVM_APIC_SIPI, &apic->pending_events);
+ if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+ /* evaluate pending_events before reading the vector */
+ smp_rmb();
+ sipi_vector = apic->sipi_vector;
+ kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector);
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ }
}
}
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 7a6ae9e90bd7..c478904af518 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -820,7 +820,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
return NULL;
- if (no_dirty_log && slot->dirty_bitmap)
+ if (no_dirty_log && kvm_slot_dirty_track_enabled(slot))
return NULL;
return slot;
@@ -1289,6 +1289,14 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}
+int kvm_cpu_dirty_log_size(void)
+{
+ if (kvm_x86_ops.cpu_dirty_log_size)
+ return kvm_x86_ops.cpu_dirty_log_size();
+
+ return 0;
+}
+
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
struct kvm_memory_slot *slot, u64 gfn)
{
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index 213699b27b44..e798489b56b5 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -381,6 +381,35 @@ TRACE_EVENT(
)
);
+TRACE_EVENT(
+ kvm_tdp_mmu_spte_changed,
+ TP_PROTO(int as_id, gfn_t gfn, int level, u64 old_spte, u64 new_spte),
+ TP_ARGS(as_id, gfn, level, old_spte, new_spte),
+
+ TP_STRUCT__entry(
+ __field(u64, gfn)
+ __field(u64, old_spte)
+ __field(u64, new_spte)
+ /* Level cannot be larger than 5 on x86, so it fits in a u8. */
+ __field(u8, level)
+ /* as_id can only be 0 or 1 x86, so it fits in a u8. */
+ __field(u8, as_id)
+ ),
+
+ TP_fast_assign(
+ __entry->gfn = gfn;
+ __entry->old_spte = old_spte;
+ __entry->new_spte = new_spte;
+ __entry->level = level;
+ __entry->as_id = as_id;
+ ),
+
+ TP_printk("as id %d gfn %llx level %d old_spte %llx new_spte %llx",
+ __entry->as_id, __entry->gfn, __entry->level,
+ __entry->old_spte, __entry->new_spte
+ )
+);
+
#endif /* _TRACE_KVMMMU_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 84c8f06bec26..4bd2f1dc0172 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -7,6 +7,8 @@
#include "tdp_mmu.h"
#include "spte.h"
+#include <trace/events/kvm.h>
+
#ifdef CONFIG_X86_64
static bool __read_mostly tdp_mmu_enabled = false;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
@@ -108,6 +110,8 @@ static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
sp->gfn = gfn;
sp->tdp_mmu_page = true;
+ trace_kvm_mmu_get_page(sp, true);
+
return sp;
}
@@ -185,7 +189,7 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
if ((!is_writable_pte(old_spte) || pfn_changed) &&
is_writable_pte(new_spte)) {
slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
- mark_page_dirty_in_slot(slot, gfn);
+ mark_page_dirty_in_slot(kvm, slot, gfn);
}
}
@@ -244,6 +248,8 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
if (old_spte == new_spte)
return;
+ trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
+
/*
* The only times a SPTE should be changed from a non-present to
* non-present state is when an MMIO entry is installed/modified/
@@ -278,6 +284,8 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
pt = spte_to_child_pt(old_spte, level);
sp = sptep_to_sp(pt);
+ trace_kvm_mmu_prepare_zap_page(sp);
+
list_del(&sp->link);
if (sp->lpage_disallowed)
@@ -480,11 +488,13 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
if (unlikely(is_noslot_pfn(pfn))) {
new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte);
- } else
+ } else {
make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
pfn, iter->old_spte, prefault, true,
map_writable, !shadow_accessed_mask,
&new_spte);
+ trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep);
+ }
if (new_spte == iter->old_spte)
ret = RET_PF_SPURIOUS;
@@ -698,6 +708,8 @@ static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
young = 1;
+
+ trace_kvm_age_page(iter.gfn, iter.level, slot, young);
}
return young;
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index 7f0059aa30e1..f472fdb6ae7e 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -84,12 +84,8 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
} else
/* MTRR mask */
mask |= 0x7ff;
- if (data & mask) {
- kvm_inject_gp(vcpu, 0);
- return false;
- }
- return true;
+ return (data & mask) == 0;
}
EXPORT_SYMBOL_GPL(kvm_mtrr_valid);
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 8c550999ace0..0ef84d57b72e 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -233,7 +233,8 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
*/
static int avic_update_access_page(struct kvm *kvm, bool activate)
{
- int ret = 0;
+ void __user *ret;
+ int r = 0;
mutex_lock(&kvm->slots_lock);
/*
@@ -249,13 +250,15 @@ static int avic_update_access_page(struct kvm *kvm, bool activate)
APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
APIC_DEFAULT_PHYS_BASE,
activate ? PAGE_SIZE : 0);
- if (ret)
+ if (IS_ERR(ret)) {
+ r = PTR_ERR(ret);
goto out;
+ }
kvm->arch.apic_access_page_done = activate;
out:
mutex_unlock(&kvm->slots_lock);
- return ret;
+ return r;
}
static int avic_init_backing_page(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 9e4c226dbf7d..b0b667456b2e 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -254,7 +254,7 @@ static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12)
(vmcb12->save.cr3 & MSR_CR3_LONG_MBZ_MASK))
return false;
}
- if (kvm_valid_cr4(&svm->vcpu, vmcb12->save.cr4))
+ if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4))
return false;
return nested_vmcb_check_controls(&vmcb12->control);
@@ -381,7 +381,7 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
svm->vmcb->save.ds = vmcb12->save.ds;
svm->vmcb->save.gdtr = vmcb12->save.gdtr;
svm->vmcb->save.idtr = vmcb12->save.idtr;
- kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags);
+ kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
svm_set_efer(&svm->vcpu, vmcb12->save.efer);
svm_set_cr0(&svm->vcpu, vmcb12->save.cr0);
svm_set_cr4(&svm->vcpu, vmcb12->save.cr4);
@@ -394,8 +394,8 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
svm->vmcb->save.rax = vmcb12->save.rax;
svm->vmcb->save.rsp = vmcb12->save.rsp;
svm->vmcb->save.rip = vmcb12->save.rip;
- svm->vmcb->save.dr7 = vmcb12->save.dr7;
- svm->vcpu.arch.dr6 = vmcb12->save.dr6;
+ svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
+ svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_FIXED_1 | DR6_RTM;
svm->vmcb->save.cpl = vmcb12->save.cpl;
}
@@ -660,13 +660,14 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
svm->vmcb->save.gdtr = hsave->save.gdtr;
svm->vmcb->save.idtr = hsave->save.idtr;
kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
+ kvm_set_rflags(&svm->vcpu, hsave->save.rflags | X86_EFLAGS_FIXED);
svm_set_efer(&svm->vcpu, hsave->save.efer);
svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
svm_set_cr4(&svm->vcpu, hsave->save.cr4);
kvm_rax_write(&svm->vcpu, hsave->save.rax);
kvm_rsp_write(&svm->vcpu, hsave->save.rsp);
kvm_rip_write(&svm->vcpu, hsave->save.rip);
- svm->vmcb->save.dr7 = 0;
+ svm->vmcb->save.dr7 = DR7_FIXED_1;
svm->vmcb->save.cpl = 0;
svm->vmcb->control.exit_int_info = 0;
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 566f4d18185b..9858d5ae9ddd 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -14,10 +14,20 @@
#include <linux/psp-sev.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
+#include <linux/processor.h>
+#include <linux/trace_events.h>
+#include <asm/fpu/internal.h>
+
+#include <asm/trapnr.h>
#include "x86.h"
#include "svm.h"
+#include "cpuid.h"
+#include "trace.h"
+
+#define __ex(x) __kvm_handle_fault_on_reboot(x)
+static u8 sev_enc_bit;
static int sev_flush_asids(void);
static DECLARE_RWSEM(sev_deactivate_lock);
static DEFINE_MUTEX(sev_bitmap_lock);
@@ -25,7 +35,6 @@ unsigned int max_sev_asid;
static unsigned int min_sev_asid;
static unsigned long *sev_asid_bitmap;
static unsigned long *sev_reclaim_asid_bitmap;
-#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
struct enc_region {
struct list_head list;
@@ -57,19 +66,19 @@ static int sev_flush_asids(void)
}
/* Must be called with the sev_bitmap_lock held */
-static bool __sev_recycle_asids(void)
+static bool __sev_recycle_asids(int min_asid, int max_asid)
{
int pos;
/* Check if there are any ASIDs to reclaim before performing a flush */
- pos = find_next_bit(sev_reclaim_asid_bitmap,
- max_sev_asid, min_sev_asid - 1);
- if (pos >= max_sev_asid)
+ pos = find_next_bit(sev_reclaim_asid_bitmap, max_sev_asid, min_asid);
+ if (pos >= max_asid)
return false;
if (sev_flush_asids())
return false;
+ /* The flush process will flush all reclaimable SEV and SEV-ES ASIDs */
bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap,
max_sev_asid);
bitmap_zero(sev_reclaim_asid_bitmap, max_sev_asid);
@@ -77,20 +86,23 @@ static bool __sev_recycle_asids(void)
return true;
}
-static int sev_asid_new(void)
+static int sev_asid_new(struct kvm_sev_info *sev)
{
+ int pos, min_asid, max_asid;
bool retry = true;
- int pos;
mutex_lock(&sev_bitmap_lock);
/*
- * SEV-enabled guest must use asid from min_sev_asid to max_sev_asid.
+ * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid.
+ * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1.
*/
+ min_asid = sev->es_active ? 0 : min_sev_asid - 1;
+ max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid;
again:
- pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_sev_asid - 1);
- if (pos >= max_sev_asid) {
- if (retry && __sev_recycle_asids()) {
+ pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_asid);
+ if (pos >= max_asid) {
+ if (retry && __sev_recycle_asids(min_asid, max_asid)) {
retry = false;
goto again;
}
@@ -172,7 +184,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (unlikely(sev->active))
return ret;
- asid = sev_asid_new();
+ asid = sev_asid_new(sev);
if (asid < 0)
return ret;
@@ -191,6 +203,16 @@ e_free:
return ret;
}
+static int sev_es_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ if (!sev_es)
+ return -ENOTTY;
+
+ to_kvm_svm(kvm)->sev_info.es_active = true;
+
+ return sev_guest_init(kvm, argp);
+}
+
static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
{
struct sev_data_activate *data;
@@ -490,6 +512,96 @@ e_free:
return ret;
}
+static int sev_es_sync_vmsa(struct vcpu_svm *svm)
+{
+ struct vmcb_save_area *save = &svm->vmcb->save;
+
+ /* Check some debug related fields before encrypting the VMSA */
+ if (svm->vcpu.guest_debug || (save->dr7 & ~DR7_FIXED_1))
+ return -EINVAL;
+
+ /* Sync registgers */
+ save->rax = svm->vcpu.arch.regs[VCPU_REGS_RAX];
+ save->rbx = svm->vcpu.arch.regs[VCPU_REGS_RBX];
+ save->rcx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+ save->rdx = svm->vcpu.arch.regs[VCPU_REGS_RDX];
+ save->rsp = svm->vcpu.arch.regs[VCPU_REGS_RSP];
+ save->rbp = svm->vcpu.arch.regs[VCPU_REGS_RBP];
+ save->rsi = svm->vcpu.arch.regs[VCPU_REGS_RSI];
+ save->rdi = svm->vcpu.arch.regs[VCPU_REGS_RDI];
+#ifdef CONFIG_X86_64
+ save->r8 = svm->vcpu.arch.regs[VCPU_REGS_R8];
+ save->r9 = svm->vcpu.arch.regs[VCPU_REGS_R9];
+ save->r10 = svm->vcpu.arch.regs[VCPU_REGS_R10];
+ save->r11 = svm->vcpu.arch.regs[VCPU_REGS_R11];
+ save->r12 = svm->vcpu.arch.regs[VCPU_REGS_R12];
+ save->r13 = svm->vcpu.arch.regs[VCPU_REGS_R13];
+ save->r14 = svm->vcpu.arch.regs[VCPU_REGS_R14];
+ save->r15 = svm->vcpu.arch.regs[VCPU_REGS_R15];
+#endif
+ save->rip = svm->vcpu.arch.regs[VCPU_REGS_RIP];
+
+ /* Sync some non-GPR registers before encrypting */
+ save->xcr0 = svm->vcpu.arch.xcr0;
+ save->pkru = svm->vcpu.arch.pkru;
+ save->xss = svm->vcpu.arch.ia32_xss;
+
+ /*
+ * SEV-ES will use a VMSA that is pointed to by the VMCB, not
+ * the traditional VMSA that is part of the VMCB. Copy the
+ * traditional VMSA as it has been built so far (in prep
+ * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state.
+ */
+ memcpy(svm->vmsa, save, sizeof(*save));
+
+ return 0;
+}
+
+static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_launch_update_vmsa *vmsa;
+ int i, ret;
+
+ if (!sev_es_guest(kvm))
+ return -ENOTTY;
+
+ vmsa = kzalloc(sizeof(*vmsa), GFP_KERNEL);
+ if (!vmsa)
+ return -ENOMEM;
+
+ for (i = 0; i < kvm->created_vcpus; i++) {
+ struct vcpu_svm *svm = to_svm(kvm->vcpus[i]);
+
+ /* Perform some pre-encryption checks against the VMSA */
+ ret = sev_es_sync_vmsa(svm);
+ if (ret)
+ goto e_free;
+
+ /*
+ * The LAUNCH_UPDATE_VMSA command will perform in-place
+ * encryption of the VMSA memory content (i.e it will write
+ * the same memory region with the guest's key), so invalidate
+ * it first.
+ */
+ clflush_cache_range(svm->vmsa, PAGE_SIZE);
+
+ vmsa->handle = sev->handle;
+ vmsa->address = __sme_pa(svm->vmsa);
+ vmsa->len = PAGE_SIZE;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, vmsa,
+ &argp->error);
+ if (ret)
+ goto e_free;
+
+ svm->vcpu.arch.guest_state_protected = true;
+ }
+
+e_free:
+ kfree(vmsa);
+ return ret;
+}
+
static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
void __user *measure = (void __user *)(uintptr_t)argp->data;
@@ -932,7 +1044,7 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
struct kvm_sev_cmd sev_cmd;
int r;
- if (!svm_sev_enabled())
+ if (!svm_sev_enabled() || !sev)
return -ENOTTY;
if (!argp)
@@ -947,12 +1059,18 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
case KVM_SEV_INIT:
r = sev_guest_init(kvm, &sev_cmd);
break;
+ case KVM_SEV_ES_INIT:
+ r = sev_es_guest_init(kvm, &sev_cmd);
+ break;
case KVM_SEV_LAUNCH_START:
r = sev_launch_start(kvm, &sev_cmd);
break;
case KVM_SEV_LAUNCH_UPDATE_DATA:
r = sev_launch_update_data(kvm, &sev_cmd);
break;
+ case KVM_SEV_LAUNCH_UPDATE_VMSA:
+ r = sev_launch_update_vmsa(kvm, &sev_cmd);
+ break;
case KVM_SEV_LAUNCH_MEASURE:
r = sev_launch_measure(kvm, &sev_cmd);
break;
@@ -1125,49 +1243,61 @@ void sev_vm_destroy(struct kvm *kvm)
sev_asid_free(sev->asid);
}
-int __init sev_hardware_setup(void)
+void __init sev_hardware_setup(void)
{
- struct sev_user_data_status *status;
- int rc;
+ unsigned int eax, ebx, ecx, edx;
+ bool sev_es_supported = false;
+ bool sev_supported = false;
+
+ /* Does the CPU support SEV? */
+ if (!boot_cpu_has(X86_FEATURE_SEV))
+ goto out;
+
+ /* Retrieve SEV CPUID information */
+ cpuid(0x8000001f, &eax, &ebx, &ecx, &edx);
+
+ /* Set encryption bit location for SEV-ES guests */
+ sev_enc_bit = ebx & 0x3f;
/* Maximum number of encrypted guests supported simultaneously */
- max_sev_asid = cpuid_ecx(0x8000001F);
+ max_sev_asid = ecx;
if (!svm_sev_enabled())
- return 1;
+ goto out;
/* Minimum ASID value that should be used for SEV guest */
- min_sev_asid = cpuid_edx(0x8000001F);
+ min_sev_asid = edx;
/* Initialize SEV ASID bitmaps */
sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
if (!sev_asid_bitmap)
- return 1;
+ goto out;
sev_reclaim_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
if (!sev_reclaim_asid_bitmap)
- return 1;
+ goto out;
- status = kmalloc(sizeof(*status), GFP_KERNEL);
- if (!status)
- return 1;
+ pr_info("SEV supported: %u ASIDs\n", max_sev_asid - min_sev_asid + 1);
+ sev_supported = true;
- /*
- * Check SEV platform status.
- *
- * PLATFORM_STATUS can be called in any state, if we failed to query
- * the PLATFORM status then either PSP firmware does not support SEV
- * feature or SEV firmware is dead.
- */
- rc = sev_platform_status(status, NULL);
- if (rc)
- goto err;
+ /* SEV-ES support requested? */
+ if (!sev_es)
+ goto out;
- pr_info("SEV supported\n");
+ /* Does the CPU support SEV-ES? */
+ if (!boot_cpu_has(X86_FEATURE_SEV_ES))
+ goto out;
-err:
- kfree(status);
- return rc;
+ /* Has the system been allocated ASIDs for SEV-ES? */
+ if (min_sev_asid == 1)
+ goto out;
+
+ pr_info("SEV-ES supported: %u ASIDs\n", min_sev_asid - 1);
+ sev_es_supported = true;
+
+out:
+ sev = sev_supported;
+ sev_es = sev_es_supported;
}
void sev_hardware_teardown(void)
@@ -1181,13 +1311,329 @@ void sev_hardware_teardown(void)
sev_flush_asids();
}
+/*
+ * Pages used by hardware to hold guest encrypted state must be flushed before
+ * returning them to the system.
+ */
+static void sev_flush_guest_memory(struct vcpu_svm *svm, void *va,
+ unsigned long len)
+{
+ /*
+ * If hardware enforced cache coherency for encrypted mappings of the
+ * same physical page is supported, nothing to do.
+ */
+ if (boot_cpu_has(X86_FEATURE_SME_COHERENT))
+ return;
+
+ /*
+ * If the VM Page Flush MSR is supported, use it to flush the page
+ * (using the page virtual address and the guest ASID).
+ */
+ if (boot_cpu_has(X86_FEATURE_VM_PAGE_FLUSH)) {
+ struct kvm_sev_info *sev;
+ unsigned long va_start;
+ u64 start, stop;
+
+ /* Align start and stop to page boundaries. */
+ va_start = (unsigned long)va;
+ start = (u64)va_start & PAGE_MASK;
+ stop = PAGE_ALIGN((u64)va_start + len);
+
+ if (start < stop) {
+ sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
+
+ while (start < stop) {
+ wrmsrl(MSR_AMD64_VM_PAGE_FLUSH,
+ start | sev->asid);
+
+ start += PAGE_SIZE;
+ }
+
+ return;
+ }
+
+ WARN(1, "Address overflow, using WBINVD\n");
+ }
+
+ /*
+ * Hardware should always have one of the above features,
+ * but if not, use WBINVD and issue a warning.
+ */
+ WARN_ONCE(1, "Using WBINVD to flush guest memory\n");
+ wbinvd_on_all_cpus();
+}
+
+void sev_free_vcpu(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm;
+
+ if (!sev_es_guest(vcpu->kvm))
+ return;
+
+ svm = to_svm(vcpu);
+
+ if (vcpu->arch.guest_state_protected)
+ sev_flush_guest_memory(svm, svm->vmsa, PAGE_SIZE);
+ __free_page(virt_to_page(svm->vmsa));
+
+ if (svm->ghcb_sa_free)
+ kfree(svm->ghcb_sa);
+}
+
+static void dump_ghcb(struct vcpu_svm *svm)
+{
+ struct ghcb *ghcb = svm->ghcb;
+ unsigned int nbits;
+
+ /* Re-use the dump_invalid_vmcb module parameter */
+ if (!dump_invalid_vmcb) {
+ pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
+ return;
+ }
+
+ nbits = sizeof(ghcb->save.valid_bitmap) * 8;
+
+ pr_err("GHCB (GPA=%016llx):\n", svm->vmcb->control.ghcb_gpa);
+ pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code",
+ ghcb->save.sw_exit_code, ghcb_sw_exit_code_is_valid(ghcb));
+ pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1",
+ ghcb->save.sw_exit_info_1, ghcb_sw_exit_info_1_is_valid(ghcb));
+ pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2",
+ ghcb->save.sw_exit_info_2, ghcb_sw_exit_info_2_is_valid(ghcb));
+ pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch",
+ ghcb->save.sw_scratch, ghcb_sw_scratch_is_valid(ghcb));
+ pr_err("%-20s%*pb\n", "valid_bitmap", nbits, ghcb->save.valid_bitmap);
+}
+
+static void sev_es_sync_to_ghcb(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct ghcb *ghcb = svm->ghcb;
+
+ /*
+ * The GHCB protocol so far allows for the following data
+ * to be returned:
+ * GPRs RAX, RBX, RCX, RDX
+ *
+ * Copy their values to the GHCB if they are dirty.
+ */
+ if (kvm_register_is_dirty(vcpu, VCPU_REGS_RAX))
+ ghcb_set_rax(ghcb, vcpu->arch.regs[VCPU_REGS_RAX]);
+ if (kvm_register_is_dirty(vcpu, VCPU_REGS_RBX))
+ ghcb_set_rbx(ghcb, vcpu->arch.regs[VCPU_REGS_RBX]);
+ if (kvm_register_is_dirty(vcpu, VCPU_REGS_RCX))
+ ghcb_set_rcx(ghcb, vcpu->arch.regs[VCPU_REGS_RCX]);
+ if (kvm_register_is_dirty(vcpu, VCPU_REGS_RDX))
+ ghcb_set_rdx(ghcb, vcpu->arch.regs[VCPU_REGS_RDX]);
+}
+
+static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct ghcb *ghcb = svm->ghcb;
+ u64 exit_code;
+
+ /*
+ * The GHCB protocol so far allows for the following data
+ * to be supplied:
+ * GPRs RAX, RBX, RCX, RDX
+ * XCR0
+ * CPL
+ *
+ * VMMCALL allows the guest to provide extra registers. KVM also
+ * expects RSI for hypercalls, so include that, too.
+ *
+ * Copy their values to the appropriate location if supplied.
+ */
+ memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
+
+ vcpu->arch.regs[VCPU_REGS_RAX] = ghcb_get_rax_if_valid(ghcb);
+ vcpu->arch.regs[VCPU_REGS_RBX] = ghcb_get_rbx_if_valid(ghcb);
+ vcpu->arch.regs[VCPU_REGS_RCX] = ghcb_get_rcx_if_valid(ghcb);
+ vcpu->arch.regs[VCPU_REGS_RDX] = ghcb_get_rdx_if_valid(ghcb);
+ vcpu->arch.regs[VCPU_REGS_RSI] = ghcb_get_rsi_if_valid(ghcb);
+
+ svm->vmcb->save.cpl = ghcb_get_cpl_if_valid(ghcb);
+
+ if (ghcb_xcr0_is_valid(ghcb)) {
+ vcpu->arch.xcr0 = ghcb_get_xcr0(ghcb);
+ kvm_update_cpuid_runtime(vcpu);
+ }
+
+ /* Copy the GHCB exit information into the VMCB fields */
+ exit_code = ghcb_get_sw_exit_code(ghcb);
+ control->exit_code = lower_32_bits(exit_code);
+ control->exit_code_hi = upper_32_bits(exit_code);
+ control->exit_info_1 = ghcb_get_sw_exit_info_1(ghcb);
+ control->exit_info_2 = ghcb_get_sw_exit_info_2(ghcb);
+
+ /* Clear the valid entries fields */
+ memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
+}
+
+static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu;
+ struct ghcb *ghcb;
+ u64 exit_code = 0;
+
+ ghcb = svm->ghcb;
+
+ /* Only GHCB Usage code 0 is supported */
+ if (ghcb->ghcb_usage)
+ goto vmgexit_err;
+
+ /*
+ * Retrieve the exit code now even though is may not be marked valid
+ * as it could help with debugging.
+ */
+ exit_code = ghcb_get_sw_exit_code(ghcb);
+
+ if (!ghcb_sw_exit_code_is_valid(ghcb) ||
+ !ghcb_sw_exit_info_1_is_valid(ghcb) ||
+ !ghcb_sw_exit_info_2_is_valid(ghcb))
+ goto vmgexit_err;
+
+ switch (ghcb_get_sw_exit_code(ghcb)) {
+ case SVM_EXIT_READ_DR7:
+ break;
+ case SVM_EXIT_WRITE_DR7:
+ if (!ghcb_rax_is_valid(ghcb))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_RDTSC:
+ break;
+ case SVM_EXIT_RDPMC:
+ if (!ghcb_rcx_is_valid(ghcb))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_CPUID:
+ if (!ghcb_rax_is_valid(ghcb) ||
+ !ghcb_rcx_is_valid(ghcb))
+ goto vmgexit_err;
+ if (ghcb_get_rax(ghcb) == 0xd)
+ if (!ghcb_xcr0_is_valid(ghcb))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_INVD:
+ break;
+ case SVM_EXIT_IOIO:
+ if (ghcb_get_sw_exit_info_1(ghcb) & SVM_IOIO_STR_MASK) {
+ if (!ghcb_sw_scratch_is_valid(ghcb))
+ goto vmgexit_err;
+ } else {
+ if (!(ghcb_get_sw_exit_info_1(ghcb) & SVM_IOIO_TYPE_MASK))
+ if (!ghcb_rax_is_valid(ghcb))
+ goto vmgexit_err;
+ }
+ break;
+ case SVM_EXIT_MSR:
+ if (!ghcb_rcx_is_valid(ghcb))
+ goto vmgexit_err;
+ if (ghcb_get_sw_exit_info_1(ghcb)) {
+ if (!ghcb_rax_is_valid(ghcb) ||
+ !ghcb_rdx_is_valid(ghcb))
+ goto vmgexit_err;
+ }
+ break;
+ case SVM_EXIT_VMMCALL:
+ if (!ghcb_rax_is_valid(ghcb) ||
+ !ghcb_cpl_is_valid(ghcb))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_RDTSCP:
+ break;
+ case SVM_EXIT_WBINVD:
+ break;
+ case SVM_EXIT_MONITOR:
+ if (!ghcb_rax_is_valid(ghcb) ||
+ !ghcb_rcx_is_valid(ghcb) ||
+ !ghcb_rdx_is_valid(ghcb))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_MWAIT:
+ if (!ghcb_rax_is_valid(ghcb) ||
+ !ghcb_rcx_is_valid(ghcb))
+ goto vmgexit_err;
+ break;
+ case SVM_VMGEXIT_MMIO_READ:
+ case SVM_VMGEXIT_MMIO_WRITE:
+ if (!ghcb_sw_scratch_is_valid(ghcb))
+ goto vmgexit_err;
+ break;
+ case SVM_VMGEXIT_NMI_COMPLETE:
+ case SVM_VMGEXIT_AP_JUMP_TABLE:
+ case SVM_VMGEXIT_UNSUPPORTED_EVENT:
+ break;
+ default:
+ goto vmgexit_err;
+ }
+
+ return 0;
+
+vmgexit_err:
+ vcpu = &svm->vcpu;
+
+ if (ghcb->ghcb_usage) {
+ vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n",
+ ghcb->ghcb_usage);
+ } else {
+ vcpu_unimpl(vcpu, "vmgexit: exit reason %#llx is not valid\n",
+ exit_code);
+ dump_ghcb(svm);
+ }
+
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
+ vcpu->run->internal.ndata = 2;
+ vcpu->run->internal.data[0] = exit_code;
+ vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
+
+ return -EINVAL;
+}
+
+static void pre_sev_es_run(struct vcpu_svm *svm)
+{
+ if (!svm->ghcb)
+ return;
+
+ if (svm->ghcb_sa_free) {
+ /*
+ * The scratch area lives outside the GHCB, so there is a
+ * buffer that, depending on the operation performed, may
+ * need to be synced, then freed.
+ */
+ if (svm->ghcb_sa_sync) {
+ kvm_write_guest(svm->vcpu.kvm,
+ ghcb_get_sw_scratch(svm->ghcb),
+ svm->ghcb_sa, svm->ghcb_sa_len);
+ svm->ghcb_sa_sync = false;
+ }
+
+ kfree(svm->ghcb_sa);
+ svm->ghcb_sa = NULL;
+ svm->ghcb_sa_free = false;
+ }
+
+ trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->ghcb);
+
+ sev_es_sync_to_ghcb(svm);
+
+ kvm_vcpu_unmap(&svm->vcpu, &svm->ghcb_map, true);
+ svm->ghcb = NULL;
+}
+
void pre_sev_run(struct vcpu_svm *svm, int cpu)
{
struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
int asid = sev_get_asid(svm->vcpu.kvm);
+ /* Perform any SEV-ES pre-run actions */
+ pre_sev_es_run(svm);
+
/* Assign the asid allocated with this SEV guest */
- svm->vmcb->control.asid = asid;
+ svm->asid = asid;
/*
* Flush guest TLB:
@@ -1203,3 +1649,394 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu)
svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
}
+
+#define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE)
+static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
+{
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct ghcb *ghcb = svm->ghcb;
+ u64 ghcb_scratch_beg, ghcb_scratch_end;
+ u64 scratch_gpa_beg, scratch_gpa_end;
+ void *scratch_va;
+
+ scratch_gpa_beg = ghcb_get_sw_scratch(ghcb);
+ if (!scratch_gpa_beg) {
+ pr_err("vmgexit: scratch gpa not provided\n");
+ return false;
+ }
+
+ scratch_gpa_end = scratch_gpa_beg + len;
+ if (scratch_gpa_end < scratch_gpa_beg) {
+ pr_err("vmgexit: scratch length (%#llx) not valid for scratch address (%#llx)\n",
+ len, scratch_gpa_beg);
+ return false;
+ }
+
+ if ((scratch_gpa_beg & PAGE_MASK) == control->ghcb_gpa) {
+ /* Scratch area begins within GHCB */
+ ghcb_scratch_beg = control->ghcb_gpa +
+ offsetof(struct ghcb, shared_buffer);
+ ghcb_scratch_end = control->ghcb_gpa +
+ offsetof(struct ghcb, reserved_1);
+
+ /*
+ * If the scratch area begins within the GHCB, it must be
+ * completely contained in the GHCB shared buffer area.
+ */
+ if (scratch_gpa_beg < ghcb_scratch_beg ||
+ scratch_gpa_end > ghcb_scratch_end) {
+ pr_err("vmgexit: scratch area is outside of GHCB shared buffer area (%#llx - %#llx)\n",
+ scratch_gpa_beg, scratch_gpa_end);
+ return false;
+ }
+
+ scratch_va = (void *)svm->ghcb;
+ scratch_va += (scratch_gpa_beg - control->ghcb_gpa);
+ } else {
+ /*
+ * The guest memory must be read into a kernel buffer, so
+ * limit the size
+ */
+ if (len > GHCB_SCRATCH_AREA_LIMIT) {
+ pr_err("vmgexit: scratch area exceeds KVM limits (%#llx requested, %#llx limit)\n",
+ len, GHCB_SCRATCH_AREA_LIMIT);
+ return false;
+ }
+ scratch_va = kzalloc(len, GFP_KERNEL);
+ if (!scratch_va)
+ return false;
+
+ if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) {
+ /* Unable to copy scratch area from guest */
+ pr_err("vmgexit: kvm_read_guest for scratch area failed\n");
+
+ kfree(scratch_va);
+ return false;
+ }
+
+ /*
+ * The scratch area is outside the GHCB. The operation will
+ * dictate whether the buffer needs to be synced before running
+ * the vCPU next time (i.e. a read was requested so the data
+ * must be written back to the guest memory).
+ */
+ svm->ghcb_sa_sync = sync;
+ svm->ghcb_sa_free = true;
+ }
+
+ svm->ghcb_sa = scratch_va;
+ svm->ghcb_sa_len = len;
+
+ return true;
+}
+
+static void set_ghcb_msr_bits(struct vcpu_svm *svm, u64 value, u64 mask,
+ unsigned int pos)
+{
+ svm->vmcb->control.ghcb_gpa &= ~(mask << pos);
+ svm->vmcb->control.ghcb_gpa |= (value & mask) << pos;
+}
+
+static u64 get_ghcb_msr_bits(struct vcpu_svm *svm, u64 mask, unsigned int pos)
+{
+ return (svm->vmcb->control.ghcb_gpa >> pos) & mask;
+}
+
+static void set_ghcb_msr(struct vcpu_svm *svm, u64 value)
+{
+ svm->vmcb->control.ghcb_gpa = value;
+}
+
+static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ u64 ghcb_info;
+ int ret = 1;
+
+ ghcb_info = control->ghcb_gpa & GHCB_MSR_INFO_MASK;
+
+ trace_kvm_vmgexit_msr_protocol_enter(svm->vcpu.vcpu_id,
+ control->ghcb_gpa);
+
+ switch (ghcb_info) {
+ case GHCB_MSR_SEV_INFO_REQ:
+ set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX,
+ GHCB_VERSION_MIN,
+ sev_enc_bit));
+ break;
+ case GHCB_MSR_CPUID_REQ: {
+ u64 cpuid_fn, cpuid_reg, cpuid_value;
+
+ cpuid_fn = get_ghcb_msr_bits(svm,
+ GHCB_MSR_CPUID_FUNC_MASK,
+ GHCB_MSR_CPUID_FUNC_POS);
+
+ /* Initialize the registers needed by the CPUID intercept */
+ vcpu->arch.regs[VCPU_REGS_RAX] = cpuid_fn;
+ vcpu->arch.regs[VCPU_REGS_RCX] = 0;
+
+ ret = svm_invoke_exit_handler(svm, SVM_EXIT_CPUID);
+ if (!ret) {
+ ret = -EINVAL;
+ break;
+ }
+
+ cpuid_reg = get_ghcb_msr_bits(svm,
+ GHCB_MSR_CPUID_REG_MASK,
+ GHCB_MSR_CPUID_REG_POS);
+ if (cpuid_reg == 0)
+ cpuid_value = vcpu->arch.regs[VCPU_REGS_RAX];
+ else if (cpuid_reg == 1)
+ cpuid_value = vcpu->arch.regs[VCPU_REGS_RBX];
+ else if (cpuid_reg == 2)
+ cpuid_value = vcpu->arch.regs[VCPU_REGS_RCX];
+ else
+ cpuid_value = vcpu->arch.regs[VCPU_REGS_RDX];
+
+ set_ghcb_msr_bits(svm, cpuid_value,
+ GHCB_MSR_CPUID_VALUE_MASK,
+ GHCB_MSR_CPUID_VALUE_POS);
+
+ set_ghcb_msr_bits(svm, GHCB_MSR_CPUID_RESP,
+ GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ }
+ case GHCB_MSR_TERM_REQ: {
+ u64 reason_set, reason_code;
+
+ reason_set = get_ghcb_msr_bits(svm,
+ GHCB_MSR_TERM_REASON_SET_MASK,
+ GHCB_MSR_TERM_REASON_SET_POS);
+ reason_code = get_ghcb_msr_bits(svm,
+ GHCB_MSR_TERM_REASON_MASK,
+ GHCB_MSR_TERM_REASON_POS);
+ pr_info("SEV-ES guest requested termination: %#llx:%#llx\n",
+ reason_set, reason_code);
+ fallthrough;
+ }
+ default:
+ ret = -EINVAL;
+ }
+
+ trace_kvm_vmgexit_msr_protocol_exit(svm->vcpu.vcpu_id,
+ control->ghcb_gpa, ret);
+
+ return ret;
+}
+
+int sev_handle_vmgexit(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ u64 ghcb_gpa, exit_code;
+ struct ghcb *ghcb;
+ int ret;
+
+ /* Validate the GHCB */
+ ghcb_gpa = control->ghcb_gpa;
+ if (ghcb_gpa & GHCB_MSR_INFO_MASK)
+ return sev_handle_vmgexit_msr_protocol(svm);
+
+ if (!ghcb_gpa) {
+ vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB gpa is not set\n");
+ return -EINVAL;
+ }
+
+ if (kvm_vcpu_map(&svm->vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->ghcb_map)) {
+ /* Unable to map GHCB from guest */
+ vcpu_unimpl(&svm->vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
+ ghcb_gpa);
+ return -EINVAL;
+ }
+
+ svm->ghcb = svm->ghcb_map.hva;
+ ghcb = svm->ghcb_map.hva;
+
+ trace_kvm_vmgexit_enter(svm->vcpu.vcpu_id, ghcb);
+
+ exit_code = ghcb_get_sw_exit_code(ghcb);
+
+ ret = sev_es_validate_vmgexit(svm);
+ if (ret)
+ return ret;
+
+ sev_es_sync_from_ghcb(svm);
+ ghcb_set_sw_exit_info_1(ghcb, 0);
+ ghcb_set_sw_exit_info_2(ghcb, 0);
+
+ ret = -EINVAL;
+ switch (exit_code) {
+ case SVM_VMGEXIT_MMIO_READ:
+ if (!setup_vmgexit_scratch(svm, true, control->exit_info_2))
+ break;
+
+ ret = kvm_sev_es_mmio_read(&svm->vcpu,
+ control->exit_info_1,
+ control->exit_info_2,
+ svm->ghcb_sa);
+ break;
+ case SVM_VMGEXIT_MMIO_WRITE:
+ if (!setup_vmgexit_scratch(svm, false, control->exit_info_2))
+ break;
+
+ ret = kvm_sev_es_mmio_write(&svm->vcpu,
+ control->exit_info_1,
+ control->exit_info_2,
+ svm->ghcb_sa);
+ break;
+ case SVM_VMGEXIT_NMI_COMPLETE:
+ ret = svm_invoke_exit_handler(svm, SVM_EXIT_IRET);
+ break;
+ case SVM_VMGEXIT_AP_JUMP_TABLE: {
+ struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
+
+ switch (control->exit_info_1) {
+ case 0:
+ /* Set AP jump table address */
+ sev->ap_jump_table = control->exit_info_2;
+ break;
+ case 1:
+ /* Get AP jump table address */
+ ghcb_set_sw_exit_info_2(ghcb, sev->ap_jump_table);
+ break;
+ default:
+ pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n",
+ control->exit_info_1);
+ ghcb_set_sw_exit_info_1(ghcb, 1);
+ ghcb_set_sw_exit_info_2(ghcb,
+ X86_TRAP_UD |
+ SVM_EVTINJ_TYPE_EXEPT |
+ SVM_EVTINJ_VALID);
+ }
+
+ ret = 1;
+ break;
+ }
+ case SVM_VMGEXIT_UNSUPPORTED_EVENT:
+ vcpu_unimpl(&svm->vcpu,
+ "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
+ control->exit_info_1, control->exit_info_2);
+ break;
+ default:
+ ret = svm_invoke_exit_handler(svm, exit_code);
+ }
+
+ return ret;
+}
+
+int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
+{
+ if (!setup_vmgexit_scratch(svm, in, svm->vmcb->control.exit_info_2))
+ return -EINVAL;
+
+ return kvm_sev_es_string_io(&svm->vcpu, size, port,
+ svm->ghcb_sa, svm->ghcb_sa_len, in);
+}
+
+void sev_es_init_vmcb(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE;
+ svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
+
+ /*
+ * An SEV-ES guest requires a VMSA area that is a separate from the
+ * VMCB page. Do not include the encryption mask on the VMSA physical
+ * address since hardware will access it using the guest key.
+ */
+ svm->vmcb->control.vmsa_pa = __pa(svm->vmsa);
+
+ /* Can't intercept CR register access, HV can't modify CR registers */
+ svm_clr_intercept(svm, INTERCEPT_CR0_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR4_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR8_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
+ svm_clr_intercept(svm, INTERCEPT_CR4_WRITE);
+ svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
+
+ svm_clr_intercept(svm, INTERCEPT_SELECTIVE_CR0);
+
+ /* Track EFER/CR register changes */
+ svm_set_intercept(svm, TRAP_EFER_WRITE);
+ svm_set_intercept(svm, TRAP_CR0_WRITE);
+ svm_set_intercept(svm, TRAP_CR4_WRITE);
+ svm_set_intercept(svm, TRAP_CR8_WRITE);
+
+ /* No support for enable_vmware_backdoor */
+ clr_exception_intercept(svm, GP_VECTOR);
+
+ /* Can't intercept XSETBV, HV can't modify XCR0 directly */
+ svm_clr_intercept(svm, INTERCEPT_XSETBV);
+
+ /* Clear intercepts on selected MSRs */
+ set_msr_interception(vcpu, svm->msrpm, MSR_EFER, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_CR_PAT, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
+}
+
+void sev_es_create_vcpu(struct vcpu_svm *svm)
+{
+ /*
+ * Set the GHCB MSR value as per the GHCB specification when creating
+ * a vCPU for an SEV-ES guest.
+ */
+ set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX,
+ GHCB_VERSION_MIN,
+ sev_enc_bit));
+}
+
+void sev_es_vcpu_load(struct vcpu_svm *svm, int cpu)
+{
+ struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+ struct vmcb_save_area *hostsa;
+ unsigned int i;
+
+ /*
+ * As an SEV-ES guest, hardware will restore the host state on VMEXIT,
+ * of which one step is to perform a VMLOAD. Since hardware does not
+ * perform a VMSAVE on VMRUN, the host savearea must be updated.
+ */
+ asm volatile(__ex("vmsave") : : "a" (__sme_page_pa(sd->save_area)) : "memory");
+
+ /*
+ * Certain MSRs are restored on VMEXIT, only save ones that aren't
+ * restored.
+ */
+ for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
+ if (host_save_user_msrs[i].sev_es_restored)
+ continue;
+
+ rdmsrl(host_save_user_msrs[i].index, svm->host_user_msrs[i]);
+ }
+
+ /* XCR0 is restored on VMEXIT, save the current host value */
+ hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400);
+ hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+
+ /* PKRU is restored on VMEXIT, save the curent host value */
+ hostsa->pkru = read_pkru();
+
+ /* MSR_IA32_XSS is restored on VMEXIT, save the currnet host value */
+ hostsa->xss = host_xss;
+}
+
+void sev_es_vcpu_put(struct vcpu_svm *svm)
+{
+ unsigned int i;
+
+ /*
+ * Certain MSRs are restored on VMEXIT and were saved with vmsave in
+ * sev_es_vcpu_load() above. Only restore ones that weren't.
+ */
+ for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
+ if (host_save_user_msrs[i].sev_es_restored)
+ continue;
+
+ wrmsrl(host_save_user_msrs[i].index, svm->host_user_msrs[i]);
+ }
+}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index da7eb4aaf44f..cce0143a6f80 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -33,9 +33,9 @@
#include <asm/debugreg.h>
#include <asm/kvm_para.h>
#include <asm/irq_remapping.h>
-#include <asm/mce.h>
#include <asm/spec-ctrl.h>
#include <asm/cpu_device_id.h>
+#include <asm/traps.h>
#include <asm/virtext.h>
#include "trace.h"
@@ -90,7 +90,7 @@ static DEFINE_PER_CPU(u64, current_tsc_ratio);
static const struct svm_direct_access_msrs {
u32 index; /* Index of the MSR */
- bool always; /* True if intercept is always on */
+ bool always; /* True if intercept is initially cleared */
} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
{ .index = MSR_STAR, .always = true },
{ .index = MSR_IA32_SYSENTER_CS, .always = true },
@@ -108,6 +108,9 @@ static const struct svm_direct_access_msrs {
{ .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
{ .index = MSR_IA32_LASTINTFROMIP, .always = false },
{ .index = MSR_IA32_LASTINTTOIP, .always = false },
+ { .index = MSR_EFER, .always = false },
+ { .index = MSR_IA32_CR_PAT, .always = false },
+ { .index = MSR_AMD64_SEV_ES_GHCB, .always = true },
{ .index = MSR_INVALID, .always = false },
};
@@ -187,10 +190,14 @@ static int vgif = true;
module_param(vgif, int, 0444);
/* enable/disable SEV support */
-static int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT);
+int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT);
module_param(sev, int, 0444);
-static bool __read_mostly dump_invalid_vmcb = 0;
+/* enable/disable SEV-ES support */
+int sev_es = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT);
+module_param(sev_es, int, 0444);
+
+bool __read_mostly dump_invalid_vmcb;
module_param(dump_invalid_vmcb, bool, 0644);
static u8 rsm_ins_bytes[] = "\x0f\xaa";
@@ -336,6 +343,13 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ /*
+ * SEV-ES does not expose the next RIP. The RIP update is controlled by
+ * the type of exit and the #VC handler in the guest.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ goto done;
+
if (nrips && svm->vmcb->control.next_rip != 0) {
WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
svm->next_rip = svm->vmcb->control.next_rip;
@@ -347,6 +361,8 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
} else {
kvm_rip_write(vcpu, svm->next_rip);
}
+
+done:
svm_set_interrupt_shadow(vcpu, 0);
return 1;
@@ -484,7 +500,7 @@ static int svm_hardware_enable(void)
wrmsrl(MSR_EFER, efer | EFER_SVME);
- wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
+ wrmsrl(MSR_VM_HSAVE_PA, __sme_page_pa(sd->save_area));
if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
@@ -552,6 +568,7 @@ static int svm_cpu_init(int cpu)
sd->save_area = alloc_page(GFP_KERNEL);
if (!sd->save_area)
goto free_cpu_data;
+ clear_page(page_address(sd->save_area));
if (svm_sev_enabled()) {
sd->sev_vmcbs = kmalloc_array(max_sev_asid + 1,
@@ -662,8 +679,8 @@ static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
msrpm[offset] = tmp;
}
-static void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
- int read, int write)
+void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
+ int read, int write)
{
set_shadow_msr_intercept(vcpu, msr, read, write);
set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
@@ -959,15 +976,11 @@ static __init int svm_hardware_setup(void)
kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
}
- if (sev) {
- if (boot_cpu_has(X86_FEATURE_SEV) &&
- IS_ENABLED(CONFIG_KVM_AMD_SEV)) {
- r = sev_hardware_setup();
- if (r)
- sev = false;
- } else {
- sev = false;
- }
+ if (IS_ENABLED(CONFIG_KVM_AMD_SEV) && sev) {
+ sev_hardware_setup();
+ } else {
+ sev = false;
+ sev_es = false;
}
svm_adjust_mmio_mask();
@@ -1215,6 +1228,7 @@ static void init_vmcb(struct vcpu_svm *svm)
save->cr4 = 0;
}
svm->asid_generation = 0;
+ svm->asid = 0;
svm->nested.vmcb12_gpa = 0;
svm->vcpu.arch.hflags = 0;
@@ -1252,6 +1266,11 @@ static void init_vmcb(struct vcpu_svm *svm)
if (sev_guest(svm->vcpu.kvm)) {
svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
clr_exception_intercept(svm, UD_VECTOR);
+
+ if (sev_es_guest(svm->vcpu.kvm)) {
+ /* Perform SEV-ES specific VMCB updates */
+ sev_es_init_vmcb(svm);
+ }
}
vmcb_mark_all_dirty(svm->vmcb);
@@ -1288,6 +1307,7 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm;
struct page *vmcb_page;
+ struct page *vmsa_page = NULL;
int err;
BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
@@ -1298,9 +1318,27 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
if (!vmcb_page)
goto out;
+ if (sev_es_guest(svm->vcpu.kvm)) {
+ /*
+ * SEV-ES guests require a separate VMSA page used to contain
+ * the encrypted register state of the guest.
+ */
+ vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!vmsa_page)
+ goto error_free_vmcb_page;
+
+ /*
+ * SEV-ES guests maintain an encrypted version of their FPU
+ * state which is restored and saved on VMRUN and VMEXIT.
+ * Free the fpu structure to prevent KVM from attempting to
+ * access the FPU state.
+ */
+ kvm_free_guest_fpu(vcpu);
+ }
+
err = avic_init_vcpu(svm);
if (err)
- goto error_free_vmcb_page;
+ goto error_free_vmsa_page;
/* We initialize this flag to true to make sure that the is_running
* bit would be set the first time the vcpu is loaded.
@@ -1311,21 +1349,32 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
svm->msrpm = svm_vcpu_alloc_msrpm();
if (!svm->msrpm) {
err = -ENOMEM;
- goto error_free_vmcb_page;
+ goto error_free_vmsa_page;
}
svm_vcpu_init_msrpm(vcpu, svm->msrpm);
svm->vmcb = page_address(vmcb_page);
svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT);
+
+ if (vmsa_page)
+ svm->vmsa = page_address(vmsa_page);
+
svm->asid_generation = 0;
init_vmcb(svm);
svm_init_osvw(vcpu);
vcpu->arch.microcode_version = 0x01000065;
+ if (sev_es_guest(svm->vcpu.kvm))
+ /* Perform SEV-ES specific VMCB creation updates */
+ sev_es_create_vcpu(svm);
+
return 0;
+error_free_vmsa_page:
+ if (vmsa_page)
+ __free_page(vmsa_page);
error_free_vmcb_page:
__free_page(vmcb_page);
out:
@@ -1353,6 +1402,8 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
svm_free_nested(svm);
+ sev_free_vcpu(vcpu);
+
__free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
}
@@ -1368,15 +1419,20 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
vmcb_mark_all_dirty(svm->vmcb);
}
+ if (sev_es_guest(svm->vcpu.kvm)) {
+ sev_es_vcpu_load(svm, cpu);
+ } else {
#ifdef CONFIG_X86_64
- rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
+ rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
#endif
- savesegment(fs, svm->host.fs);
- savesegment(gs, svm->host.gs);
- svm->host.ldt = kvm_read_ldt();
+ savesegment(fs, svm->host.fs);
+ savesegment(gs, svm->host.gs);
+ svm->host.ldt = kvm_read_ldt();
- for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
- rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+ for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
+ rdmsrl(host_save_user_msrs[i].index,
+ svm->host_user_msrs[i]);
+ }
if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
@@ -1404,18 +1460,24 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
avic_vcpu_put(vcpu);
++vcpu->stat.host_state_reload;
- kvm_load_ldt(svm->host.ldt);
+ if (sev_es_guest(svm->vcpu.kvm)) {
+ sev_es_vcpu_put(svm);
+ } else {
+ kvm_load_ldt(svm->host.ldt);
#ifdef CONFIG_X86_64
- loadsegment(fs, svm->host.fs);
- wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase);
- load_gs_index(svm->host.gs);
+ loadsegment(fs, svm->host.fs);
+ wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase);
+ load_gs_index(svm->host.gs);
#else
#ifdef CONFIG_X86_32_LAZY_GS
- loadsegment(gs, svm->host.gs);
+ loadsegment(gs, svm->host.gs);
#endif
#endif
- for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
- wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+
+ for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
+ wrmsrl(host_save_user_msrs[i].index,
+ svm->host_user_msrs[i]);
+ }
}
static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -1633,9 +1695,18 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
static void update_cr0_intercept(struct vcpu_svm *svm)
{
- ulong gcr0 = svm->vcpu.arch.cr0;
- u64 *hcr0 = &svm->vmcb->save.cr0;
+ ulong gcr0;
+ u64 *hcr0;
+ /*
+ * SEV-ES guests must always keep the CR intercepts cleared. CR
+ * tracking is done using the CR write traps.
+ */
+ if (sev_es_guest(svm->vcpu.kvm))
+ return;
+
+ gcr0 = svm->vcpu.arch.cr0;
+ hcr0 = &svm->vmcb->save.cr0;
*hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
| (gcr0 & SVM_CR0_SELECTIVE_MASK);
@@ -1655,7 +1726,7 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
struct vcpu_svm *svm = to_svm(vcpu);
#ifdef CONFIG_X86_64
- if (vcpu->arch.efer & EFER_LME) {
+ if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
vcpu->arch.efer |= EFER_LMA;
svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
@@ -1684,13 +1755,15 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
update_cr0_intercept(svm);
}
-int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
- unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
- unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
+ return true;
+}
- if (cr4 & X86_CR4_VMXE)
- return 1;
+void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
+ unsigned long old_cr4 = vcpu->arch.cr4;
if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
svm_flush_tlb(vcpu);
@@ -1701,7 +1774,9 @@ int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
cr4 |= host_cr4_mce;
to_svm(vcpu)->vmcb->save.cr4 = cr4;
vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
- return 0;
+
+ if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
+ kvm_update_cpuid_runtime(vcpu);
}
static void svm_set_segment(struct kvm_vcpu *vcpu,
@@ -1753,18 +1828,20 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
++sd->asid_generation;
sd->next_asid = sd->min_asid;
svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
+ vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
}
svm->asid_generation = sd->asid_generation;
- svm->vmcb->control.asid = sd->next_asid++;
-
- vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
+ svm->asid = sd->next_asid++;
}
static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
{
struct vmcb *vmcb = svm->vmcb;
+ if (svm->vcpu.arch.guest_state_protected)
+ return;
+
if (unlikely(value != vmcb->save.dr6)) {
vmcb->save.dr6 = value;
vmcb_mark_dirty(vmcb, VMCB_DR);
@@ -1775,6 +1852,9 @@ static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ if (vcpu->arch.guest_state_protected)
+ return;
+
get_debugreg(vcpu->arch.db[0], 0);
get_debugreg(vcpu->arch.db[1], 1);
get_debugreg(vcpu->arch.db[2], 2);
@@ -1793,6 +1873,9 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ if (vcpu->arch.guest_state_protected)
+ return;
+
svm->vmcb->save.dr7 = value;
vmcb_mark_dirty(svm->vmcb, VMCB_DR);
}
@@ -1931,25 +2014,6 @@ static bool is_erratum_383(void)
return true;
}
-/*
- * Trigger machine check on the host. We assume all the MSRs are already set up
- * by the CPU and that we still run on the same CPU as the MCE occurred on.
- * We pass a fake environment to the machine check handler because we want
- * the guest to be always treated like user space, no matter what context
- * it used internally.
- */
-static void kvm_machine_check(void)
-{
-#if defined(CONFIG_X86_MCE)
- struct pt_regs regs = {
- .cs = 3, /* Fake ring 3 no matter what the guest ran on */
- .flags = X86_EFLAGS_IF,
- };
-
- do_machine_check(&regs);
-#endif
-}
-
static void svm_handle_mce(struct vcpu_svm *svm)
{
if (is_erratum_383()) {
@@ -1981,6 +2045,13 @@ static int shutdown_interception(struct vcpu_svm *svm)
struct kvm_run *kvm_run = svm->vcpu.run;
/*
+ * The VM save area has already been encrypted so it
+ * cannot be reinitialized - just terminate.
+ */
+ if (sev_es_guest(svm->vcpu.kvm))
+ return -EINVAL;
+
+ /*
* VMCB is undefined after a SHUTDOWN intercept
* so reinitialize it.
*/
@@ -2001,11 +2072,16 @@ static int io_interception(struct vcpu_svm *svm)
++svm->vcpu.stat.io_exits;
string = (io_info & SVM_IOIO_STR_MASK) != 0;
in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
- if (string)
- return kvm_emulate_instruction(vcpu, 0);
-
port = io_info >> 16;
size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
+
+ if (string) {
+ if (sev_es_guest(vcpu->kvm))
+ return sev_es_string_io(svm, size, port, in);
+ else
+ return kvm_emulate_instruction(vcpu, 0);
+ }
+
svm->next_rip = svm->vmcb->control.exit_info_2;
return kvm_fast_pio(&svm->vcpu, size, port, in);
@@ -2269,9 +2345,11 @@ static int cpuid_interception(struct vcpu_svm *svm)
static int iret_interception(struct vcpu_svm *svm)
{
++svm->vcpu.stat.nmi_window_exits;
- svm_clr_intercept(svm, INTERCEPT_IRET);
svm->vcpu.arch.hflags |= HF_IRET_MASK;
- svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
+ if (!sev_es_guest(svm->vcpu.kvm)) {
+ svm_clr_intercept(svm, INTERCEPT_IRET);
+ svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
+ }
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
return 1;
}
@@ -2408,6 +2486,41 @@ static int cr_interception(struct vcpu_svm *svm)
return kvm_complete_insn_gp(&svm->vcpu, err);
}
+static int cr_trap(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ unsigned long old_value, new_value;
+ unsigned int cr;
+ int ret = 0;
+
+ new_value = (unsigned long)svm->vmcb->control.exit_info_1;
+
+ cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP;
+ switch (cr) {
+ case 0:
+ old_value = kvm_read_cr0(vcpu);
+ svm_set_cr0(vcpu, new_value);
+
+ kvm_post_set_cr0(vcpu, old_value, new_value);
+ break;
+ case 4:
+ old_value = kvm_read_cr4(vcpu);
+ svm_set_cr4(vcpu, new_value);
+
+ kvm_post_set_cr4(vcpu, old_value, new_value);
+ break;
+ case 8:
+ ret = kvm_set_cr8(&svm->vcpu, new_value);
+ break;
+ default:
+ WARN(1, "unhandled CR%d write trap", cr);
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ return kvm_complete_insn_gp(vcpu, ret);
+}
+
static int dr_interception(struct vcpu_svm *svm)
{
int reg, dr;
@@ -2461,6 +2574,25 @@ static int cr8_write_interception(struct vcpu_svm *svm)
return 0;
}
+static int efer_trap(struct vcpu_svm *svm)
+{
+ struct msr_data msr_info;
+ int ret;
+
+ /*
+ * Clear the EFER_SVME bit from EFER. The SVM code always sets this
+ * bit in svm_set_efer(), but __kvm_valid_efer() checks it against
+ * whether the guest has X86_FEATURE_SVM - this avoids a failure if
+ * the guest doesn't have X86_FEATURE_SVM.
+ */
+ msr_info.host_initiated = false;
+ msr_info.index = MSR_EFER;
+ msr_info.data = svm->vmcb->control.exit_info_1 & ~EFER_SVME;
+ ret = kvm_set_msr_common(&svm->vcpu, &msr_info);
+
+ return kvm_complete_insn_gp(&svm->vcpu, ret);
+}
+
static int svm_get_msr_feature(struct kvm_msr_entry *msr)
{
msr->data = 0;
@@ -2543,10 +2675,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_SPEC_CTRL:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
- !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) &&
- !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) &&
- !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
+ !guest_has_spec_ctrl_msr(vcpu))
return 1;
msr_info->data = svm->spec_ctrl;
@@ -2584,6 +2713,20 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 0;
}
+static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ if (!sev_es_guest(svm->vcpu.kvm) || !err)
+ return kvm_complete_insn_gp(&svm->vcpu, err);
+
+ ghcb_set_sw_exit_info_1(svm->ghcb, 1);
+ ghcb_set_sw_exit_info_2(svm->ghcb,
+ X86_TRAP_GP |
+ SVM_EVTINJ_TYPE_EXEPT |
+ SVM_EVTINJ_VALID);
+ return 1;
+}
+
static int rdmsr_interception(struct vcpu_svm *svm)
{
return kvm_emulate_rdmsr(&svm->vcpu);
@@ -2630,10 +2773,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
break;
case MSR_IA32_SPEC_CTRL:
if (!msr->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
- !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) &&
- !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) &&
- !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
+ !guest_has_spec_ctrl_msr(vcpu))
return 1;
if (kvm_spec_ctrl_test_value(data))
@@ -2658,12 +2798,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
break;
case MSR_IA32_PRED_CMD:
if (!msr->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB))
+ !guest_has_pred_cmd_msr(vcpu))
return 1;
if (data & ~PRED_CMD_IBPB)
return 1;
- if (!boot_cpu_has(X86_FEATURE_AMD_IBPB))
+ if (!boot_cpu_has(X86_FEATURE_IBPB))
return 1;
if (!data)
break;
@@ -2805,7 +2945,14 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
static int pause_interception(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
- bool in_kernel = (svm_get_cpl(vcpu) == 0);
+ bool in_kernel;
+
+ /*
+ * CPL is not made available for an SEV-ES guest, therefore
+ * vcpu->arch.preempted_in_kernel can never be true. Just
+ * set in_kernel to false as well.
+ */
+ in_kernel = !sev_es_guest(svm->vcpu.kvm) && svm_get_cpl(vcpu) == 0;
if (!kvm_pause_in_guest(vcpu->kvm))
grow_ple_window(vcpu);
@@ -2920,11 +3067,16 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_MWAIT] = mwait_interception,
[SVM_EXIT_XSETBV] = xsetbv_interception,
[SVM_EXIT_RDPRU] = rdpru_interception,
+ [SVM_EXIT_EFER_WRITE_TRAP] = efer_trap,
+ [SVM_EXIT_CR0_WRITE_TRAP] = cr_trap,
+ [SVM_EXIT_CR4_WRITE_TRAP] = cr_trap,
+ [SVM_EXIT_CR8_WRITE_TRAP] = cr_trap,
[SVM_EXIT_INVPCID] = invpcid_interception,
[SVM_EXIT_NPF] = npf_interception,
[SVM_EXIT_RSM] = rsm_interception,
[SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
[SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
+ [SVM_EXIT_VMGEXIT] = sev_handle_vmgexit,
};
static void dump_vmcb(struct kvm_vcpu *vcpu)
@@ -2966,6 +3118,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
+ pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa);
pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
@@ -2973,6 +3126,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
+ pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
pr_err("VMCB State Save Area:\n");
pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
"es:",
@@ -3045,6 +3199,43 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
"excp_to:", save->last_excp_to);
}
+static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
+{
+ if (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
+ svm_exit_handlers[exit_code])
+ return 0;
+
+ vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code);
+ dump_vmcb(vcpu);
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
+ vcpu->run->internal.ndata = 2;
+ vcpu->run->internal.data[0] = exit_code;
+ vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
+
+ return -EINVAL;
+}
+
+int svm_invoke_exit_handler(struct vcpu_svm *svm, u64 exit_code)
+{
+ if (svm_handle_invalid_exit(&svm->vcpu, exit_code))
+ return 0;
+
+#ifdef CONFIG_RETPOLINE
+ if (exit_code == SVM_EXIT_MSR)
+ return msr_interception(svm);
+ else if (exit_code == SVM_EXIT_VINTR)
+ return interrupt_window_interception(svm);
+ else if (exit_code == SVM_EXIT_INTR)
+ return intr_interception(svm);
+ else if (exit_code == SVM_EXIT_HLT)
+ return halt_interception(svm);
+ else if (exit_code == SVM_EXIT_NPF)
+ return npf_interception(svm);
+#endif
+ return svm_exit_handlers[exit_code](svm);
+}
+
static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
u32 *intr_info, u32 *error_code)
{
@@ -3068,10 +3259,13 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
- if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
- vcpu->arch.cr0 = svm->vmcb->save.cr0;
- if (npt_enabled)
- vcpu->arch.cr3 = svm->vmcb->save.cr3;
+ /* SEV-ES guests must use the CR write traps to track CR registers. */
+ if (!sev_es_guest(vcpu->kvm)) {
+ if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
+ vcpu->arch.cr0 = svm->vmcb->save.cr0;
+ if (npt_enabled)
+ vcpu->arch.cr3 = svm->vmcb->save.cr3;
+ }
if (is_guest_mode(vcpu)) {
int vmexit;
@@ -3108,32 +3302,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
if (exit_fastpath != EXIT_FASTPATH_NONE)
return 1;
- if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
- || !svm_exit_handlers[exit_code]) {
- vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%x\n", exit_code);
- dump_vmcb(vcpu);
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror =
- KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
- vcpu->run->internal.ndata = 2;
- vcpu->run->internal.data[0] = exit_code;
- vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
- return 0;
- }
-
-#ifdef CONFIG_RETPOLINE
- if (exit_code == SVM_EXIT_MSR)
- return msr_interception(svm);
- else if (exit_code == SVM_EXIT_VINTR)
- return interrupt_window_interception(svm);
- else if (exit_code == SVM_EXIT_INTR)
- return intr_interception(svm);
- else if (exit_code == SVM_EXIT_HLT)
- return halt_interception(svm);
- else if (exit_code == SVM_EXIT_NPF)
- return npf_interception(svm);
-#endif
- return svm_exit_handlers[exit_code](svm);
+ return svm_invoke_exit_handler(svm, exit_code);
}
static void reload_tss(struct kvm_vcpu *vcpu)
@@ -3162,7 +3331,8 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
vcpu->arch.hflags |= HF_NMI_MASK;
- svm_set_intercept(svm, INTERCEPT_IRET);
+ if (!sev_es_guest(svm->vcpu.kvm))
+ svm_set_intercept(svm, INTERCEPT_IRET);
++vcpu->stat.nmi_injections;
}
@@ -3183,6 +3353,13 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ /*
+ * SEV-ES guests must always keep the CR intercepts cleared. CR
+ * tracking is done using the CR write traps.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return;
+
if (nested_svm_virtualize_tpr(vcpu))
return;
@@ -3239,10 +3416,12 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
if (masked) {
svm->vcpu.arch.hflags |= HF_NMI_MASK;
- svm_set_intercept(svm, INTERCEPT_IRET);
+ if (!sev_es_guest(svm->vcpu.kvm))
+ svm_set_intercept(svm, INTERCEPT_IRET);
} else {
svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
- svm_clr_intercept(svm, INTERCEPT_IRET);
+ if (!sev_es_guest(svm->vcpu.kvm))
+ svm_clr_intercept(svm, INTERCEPT_IRET);
}
}
@@ -3254,7 +3433,14 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
if (!gif_set(svm))
return true;
- if (is_guest_mode(vcpu)) {
+ if (sev_es_guest(svm->vcpu.kvm)) {
+ /*
+ * SEV-ES guests to not expose RFLAGS. Use the VMCB interrupt mask
+ * bit to determine the state of the IF flag.
+ */
+ if (!(vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK))
+ return true;
+ } else if (is_guest_mode(vcpu)) {
/* As long as interrupts are being delivered... */
if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
? !(svm->nested.hsave->save.rflags & X86_EFLAGS_IF)
@@ -3413,8 +3599,9 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
* If we've made progress since setting HF_IRET_MASK, we've
* executed an IRET and can allow NMI injection.
*/
- if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
- && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
+ if ((svm->vcpu.arch.hflags & HF_IRET_MASK) &&
+ (sev_es_guest(svm->vcpu.kvm) ||
+ kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip)) {
svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
}
@@ -3437,6 +3624,12 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
break;
case SVM_EXITINTINFO_TYPE_EXEPT:
/*
+ * Never re-inject a #VC exception.
+ */
+ if (vector == X86_TRAP_VC)
+ break;
+
+ /*
* In case of software exceptions, do not reinject the vector,
* but re-execute the instruction instead. Rewind RIP first
* if we emulated INT3 before.
@@ -3509,16 +3702,20 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
guest_enter_irqoff();
lockdep_hardirqs_on(CALLER_ADDR0);
- __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
+ if (sev_es_guest(svm->vcpu.kvm)) {
+ __svm_sev_es_vcpu_run(svm->vmcb_pa);
+ } else {
+ __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
#ifdef CONFIG_X86_64
- native_wrmsrl(MSR_GS_BASE, svm->host.gs_base);
+ native_wrmsrl(MSR_GS_BASE, svm->host.gs_base);
#else
- loadsegment(fs, svm->host.fs);
+ loadsegment(fs, svm->host.fs);
#ifndef CONFIG_X86_32_LAZY_GS
- loadsegment(gs, svm->host.gs);
+ loadsegment(gs, svm->host.gs);
#endif
#endif
+ }
/*
* VMEXIT disables interrupts (host state), but tracing and lockdep
@@ -3568,6 +3765,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
sync_lapic_to_cr8(vcpu);
+ if (unlikely(svm->asid != svm->vmcb->control.asid)) {
+ svm->vmcb->control.asid = svm->asid;
+ vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
+ }
svm->vmcb->save.cr2 = vcpu->arch.cr2;
/*
@@ -3612,14 +3813,17 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
- reload_tss(vcpu);
+ if (!sev_es_guest(svm->vcpu.kvm))
+ reload_tss(vcpu);
x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
- vcpu->arch.cr2 = svm->vmcb->save.cr2;
- vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
- vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
- vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
+ if (!sev_es_guest(svm->vcpu.kvm)) {
+ vcpu->arch.cr2 = svm->vmcb->save.cr2;
+ vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
+ vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
+ vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
+ }
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
kvm_before_interrupt(&svm->vcpu);
@@ -3722,12 +3926,21 @@ static bool svm_cpu_has_accelerated_tpr(void)
return false;
}
-static bool svm_has_emulated_msr(u32 index)
+/*
+ * The kvm parameter can be NULL (module initialization, or invocation before
+ * VM creation). Be sure to check the kvm parameter before using it.
+ */
+static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
{
switch (index) {
case MSR_IA32_MCG_EXT_CTL:
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
return false;
+ case MSR_IA32_SMBASE:
+ /* SEV-ES guests do not support SMM, so report false */
+ if (kvm && sev_es_guest(kvm))
+ return false;
+ break;
default:
break;
}
@@ -4086,6 +4299,12 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int i
unsigned long cr4;
/*
+ * When the guest is an SEV-ES guest, emulation is not possible.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return false;
+
+ /*
* Detect and workaround Errata 1096 Fam_17h_00_0Fh.
*
* Errata:
@@ -4217,6 +4436,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.get_cpl = svm_get_cpl,
.get_cs_db_l_bits = kvm_get_cs_db_l_bits,
.set_cr0 = svm_set_cr0,
+ .is_valid_cr4 = svm_is_valid_cr4,
.set_cr4 = svm_set_cr4,
.set_efer = svm_set_efer,
.get_idt = svm_get_idt,
@@ -4305,6 +4525,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.apic_init_signal_blocked = svm_apic_init_signal_blocked,
.msr_filter_changed = svm_msr_filter_changed,
+ .complete_emulated_msr = svm_complete_emulated_msr,
};
static struct kvm_x86_init_ops svm_init_ops __initdata = {
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 1d853fe4c778..5431e6335e2e 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -17,21 +17,32 @@
#include <linux/kvm_types.h>
#include <linux/kvm_host.h>
+#include <linux/bits.h>
#include <asm/svm.h>
-static const u32 host_save_user_msrs[] = {
+#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
+
+static const struct svm_host_save_msrs {
+ u32 index; /* Index of the MSR */
+ bool sev_es_restored; /* True if MSR is restored on SEV-ES VMEXIT */
+} host_save_user_msrs[] = {
#ifdef CONFIG_X86_64
- MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
- MSR_FS_BASE,
+ { .index = MSR_STAR, .sev_es_restored = true },
+ { .index = MSR_LSTAR, .sev_es_restored = true },
+ { .index = MSR_CSTAR, .sev_es_restored = true },
+ { .index = MSR_SYSCALL_MASK, .sev_es_restored = true },
+ { .index = MSR_KERNEL_GS_BASE, .sev_es_restored = true },
+ { .index = MSR_FS_BASE, .sev_es_restored = true },
#endif
- MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
- MSR_TSC_AUX,
+ { .index = MSR_IA32_SYSENTER_CS, .sev_es_restored = true },
+ { .index = MSR_IA32_SYSENTER_ESP, .sev_es_restored = true },
+ { .index = MSR_IA32_SYSENTER_EIP, .sev_es_restored = true },
+ { .index = MSR_TSC_AUX, .sev_es_restored = false },
};
-
#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
-#define MAX_DIRECT_ACCESS_MSRS 15
+#define MAX_DIRECT_ACCESS_MSRS 18
#define MSRPM_OFFSETS 16
extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
@@ -61,11 +72,13 @@ enum {
struct kvm_sev_info {
bool active; /* SEV enabled guest */
+ bool es_active; /* SEV-ES enabled guest */
unsigned int asid; /* ASID used for this guest */
unsigned int handle; /* SEV firmware handle */
int fd; /* SEV device fd */
unsigned long pages_locked; /* Number of pages locked */
struct list_head regions_list; /* List of registered regions */
+ u64 ap_jump_table; /* SEV-ES AP Jump Table address */
};
struct kvm_svm {
@@ -106,6 +119,7 @@ struct vcpu_svm {
struct vmcb *vmcb;
unsigned long vmcb_pa;
struct svm_cpu_data *svm_data;
+ u32 asid;
uint64_t asid_generation;
uint64_t sysenter_esp;
uint64_t sysenter_eip;
@@ -166,6 +180,17 @@ struct vcpu_svm {
DECLARE_BITMAP(read, MAX_DIRECT_ACCESS_MSRS);
DECLARE_BITMAP(write, MAX_DIRECT_ACCESS_MSRS);
} shadow_msr_intercept;
+
+ /* SEV-ES support */
+ struct vmcb_save_area *vmsa;
+ struct ghcb *ghcb;
+ struct kvm_host_map ghcb_map;
+
+ /* SEV-ES scratch area support */
+ void *ghcb_sa;
+ u64 ghcb_sa_len;
+ bool ghcb_sa_sync;
+ bool ghcb_sa_free;
};
struct svm_cpu_data {
@@ -193,6 +218,28 @@ static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
return container_of(kvm, struct kvm_svm, kvm);
}
+static inline bool sev_guest(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_AMD_SEV
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ return sev->active;
+#else
+ return false;
+#endif
+}
+
+static inline bool sev_es_guest(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_AMD_SEV
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ return sev_guest(kvm) && sev->es_active;
+#else
+ return false;
+#endif
+}
+
static inline void vmcb_mark_all_dirty(struct vmcb *vmcb)
{
vmcb->control.clean = 0;
@@ -244,21 +291,24 @@ static inline void set_dr_intercepts(struct vcpu_svm *svm)
{
struct vmcb *vmcb = get_host_vmcb(svm);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ);
+ if (!sev_es_guest(svm->vcpu.kvm)) {
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE);
+ }
+
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
recalc_intercepts(svm);
@@ -270,6 +320,12 @@ static inline void clr_dr_intercepts(struct vcpu_svm *svm)
vmcb->control.intercepts[INTERCEPT_DR] = 0;
+ /* DR7 access must remain intercepted for an SEV-ES guest */
+ if (sev_es_guest(svm->vcpu.kvm)) {
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
+ }
+
recalc_intercepts(svm);
}
@@ -351,6 +407,10 @@ static inline bool gif_set(struct vcpu_svm *svm)
#define MSR_CR3_LONG_MBZ_MASK 0xfff0000000000000U
#define MSR_INVALID 0xffffffffU
+extern int sev;
+extern int sev_es;
+extern bool dump_invalid_vmcb;
+
u32 svm_msrpm_offset(u32 msr);
u32 *svm_vcpu_alloc_msrpm(void);
void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm);
@@ -358,13 +418,16 @@ void svm_vcpu_free_msrpm(u32 *msrpm);
int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
-int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
void svm_flush_tlb(struct kvm_vcpu *vcpu);
void disable_nmi_singlestep(struct vcpu_svm *svm);
bool svm_smi_blocked(struct kvm_vcpu *vcpu);
bool svm_nmi_blocked(struct kvm_vcpu *vcpu);
bool svm_interrupt_blocked(struct kvm_vcpu *vcpu);
void svm_set_gif(struct vcpu_svm *svm, bool value);
+int svm_invoke_exit_handler(struct vcpu_svm *svm, u64 exit_code);
+void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
+ int read, int write);
/* nested.c */
@@ -470,18 +533,42 @@ void svm_vcpu_unblocking(struct kvm_vcpu *vcpu);
/* sev.c */
-extern unsigned int max_sev_asid;
+#define GHCB_VERSION_MAX 1ULL
+#define GHCB_VERSION_MIN 1ULL
+
+#define GHCB_MSR_INFO_POS 0
+#define GHCB_MSR_INFO_MASK (BIT_ULL(12) - 1)
+
+#define GHCB_MSR_SEV_INFO_RESP 0x001
+#define GHCB_MSR_SEV_INFO_REQ 0x002
+#define GHCB_MSR_VER_MAX_POS 48
+#define GHCB_MSR_VER_MAX_MASK 0xffff
+#define GHCB_MSR_VER_MIN_POS 32
+#define GHCB_MSR_VER_MIN_MASK 0xffff
+#define GHCB_MSR_CBIT_POS 24
+#define GHCB_MSR_CBIT_MASK 0xff
+#define GHCB_MSR_SEV_INFO(_max, _min, _cbit) \
+ ((((_max) & GHCB_MSR_VER_MAX_MASK) << GHCB_MSR_VER_MAX_POS) | \
+ (((_min) & GHCB_MSR_VER_MIN_MASK) << GHCB_MSR_VER_MIN_POS) | \
+ (((_cbit) & GHCB_MSR_CBIT_MASK) << GHCB_MSR_CBIT_POS) | \
+ GHCB_MSR_SEV_INFO_RESP)
+
+#define GHCB_MSR_CPUID_REQ 0x004
+#define GHCB_MSR_CPUID_RESP 0x005
+#define GHCB_MSR_CPUID_FUNC_POS 32
+#define GHCB_MSR_CPUID_FUNC_MASK 0xffffffff
+#define GHCB_MSR_CPUID_VALUE_POS 32
+#define GHCB_MSR_CPUID_VALUE_MASK 0xffffffff
+#define GHCB_MSR_CPUID_REG_POS 30
+#define GHCB_MSR_CPUID_REG_MASK 0x3
+
+#define GHCB_MSR_TERM_REQ 0x100
+#define GHCB_MSR_TERM_REASON_SET_POS 12
+#define GHCB_MSR_TERM_REASON_SET_MASK 0xf
+#define GHCB_MSR_TERM_REASON_POS 16
+#define GHCB_MSR_TERM_REASON_MASK 0xff
-static inline bool sev_guest(struct kvm *kvm)
-{
-#ifdef CONFIG_KVM_AMD_SEV
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-
- return sev->active;
-#else
- return false;
-#endif
-}
+extern unsigned int max_sev_asid;
static inline bool svm_sev_enabled(void)
{
@@ -495,7 +582,19 @@ int svm_register_enc_region(struct kvm *kvm,
int svm_unregister_enc_region(struct kvm *kvm,
struct kvm_enc_region *range);
void pre_sev_run(struct vcpu_svm *svm, int cpu);
-int __init sev_hardware_setup(void);
+void __init sev_hardware_setup(void);
void sev_hardware_teardown(void);
+void sev_free_vcpu(struct kvm_vcpu *vcpu);
+int sev_handle_vmgexit(struct vcpu_svm *svm);
+int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
+void sev_es_init_vmcb(struct vcpu_svm *svm);
+void sev_es_create_vcpu(struct vcpu_svm *svm);
+void sev_es_vcpu_load(struct vcpu_svm *svm, int cpu);
+void sev_es_vcpu_put(struct vcpu_svm *svm);
+
+/* vmenter.S */
+
+void __svm_sev_es_vcpu_run(unsigned long vmcb_pa);
+void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs);
#endif
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 1ec1ac40e328..6feb8c08f45a 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -168,3 +168,53 @@ SYM_FUNC_START(__svm_vcpu_run)
pop %_ASM_BP
ret
SYM_FUNC_END(__svm_vcpu_run)
+
+/**
+ * __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode
+ * @vmcb_pa: unsigned long
+ */
+SYM_FUNC_START(__svm_sev_es_vcpu_run)
+ push %_ASM_BP
+#ifdef CONFIG_X86_64
+ push %r15
+ push %r14
+ push %r13
+ push %r12
+#else
+ push %edi
+ push %esi
+#endif
+ push %_ASM_BX
+
+ /* Enter guest mode */
+ mov %_ASM_ARG1, %_ASM_AX
+ sti
+
+1: vmrun %_ASM_AX
+ jmp 3f
+2: cmpb $0, kvm_rebooting
+ jne 3f
+ ud2
+ _ASM_EXTABLE(1b, 2b)
+
+3: cli
+
+#ifdef CONFIG_RETPOLINE
+ /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
+ FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
+#endif
+
+ pop %_ASM_BX
+
+#ifdef CONFIG_X86_64
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+#else
+ pop %esi
+ pop %edi
+#endif
+ pop %_ASM_BP
+ ret
+SYM_FUNC_END(__svm_sev_es_vcpu_run)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index aef960f90f26..2de30c20bc26 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1578,6 +1578,103 @@ TRACE_EVENT(kvm_hv_syndbg_get_msr,
__entry->vcpu_id, __entry->vp_index, __entry->msr,
__entry->data)
);
+
+/*
+ * Tracepoint for the start of VMGEXIT processing
+ */
+TRACE_EVENT(kvm_vmgexit_enter,
+ TP_PROTO(unsigned int vcpu_id, struct ghcb *ghcb),
+ TP_ARGS(vcpu_id, ghcb),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, vcpu_id)
+ __field(u64, exit_reason)
+ __field(u64, info1)
+ __field(u64, info2)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->exit_reason = ghcb->save.sw_exit_code;
+ __entry->info1 = ghcb->save.sw_exit_info_1;
+ __entry->info2 = ghcb->save.sw_exit_info_2;
+ ),
+
+ TP_printk("vcpu %u, exit_reason %llx, exit_info1 %llx, exit_info2 %llx",
+ __entry->vcpu_id, __entry->exit_reason,
+ __entry->info1, __entry->info2)
+);
+
+/*
+ * Tracepoint for the end of VMGEXIT processing
+ */
+TRACE_EVENT(kvm_vmgexit_exit,
+ TP_PROTO(unsigned int vcpu_id, struct ghcb *ghcb),
+ TP_ARGS(vcpu_id, ghcb),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, vcpu_id)
+ __field(u64, exit_reason)
+ __field(u64, info1)
+ __field(u64, info2)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->exit_reason = ghcb->save.sw_exit_code;
+ __entry->info1 = ghcb->save.sw_exit_info_1;
+ __entry->info2 = ghcb->save.sw_exit_info_2;
+ ),
+
+ TP_printk("vcpu %u, exit_reason %llx, exit_info1 %llx, exit_info2 %llx",
+ __entry->vcpu_id, __entry->exit_reason,
+ __entry->info1, __entry->info2)
+);
+
+/*
+ * Tracepoint for the start of VMGEXIT MSR procotol processing
+ */
+TRACE_EVENT(kvm_vmgexit_msr_protocol_enter,
+ TP_PROTO(unsigned int vcpu_id, u64 ghcb_gpa),
+ TP_ARGS(vcpu_id, ghcb_gpa),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, vcpu_id)
+ __field(u64, ghcb_gpa)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->ghcb_gpa = ghcb_gpa;
+ ),
+
+ TP_printk("vcpu %u, ghcb_gpa %016llx",
+ __entry->vcpu_id, __entry->ghcb_gpa)
+);
+
+/*
+ * Tracepoint for the end of VMGEXIT MSR procotol processing
+ */
+TRACE_EVENT(kvm_vmgexit_msr_protocol_exit,
+ TP_PROTO(unsigned int vcpu_id, u64 ghcb_gpa, int result),
+ TP_ARGS(vcpu_id, ghcb_gpa, result),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, vcpu_id)
+ __field(u64, ghcb_gpa)
+ __field(int, result)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->ghcb_gpa = ghcb_gpa;
+ __entry->result = result;
+ ),
+
+ TP_printk("vcpu %u, ghcb_gpa %016llx, result %d",
+ __entry->vcpu_id, __entry->ghcb_gpa, __entry->result)
+);
+
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c
index f3199bb02f22..41f24661af04 100644
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/evmcs.c
@@ -326,7 +326,6 @@ bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa)
uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
/*
* vmcs_version represents the range of supported Enlightened VMCS
* versions: lower 8 bits is the minimal version, higher 8 bits is the
@@ -334,7 +333,7 @@ uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu)
* KVM_EVMCS_VERSION.
*/
if (kvm_cpu_cap_get(X86_FEATURE_VMX) &&
- vmx->nested.enlightened_vmcs_enabled)
+ (!vcpu || to_vmx(vcpu)->nested.enlightened_vmcs_enabled))
return (KVM_EVMCS_VERSION << 8) | 1;
return 0;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 89af692deb7e..e2f26564a12d 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2952,7 +2952,8 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
{
if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
- vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT))
+ vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT &&
+ vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI))
return -EINVAL;
return 0;
@@ -3559,19 +3560,29 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
*/
nested_cache_shadow_vmcs12(vcpu, vmcs12);
- /*
- * If we're entering a halted L2 vcpu and the L2 vcpu won't be
- * awakened by event injection or by an NMI-window VM-exit or
- * by an interrupt-window VM-exit, halt the vcpu.
- */
- if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
- !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
- !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_NMI_WINDOW_EXITING) &&
- !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_INTR_WINDOW_EXITING) &&
- (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
+ switch (vmcs12->guest_activity_state) {
+ case GUEST_ACTIVITY_HLT:
+ /*
+ * If we're entering a halted L2 vcpu and the L2 vcpu won't be
+ * awakened by event injection or by an NMI-window VM-exit or
+ * by an interrupt-window VM-exit, halt the vcpu.
+ */
+ if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
+ !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) &&
+ !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) &&
+ (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
+ vmx->nested.nested_run_pending = 0;
+ return kvm_vcpu_halt(vcpu);
+ }
+ break;
+ case GUEST_ACTIVITY_WAIT_SIPI:
vmx->nested.nested_run_pending = 0;
- return kvm_vcpu_halt(vcpu);
+ vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+ break;
+ default:
+ break;
}
+
return 1;
vmentry_failed:
@@ -3797,7 +3808,20 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
return -EBUSY;
nested_vmx_update_pending_dbg(vcpu);
clear_bit(KVM_APIC_INIT, &apic->pending_events);
- nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
+ if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED)
+ nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
+ return 0;
+ }
+
+ if (lapic_in_kernel(vcpu) &&
+ test_bit(KVM_APIC_SIPI, &apic->pending_events)) {
+ if (block_nested_events)
+ return -EBUSY;
+
+ clear_bit(KVM_APIC_SIPI, &apic->pending_events);
+ if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
+ nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0,
+ apic->sipi_vector & 0xFFUL);
return 0;
}
@@ -4036,6 +4060,8 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
+ else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
+ vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI;
else
vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
@@ -4814,7 +4840,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
/*
* The Intel VMX Instruction Reference lists a bunch of bits that are
* prerequisite to running VMXON, most notably cr4.VMXE must be set to
- * 1 (see vmx_set_cr4() for when we allow the guest to set this).
+ * 1 (see vmx_is_valid_cr4() for when we allow the guest to set this).
* Otherwise, we should fail with #UD. But most faulting conditions
* have already been checked by hardware, prior to the VM-exit for
* VMXON. We do test guest cr4.VMXE because processor CR4 always has
@@ -6483,7 +6509,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
msrs->misc_low |=
MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
- VMX_MISC_ACTIVITY_HLT;
+ VMX_MISC_ACTIVITY_HLT |
+ VMX_MISC_ACTIVITY_WAIT_SIPI;
msrs->misc_high = 0;
/*
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 90ad7a6246e3..e85aa5faa22d 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -132,7 +132,7 @@ SYM_FUNC_START(__vmx_vcpu_run)
mov (%_ASM_SP), %_ASM_AX
/* Check if vmlaunch or vmresume is needed */
- cmpb $0, %bl
+ testb %bl, %bl
/* Load guest registers. Don't clobber flags. */
mov VCPU_RCX(%_ASM_AX), %_ASM_CX
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 47b8357b9751..75c9c6a0a3a4 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -40,7 +40,6 @@
#include <asm/irq_remapping.h>
#include <asm/kexec.h>
#include <asm/perf_event.h>
-#include <asm/mce.h>
#include <asm/mmu_context.h>
#include <asm/mshyperv.h>
#include <asm/mwait.h>
@@ -1826,7 +1825,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_SPEC_CTRL:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+ !guest_has_spec_ctrl_msr(vcpu))
return 1;
msr_info->data = to_vmx(vcpu)->spec_ctrl;
@@ -2028,7 +2027,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_SPEC_CTRL:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+ !guest_has_spec_ctrl_msr(vcpu))
return 1;
if (kvm_spec_ctrl_test_value(data))
@@ -2063,12 +2062,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
goto find_uret_msr;
case MSR_IA32_PRED_CMD:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+ !guest_has_pred_cmd_msr(vcpu))
return 1;
if (data & ~PRED_CMD_IBPB)
return 1;
- if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL))
+ if (!boot_cpu_has(X86_FEATURE_IBPB))
return 1;
if (!data)
break;
@@ -3095,8 +3094,25 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
vmcs_writel(GUEST_CR3, guest_cr3);
}
-int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
+ /*
+ * We operate under the default treatment of SMM, so VMX cannot be
+ * enabled under SMM. Note, whether or not VMXE is allowed at all is
+ * handled by kvm_is_valid_cr4().
+ */
+ if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu))
+ return false;
+
+ if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
+ return false;
+
+ return true;
+}
+
+void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ unsigned long old_cr4 = vcpu->arch.cr4;
struct vcpu_vmx *vmx = to_vmx(vcpu);
/*
* Pass through host's Machine Check Enable value to hw_cr4, which
@@ -3123,21 +3139,6 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
}
}
- if (cr4 & X86_CR4_VMXE) {
- /*
- * To use VMXON (and later other VMX instructions), a guest
- * must first be able to turn on cr4.VMXE (see handle_vmon()).
- * So basically the check on whether to allow nested VMX
- * is here. We operate under the default treatment of SMM,
- * so VMX cannot be enabled under SMM.
- */
- if (!nested_vmx_allowed(vcpu) || is_smm(vcpu))
- return 1;
- }
-
- if (vmx->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
- return 1;
-
vcpu->arch.cr4 = cr4;
kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
@@ -3168,7 +3169,9 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
vmcs_writel(CR4_READ_SHADOW, cr4);
vmcs_writel(GUEST_CR4, hw_cr4);
- return 0;
+
+ if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
+ kvm_update_cpuid_runtime(vcpu);
}
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
@@ -3515,42 +3518,33 @@ bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu)
return true;
}
-static int init_rmode_tss(struct kvm *kvm)
+static int init_rmode_tss(struct kvm *kvm, void __user *ua)
{
- gfn_t fn;
- u16 data = 0;
- int idx, r;
+ const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
+ u16 data;
+ int i;
+
+ for (i = 0; i < 3; i++) {
+ if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE))
+ return -EFAULT;
+ }
- idx = srcu_read_lock(&kvm->srcu);
- fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT;
- r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
- if (r < 0)
- goto out;
data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
- r = kvm_write_guest_page(kvm, fn++, &data,
- TSS_IOPB_BASE_OFFSET, sizeof(u16));
- if (r < 0)
- goto out;
- r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
- if (r < 0)
- goto out;
- r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
- if (r < 0)
- goto out;
+ if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16)))
+ return -EFAULT;
+
data = ~0;
- r = kvm_write_guest_page(kvm, fn, &data,
- RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
- sizeof(u8));
-out:
- srcu_read_unlock(&kvm->srcu, idx);
- return r;
+ if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8)))
+ return -EFAULT;
+
+ return 0;
}
static int init_rmode_identity_map(struct kvm *kvm)
{
struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
int i, r = 0;
- kvm_pfn_t identity_map_pfn;
+ void __user *uaddr;
u32 tmp;
/* Protect kvm_vmx->ept_identity_pagetable_done. */
@@ -3561,24 +3555,24 @@ static int init_rmode_identity_map(struct kvm *kvm)
if (!kvm_vmx->ept_identity_map_addr)
kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
- identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT;
- r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
- kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
- if (r < 0)
+ uaddr = __x86_set_memory_region(kvm,
+ IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
+ kvm_vmx->ept_identity_map_addr,
+ PAGE_SIZE);
+ if (IS_ERR(uaddr)) {
+ r = PTR_ERR(uaddr);
goto out;
+ }
- r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
- if (r < 0)
- goto out;
/* Set up identity-mapping pagetable for EPT in real mode */
for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
- r = kvm_write_guest_page(kvm, identity_map_pfn,
- &tmp, i * sizeof(tmp), sizeof(tmp));
- if (r < 0)
+ if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) {
+ r = -EFAULT;
goto out;
+ }
}
kvm_vmx->ept_identity_pagetable_done = true;
@@ -3605,19 +3599,22 @@ static void seg_setup(int seg)
static int alloc_apic_access_page(struct kvm *kvm)
{
struct page *page;
- int r = 0;
+ void __user *hva;
+ int ret = 0;
mutex_lock(&kvm->slots_lock);
if (kvm->arch.apic_access_page_done)
goto out;
- r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
- APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
- if (r)
+ hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+ APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
+ if (IS_ERR(hva)) {
+ ret = PTR_ERR(hva);
goto out;
+ }
page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
if (is_error_page(page)) {
- r = -EFAULT;
+ ret = -EFAULT;
goto out;
}
@@ -3629,7 +3626,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
kvm->arch.apic_access_page_done = true;
out:
mutex_unlock(&kvm->slots_lock);
- return r;
+ return ret;
}
int allocate_vpid(void)
@@ -4638,7 +4635,7 @@ static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
- int ret;
+ void __user *ret;
if (enable_unrestricted_guest)
return 0;
@@ -4648,10 +4645,12 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
PAGE_SIZE * 3);
mutex_unlock(&kvm->slots_lock);
- if (ret)
- return ret;
+ if (IS_ERR(ret))
+ return PTR_ERR(ret);
+
to_kvm_vmx(kvm)->tss_addr = addr;
- return init_rmode_tss(kvm);
+
+ return init_rmode_tss(kvm, ret);
}
static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
@@ -4716,25 +4715,6 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
return 1;
}
-/*
- * Trigger machine check on the host. We assume all the MSRs are already set up
- * by the CPU and that we still run on the same CPU as the MCE occurred on.
- * We pass a fake environment to the machine check handler because we want
- * the guest to be always treated like user space, no matter what context
- * it used internally.
- */
-static void kvm_machine_check(void)
-{
-#if defined(CONFIG_X86_MCE)
- struct pt_regs regs = {
- .cs = 3, /* Fake ring 3 no matter what the guest ran on */
- .flags = X86_EFLAGS_IF,
- };
-
- do_machine_check(&regs);
-#endif
-}
-
static int handle_machine_check(struct kvm_vcpu *vcpu)
{
/* handled by vmx_vcpu_run() */
@@ -6399,7 +6379,11 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
handle_exception_nmi_irqoff(vmx);
}
-static bool vmx_has_emulated_msr(u32 index)
+/*
+ * The kvm parameter can be NULL (module initialization, or invocation before
+ * VM creation). Be sure to check the kvm parameter before using it.
+ */
+static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
{
switch (index) {
case MSR_IA32_SMBASE:
@@ -7558,7 +7542,7 @@ static void enable_smi_window(struct kvm_vcpu *vcpu)
static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
{
- return to_vmx(vcpu)->nested.vmxon;
+ return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu);
}
static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
@@ -7587,6 +7571,11 @@ static bool vmx_check_apicv_inhibit_reasons(ulong bit)
return supported & BIT(bit);
}
+static int vmx_cpu_dirty_log_size(void)
+{
+ return enable_pml ? PML_ENTITY_NUM : 0;
+}
+
static struct kvm_x86_ops vmx_x86_ops __initdata = {
.hardware_unsetup = hardware_unsetup,
@@ -7616,6 +7605,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.get_cpl = vmx_get_cpl,
.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
.set_cr0 = vmx_set_cr0,
+ .is_valid_cr4 = vmx_is_valid_cr4,
.set_cr4 = vmx_set_cr4,
.set_efer = vmx_set_efer,
.get_idt = vmx_get_idt,
@@ -7715,6 +7705,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.migrate_timers = vmx_migrate_timers,
.msr_filter_changed = vmx_msr_filter_changed,
+ .complete_emulated_msr = kvm_complete_insn_gp,
+ .cpu_dirty_log_size = vmx_cpu_dirty_log_size,
};
static __init int hardware_setup(void)
@@ -7832,6 +7824,7 @@ static __init int hardware_setup(void)
vmx_x86_ops.slot_disable_log_dirty = NULL;
vmx_x86_ops.flush_log_dirty = NULL;
vmx_x86_ops.enable_log_dirty_pt_masked = NULL;
+ vmx_x86_ops.cpu_dirty_log_size = NULL;
}
if (!cpu_has_vmx_preemption_timer())
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index f6f66e5c6510..9d3a557949ac 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -321,7 +321,7 @@ u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu);
void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask);
int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer);
void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
-int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
void set_cr4_guest_host_mask(struct vcpu_vmx *vmx);
void ept_save_pdptrs(struct kvm_vcpu *vcpu);
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b05aec109242..3f7c1fc7a3ce 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -197,7 +197,8 @@ EXPORT_SYMBOL_GPL(host_efer);
bool __read_mostly allow_smaller_maxphyaddr = 0;
EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
-static u64 __read_mostly host_xss;
+u64 __read_mostly host_xss;
+EXPORT_SYMBOL_GPL(host_xss);
u64 __read_mostly supported_xss;
EXPORT_SYMBOL_GPL(supported_xss);
@@ -804,11 +805,29 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(pdptrs_changed);
+void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
+{
+ unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
+
+ if ((cr0 ^ old_cr0) & X86_CR0_PG) {
+ kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_async_pf_hash_reset(vcpu);
+ }
+
+ if ((cr0 ^ old_cr0) & update_bits)
+ kvm_mmu_reset_context(vcpu);
+
+ if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
+ kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
+ !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
+ kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
+}
+EXPORT_SYMBOL_GPL(kvm_post_set_cr0);
+
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
unsigned long old_cr0 = kvm_read_cr0(vcpu);
unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG;
- unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
cr0 |= X86_CR0_ET;
@@ -847,18 +866,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
kvm_x86_ops.set_cr0(vcpu, cr0);
- if ((cr0 ^ old_cr0) & X86_CR0_PG) {
- kvm_clear_async_pf_completion_queue(vcpu);
- kvm_async_pf_hash_reset(vcpu);
- }
-
- if ((cr0 ^ old_cr0) & update_bits)
- kvm_mmu_reset_context(vcpu);
-
- if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
- kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
- !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
- kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
+ kvm_post_set_cr0(vcpu, old_cr0, cr0);
return 0;
}
@@ -872,6 +880,9 @@ EXPORT_SYMBOL_GPL(kvm_lmsw);
void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
{
+ if (vcpu->arch.guest_state_protected)
+ return;
+
if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
if (vcpu->arch.xcr0 != host_xcr0)
@@ -892,6 +903,9 @@ EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
{
+ if (vcpu->arch.guest_state_protected)
+ return;
+
if (static_cpu_has(X86_FEATURE_PKU) &&
(kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
(vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) {
@@ -964,26 +978,36 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
}
EXPORT_SYMBOL_GPL(kvm_set_xcr);
-int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
if (cr4 & cr4_reserved_bits)
- return -EINVAL;
+ return false;
if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
- return -EINVAL;
+ return false;
- return 0;
+ return kvm_x86_ops.is_valid_cr4(vcpu, cr4);
+}
+EXPORT_SYMBOL_GPL(kvm_is_valid_cr4);
+
+void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
+{
+ unsigned long mmu_role_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
+ X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
+
+ if (((cr4 ^ old_cr4) & mmu_role_bits) ||
+ (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
+ kvm_mmu_reset_context(vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_valid_cr4);
+EXPORT_SYMBOL_GPL(kvm_post_set_cr4);
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
unsigned long old_cr4 = kvm_read_cr4(vcpu);
unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
X86_CR4_SMEP;
- unsigned long mmu_role_bits = pdptr_bits | X86_CR4_SMAP | X86_CR4_PKE;
- if (kvm_valid_cr4(vcpu, cr4))
+ if (!kvm_is_valid_cr4(vcpu, cr4))
return 1;
if (is_long_mode(vcpu)) {
@@ -1006,15 +1030,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
return 1;
}
- if (kvm_x86_ops.set_cr4(vcpu, cr4))
- return 1;
+ kvm_x86_ops.set_cr4(vcpu, cr4);
- if (((cr4 ^ old_cr4) & mmu_role_bits) ||
- (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
- kvm_mmu_reset_context(vcpu);
-
- if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
- kvm_update_cpuid_runtime(vcpu);
+ kvm_post_set_cr4(vcpu, old_cr4, cr4);
return 0;
}
@@ -1638,27 +1656,20 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
}
EXPORT_SYMBOL_GPL(kvm_set_msr);
-static int complete_emulated_msr(struct kvm_vcpu *vcpu, bool is_read)
+static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
{
- if (vcpu->run->msr.error) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- } else if (is_read) {
+ int err = vcpu->run->msr.error;
+ if (!err) {
kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
}
- return kvm_skip_emulated_instruction(vcpu);
-}
-
-static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
-{
- return complete_emulated_msr(vcpu, true);
+ return kvm_x86_ops.complete_emulated_msr(vcpu, err);
}
static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu)
{
- return complete_emulated_msr(vcpu, false);
+ return kvm_x86_ops.complete_emulated_msr(vcpu, vcpu->run->msr.error);
}
static u64 kvm_msr_reason(int r)
@@ -1721,18 +1732,16 @@ int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
return 0;
}
- /* MSR read failed? Inject a #GP */
- if (r) {
+ if (!r) {
+ trace_kvm_msr_read(ecx, data);
+
+ kvm_rax_write(vcpu, data & -1u);
+ kvm_rdx_write(vcpu, (data >> 32) & -1u);
+ } else {
trace_kvm_msr_read_ex(ecx);
- kvm_inject_gp(vcpu, 0);
- return 1;
}
- trace_kvm_msr_read(ecx, data);
-
- kvm_rax_write(vcpu, data & -1u);
- kvm_rdx_write(vcpu, (data >> 32) & -1u);
- return kvm_skip_emulated_instruction(vcpu);
+ return kvm_x86_ops.complete_emulated_msr(vcpu, r);
}
EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
@@ -1753,15 +1762,12 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
if (r < 0)
return r;
- /* MSR write failed? Inject a #GP */
- if (r > 0) {
+ if (!r)
+ trace_kvm_msr_write(ecx, data);
+ else
trace_kvm_msr_write_ex(ecx, data);
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
- trace_kvm_msr_write(ecx, data);
- return kvm_skip_emulated_instruction(vcpu);
+ return kvm_x86_ops.complete_emulated_msr(vcpu, r);
}
EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
@@ -3678,6 +3684,27 @@ static inline bool kvm_can_mwait_in_guest(void)
boot_cpu_has(X86_FEATURE_ARAT);
}
+static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 __user *cpuid_arg)
+{
+ struct kvm_cpuid2 cpuid;
+ int r;
+
+ r = -EFAULT;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
+ return r;
+
+ r = kvm_get_hv_cpuid(vcpu, &cpuid, cpuid_arg->entries);
+ if (r)
+ return r;
+
+ r = -EFAULT;
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
+ return r;
+
+ return 0;
+}
+
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
{
int r = 0;
@@ -3714,6 +3741,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_HYPERV_TLBFLUSH:
case KVM_CAP_HYPERV_SEND_IPI:
case KVM_CAP_HYPERV_CPUID:
+ case KVM_CAP_SYS_HYPERV_CPUID:
case KVM_CAP_PCI_SEGMENT:
case KVM_CAP_DEBUGREGS:
case KVM_CAP_X86_ROBUST_SINGLESTEP:
@@ -3762,7 +3790,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
* fringe case that is not enabled except via specific settings
* of the module parameters.
*/
- r = kvm_x86_ops.has_emulated_msr(MSR_IA32_SMBASE);
+ r = kvm_x86_ops.has_emulated_msr(kvm, MSR_IA32_SMBASE);
break;
case KVM_CAP_VAPIC:
r = !kvm_x86_ops.cpu_has_accelerated_tpr();
@@ -3899,6 +3927,9 @@ long kvm_arch_dev_ioctl(struct file *filp,
case KVM_GET_MSRS:
r = msr_io(NULL, argp, do_get_msr_feature, 1);
break;
+ case KVM_GET_SUPPORTED_HV_CPUID:
+ r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp);
+ break;
default:
r = -EINVAL;
break;
@@ -3997,7 +4028,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
int idx;
- if (vcpu->preempted)
+ if (vcpu->preempted && !vcpu->arch.guest_state_protected)
vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu);
/*
@@ -4481,6 +4512,9 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
struct kvm_xsave *guest_xsave)
{
+ if (!vcpu->arch.guest_fpu)
+ return;
+
if (boot_cpu_has(X86_FEATURE_XSAVE)) {
memset(guest_xsave, 0, sizeof(struct kvm_xsave));
fill_xsave((u8 *) guest_xsave->region, vcpu);
@@ -4498,9 +4532,14 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
struct kvm_xsave *guest_xsave)
{
- u64 xstate_bv =
- *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
- u32 mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)];
+ u64 xstate_bv;
+ u32 mxcsr;
+
+ if (!vcpu->arch.guest_fpu)
+ return 0;
+
+ xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
+ mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)];
if (boot_cpu_has(X86_FEATURE_XSAVE)) {
/*
@@ -4977,25 +5016,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
srcu_read_unlock(&vcpu->kvm->srcu, idx);
break;
}
- case KVM_GET_SUPPORTED_HV_CPUID: {
- struct kvm_cpuid2 __user *cpuid_arg = argp;
- struct kvm_cpuid2 cpuid;
-
- r = -EFAULT;
- if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
- goto out;
-
- r = kvm_vcpu_ioctl_get_hv_cpuid(vcpu, &cpuid,
- cpuid_arg->entries);
- if (r)
- goto out;
-
- r = -EFAULT;
- if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
- goto out;
- r = 0;
+ case KVM_GET_SUPPORTED_HV_CPUID:
+ r = kvm_ioctl_get_supported_hv_cpuid(vcpu, argp);
break;
- }
default:
r = -EINVAL;
}
@@ -5776,7 +5799,7 @@ static void kvm_init_msr_list(void)
}
for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
- if (!kvm_x86_ops.has_emulated_msr(emulated_msrs_all[i]))
+ if (!kvm_x86_ops.has_emulated_msr(NULL, emulated_msrs_all[i]))
continue;
emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
@@ -8158,7 +8181,14 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
{
struct kvm_run *kvm_run = vcpu->run;
- kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
+ /*
+ * if_flag is obsolete and useless, so do not bother
+ * setting it for SEV-ES guests. Userspace can just
+ * use kvm_run->ready_for_interrupt_injection.
+ */
+ kvm_run->if_flag = !vcpu->arch.guest_state_protected
+ && (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
+
kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
@@ -8748,6 +8778,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
bool req_immediate_exit = false;
+ /* Forbid vmenter if vcpu dirty ring is soft-full */
+ if (unlikely(vcpu->kvm->dirty_ring_size &&
+ kvm_dirty_ring_soft_full(&vcpu->dirty_ring))) {
+ vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL;
+ trace_kvm_dirty_ring_exit(vcpu);
+ r = 0;
+ goto out;
+ }
+
if (kvm_request_pending(vcpu)) {
if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
@@ -9223,9 +9262,14 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
kvm_save_current_fpu(vcpu->arch.user_fpu);
- /* PKRU is separately restored in kvm_x86_ops.run. */
- __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
- ~XFEATURE_MASK_PKRU);
+ /*
+ * Guests with protected state can't have it set by the hypervisor,
+ * so skip trying to set it.
+ */
+ if (vcpu->arch.guest_fpu)
+ /* PKRU is separately restored in kvm_x86_ops.run. */
+ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
+ ~XFEATURE_MASK_PKRU);
fpregs_mark_activate();
fpregs_unlock();
@@ -9238,7 +9282,12 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
fpregs_lock();
- kvm_save_current_fpu(vcpu->arch.guest_fpu);
+ /*
+ * Guests with protected state can't have it read by the hypervisor,
+ * so skip trying to save it.
+ */
+ if (vcpu->arch.guest_fpu)
+ kvm_save_current_fpu(vcpu->arch.guest_fpu);
copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state);
@@ -9417,6 +9466,9 @@ static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
struct desc_ptr dt;
+ if (vcpu->arch.guest_state_protected)
+ goto skip_protected_regs;
+
kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
@@ -9434,9 +9486,11 @@ static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
sregs->gdt.limit = dt.size;
sregs->gdt.base = dt.address;
- sregs->cr0 = kvm_read_cr0(vcpu);
sregs->cr2 = vcpu->arch.cr2;
sregs->cr3 = kvm_read_cr3(vcpu);
+
+skip_protected_regs:
+ sregs->cr0 = kvm_read_cr0(vcpu);
sregs->cr4 = kvm_read_cr4(vcpu);
sregs->cr8 = kvm_get_cr8(vcpu);
sregs->efer = vcpu->arch.efer;
@@ -9535,7 +9589,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
}
EXPORT_SYMBOL_GPL(kvm_task_switch);
-static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
/*
@@ -9543,31 +9597,29 @@ static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
* 64-bit mode (though maybe in a 32-bit code segment).
* CR4.PAE and EFER.LMA must be set.
*/
- if (!(sregs->cr4 & X86_CR4_PAE)
- || !(sregs->efer & EFER_LMA))
- return -EINVAL;
+ if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA))
+ return false;
} else {
/*
* Not in 64-bit mode: EFER.LMA is clear and the code
* segment cannot be 64-bit.
*/
if (sregs->efer & EFER_LMA || sregs->cs.l)
- return -EINVAL;
+ return false;
}
- return kvm_valid_cr4(vcpu, sregs->cr4);
+ return kvm_is_valid_cr4(vcpu, sregs->cr4);
}
static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
struct msr_data apic_base_msr;
int mmu_reset_needed = 0;
- int cpuid_update_needed = 0;
int pending_vec, max_bits, idx;
struct desc_ptr dt;
int ret = -EINVAL;
- if (kvm_valid_sregs(vcpu, sregs))
+ if (!kvm_is_valid_sregs(vcpu, sregs))
goto out;
apic_base_msr.data = sregs->apic_base;
@@ -9575,6 +9627,9 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
if (kvm_set_apic_base(vcpu, &apic_base_msr))
goto out;
+ if (vcpu->arch.guest_state_protected)
+ goto skip_protected_regs;
+
dt.size = sregs->idt.limit;
dt.address = sregs->idt.base;
kvm_x86_ops.set_idt(vcpu, &dt);
@@ -9597,11 +9652,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
vcpu->arch.cr0 = sregs->cr0;
mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
- cpuid_update_needed |= ((kvm_read_cr4(vcpu) ^ sregs->cr4) &
- (X86_CR4_OSXSAVE | X86_CR4_PKE));
kvm_x86_ops.set_cr4(vcpu, sregs->cr4);
- if (cpuid_update_needed)
- kvm_update_cpuid_runtime(vcpu);
idx = srcu_read_lock(&vcpu->kvm->srcu);
if (is_pae_paging(vcpu)) {
@@ -9613,14 +9664,6 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
if (mmu_reset_needed)
kvm_mmu_reset_context(vcpu);
- max_bits = KVM_NR_INTERRUPTS;
- pending_vec = find_first_bit(
- (const unsigned long *)sregs->interrupt_bitmap, max_bits);
- if (pending_vec < max_bits) {
- kvm_queue_interrupt(vcpu, pending_vec, false);
- pr_debug("Set back pending irq %d\n", pending_vec);
- }
-
kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
@@ -9639,6 +9682,15 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
!is_protmode(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+skip_protected_regs:
+ max_bits = KVM_NR_INTERRUPTS;
+ pending_vec = find_first_bit(
+ (const unsigned long *)sregs->interrupt_bitmap, max_bits);
+ if (pending_vec < max_bits) {
+ kvm_queue_interrupt(vcpu, pending_vec, false);
+ pr_debug("Set back pending irq %d\n", pending_vec);
+ }
+
kvm_make_request(KVM_REQ_EVENT, vcpu);
ret = 0;
@@ -9663,6 +9715,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
unsigned long rflags;
int i, r;
+ if (vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
vcpu_load(vcpu);
if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
@@ -9742,6 +9797,9 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
struct fxregs_state *fxsave;
+ if (!vcpu->arch.guest_fpu)
+ return 0;
+
vcpu_load(vcpu);
fxsave = &vcpu->arch.guest_fpu->state.fxsave;
@@ -9762,6 +9820,9 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
struct fxregs_state *fxsave;
+ if (!vcpu->arch.guest_fpu)
+ return 0;
+
vcpu_load(vcpu);
fxsave = &vcpu->arch.guest_fpu->state.fxsave;
@@ -9820,6 +9881,9 @@ static int sync_regs(struct kvm_vcpu *vcpu)
static void fx_init(struct kvm_vcpu *vcpu)
{
+ if (!vcpu->arch.guest_fpu)
+ return;
+
fpstate_init(&vcpu->arch.guest_fpu->state);
if (boot_cpu_has(X86_FEATURE_XSAVES))
vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv =
@@ -9833,6 +9897,15 @@ static void fx_init(struct kvm_vcpu *vcpu)
vcpu->arch.cr0 |= X86_CR0_ET;
}
+void kvm_free_guest_fpu(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.guest_fpu) {
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
+ vcpu->arch.guest_fpu = NULL;
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_free_guest_fpu);
+
int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
{
if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
@@ -9928,7 +10001,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
return 0;
free_guest_fpu:
- kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
+ kvm_free_guest_fpu(vcpu);
free_user_fpu:
kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
free_emulate_ctxt:
@@ -9982,7 +10055,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
- kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
+ kvm_free_guest_fpu(vcpu);
kvm_hv_vcpu_uninit(vcpu);
kvm_pmu_destroy(vcpu);
@@ -10030,7 +10103,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_async_pf_hash_reset(vcpu);
vcpu->arch.apf.halted = false;
- if (kvm_mpx_supported()) {
+ if (vcpu->arch.guest_fpu && kvm_mpx_supported()) {
void *mpx_state_buffer;
/*
@@ -10349,7 +10422,32 @@ void kvm_arch_sync_events(struct kvm *kvm)
kvm_free_pit(kvm);
}
-int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
+#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e))
+
+/**
+ * __x86_set_memory_region: Setup KVM internal memory slot
+ *
+ * @kvm: the kvm pointer to the VM.
+ * @id: the slot ID to setup.
+ * @gpa: the GPA to install the slot (unused when @size == 0).
+ * @size: the size of the slot. Set to zero to uninstall a slot.
+ *
+ * This function helps to setup a KVM internal memory slot. Specify
+ * @size > 0 to install a new slot, while @size == 0 to uninstall a
+ * slot. The return code can be one of the following:
+ *
+ * HVA: on success (uninstall will return a bogus HVA)
+ * -errno: on error
+ *
+ * The caller should always use IS_ERR() to check the return value
+ * before use. Note, the KVM internal memory slots are guaranteed to
+ * remain valid and unchanged until the VM is destroyed, i.e., the
+ * GPA->HVA translation will not change. However, the HVA is a user
+ * address, i.e. its accessibility is not guaranteed, and must be
+ * accessed via __copy_{to,from}_user().
+ */
+void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
+ u32 size)
{
int i, r;
unsigned long hva, old_npages;
@@ -10358,12 +10456,12 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
/* Called with kvm->slots_lock held. */
if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
- return -EINVAL;
+ return ERR_PTR_USR(-EINVAL);
slot = id_to_memslot(slots, id);
if (size) {
if (slot && slot->npages)
- return -EEXIST;
+ return ERR_PTR_USR(-EEXIST);
/*
* MAP_SHARED to prevent internal slot pages from being moved
@@ -10372,7 +10470,7 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_ANONYMOUS, 0);
if (IS_ERR((void *)hva))
- return PTR_ERR((void *)hva);
+ return (void __user *)hva;
} else {
if (!slot || !slot->npages)
return 0;
@@ -10391,13 +10489,13 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
m.memory_size = size;
r = __kvm_set_memory_region(kvm, &m);
if (r < 0)
- return r;
+ return ERR_PTR_USR(r);
}
if (!size)
vm_munmap(hva, old_npages * PAGE_SIZE);
- return 0;
+ return (void __user *)hva;
}
EXPORT_SYMBOL_GPL(__x86_set_memory_region);
@@ -10754,6 +10852,10 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
{
+ /* Can't read the RIP when guest state is protected, just return 0 */
+ if (vcpu->arch.guest_state_protected)
+ return 0;
+
if (is_64_bit_mode(vcpu))
return kvm_rip_read(vcpu);
return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) +
@@ -11263,6 +11365,179 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
}
EXPORT_SYMBOL_GPL(kvm_handle_invpcid);
+static int complete_sev_es_emulated_mmio(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *run = vcpu->run;
+ struct kvm_mmio_fragment *frag;
+ unsigned int len;
+
+ BUG_ON(!vcpu->mmio_needed);
+
+ /* Complete previous fragment */
+ frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
+ len = min(8u, frag->len);
+ if (!vcpu->mmio_is_write)
+ memcpy(frag->data, run->mmio.data, len);
+
+ if (frag->len <= 8) {
+ /* Switch to the next fragment. */
+ frag++;
+ vcpu->mmio_cur_fragment++;
+ } else {
+ /* Go forward to the next mmio piece. */
+ frag->data += len;
+ frag->gpa += len;
+ frag->len -= len;
+ }
+
+ if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
+ vcpu->mmio_needed = 0;
+
+ // VMG change, at this point, we're always done
+ // RIP has already been advanced
+ return 1;
+ }
+
+ // More MMIO is needed
+ run->mmio.phys_addr = frag->gpa;
+ run->mmio.len = min(8u, frag->len);
+ run->mmio.is_write = vcpu->mmio_is_write;
+ if (run->mmio.is_write)
+ memcpy(run->mmio.data, frag->data, min(8u, frag->len));
+ run->exit_reason = KVM_EXIT_MMIO;
+
+ vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
+
+ return 0;
+}
+
+int kvm_sev_es_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
+ void *data)
+{
+ int handled;
+ struct kvm_mmio_fragment *frag;
+
+ if (!data)
+ return -EINVAL;
+
+ handled = write_emultor.read_write_mmio(vcpu, gpa, bytes, data);
+ if (handled == bytes)
+ return 1;
+
+ bytes -= handled;
+ gpa += handled;
+ data += handled;
+
+ /*TODO: Check if need to increment number of frags */
+ frag = vcpu->mmio_fragments;
+ vcpu->mmio_nr_fragments = 1;
+ frag->len = bytes;
+ frag->gpa = gpa;
+ frag->data = data;
+
+ vcpu->mmio_needed = 1;
+ vcpu->mmio_cur_fragment = 0;
+
+ vcpu->run->mmio.phys_addr = gpa;
+ vcpu->run->mmio.len = min(8u, frag->len);
+ vcpu->run->mmio.is_write = 1;
+ memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
+ vcpu->run->exit_reason = KVM_EXIT_MMIO;
+
+ vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_write);
+
+int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
+ void *data)
+{
+ int handled;
+ struct kvm_mmio_fragment *frag;
+
+ if (!data)
+ return -EINVAL;
+
+ handled = read_emultor.read_write_mmio(vcpu, gpa, bytes, data);
+ if (handled == bytes)
+ return 1;
+
+ bytes -= handled;
+ gpa += handled;
+ data += handled;
+
+ /*TODO: Check if need to increment number of frags */
+ frag = vcpu->mmio_fragments;
+ vcpu->mmio_nr_fragments = 1;
+ frag->len = bytes;
+ frag->gpa = gpa;
+ frag->data = data;
+
+ vcpu->mmio_needed = 1;
+ vcpu->mmio_cur_fragment = 0;
+
+ vcpu->run->mmio.phys_addr = gpa;
+ vcpu->run->mmio.len = min(8u, frag->len);
+ vcpu->run->mmio.is_write = 0;
+ vcpu->run->exit_reason = KVM_EXIT_MMIO;
+
+ vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read);
+
+static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
+{
+ memcpy(vcpu->arch.guest_ins_data, vcpu->arch.pio_data,
+ vcpu->arch.pio.count * vcpu->arch.pio.size);
+ vcpu->arch.pio.count = 0;
+
+ return 1;
+}
+
+static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
+ unsigned int port, void *data, unsigned int count)
+{
+ int ret;
+
+ ret = emulator_pio_out_emulated(vcpu->arch.emulate_ctxt, size, port,
+ data, count);
+ if (ret)
+ return ret;
+
+ vcpu->arch.pio.count = 0;
+
+ return 0;
+}
+
+static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
+ unsigned int port, void *data, unsigned int count)
+{
+ int ret;
+
+ ret = emulator_pio_in_emulated(vcpu->arch.emulate_ctxt, size, port,
+ data, count);
+ if (ret) {
+ vcpu->arch.pio.count = 0;
+ } else {
+ vcpu->arch.guest_ins_data = data;
+ vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins;
+ }
+
+ return 0;
+}
+
+int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
+ unsigned int port, void *data, unsigned int count,
+ int in)
+{
+ return in ? kvm_sev_es_ins(vcpu, size, port, data, count)
+ : kvm_sev_es_outs(vcpu, size, port, data, count);
+}
+EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);
+
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
@@ -11285,3 +11560,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index e7ca622a468f..c5ee0f5ce0f1 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -3,6 +3,7 @@
#define ARCH_X86_KVM_X86_H
#include <linux/kvm_host.h>
+#include <asm/mce.h>
#include <asm/pvclock.h>
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
@@ -278,6 +279,7 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
extern u64 host_xcr0;
extern u64 supported_xcr0;
+extern u64 host_xss;
extern u64 supported_xss;
static inline bool kvm_mpx_supported(void)
@@ -366,10 +368,29 @@ static inline bool kvm_dr6_valid(u64 data)
return !(data >> 32);
}
+/*
+ * Trigger machine check on the host. We assume all the MSRs are already set up
+ * by the CPU and that we still run on the same CPU as the MCE occurred on.
+ * We pass a fake environment to the machine check handler because we want
+ * the guest to be always treated like user space, no matter what context
+ * it used internally.
+ */
+static inline void kvm_machine_check(void)
+{
+#if defined(CONFIG_X86_MCE)
+ struct pt_regs regs = {
+ .cs = 3, /* Fake ring 3 no matter what the guest ran on */
+ .flags = X86_EFLAGS_IF,
+ };
+
+ do_machine_check(&regs);
+#endif
+}
+
void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu);
void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu);
int kvm_spec_ctrl_test_value(u64 value);
-int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu);
int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
struct x86_exception *e);
@@ -407,4 +428,12 @@ bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type);
__reserved_bits; \
})
+int kvm_sev_es_mmio_write(struct kvm_vcpu *vcpu, gpa_t src, unsigned int bytes,
+ void *dst);
+int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t src, unsigned int bytes,
+ void *dst);
+int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
+ unsigned int port, void *data, unsigned int count,
+ int in);
+
#endif