summaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm/svm/svm.c
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2024-02-14 10:45:07 +0100
committerIngo Molnar <mingo@kernel.org>2024-02-14 10:45:07 +0100
commit03c11eb3b16dc0058589751dfd91f254be2be613 (patch)
treee5f2889212fec0bb0babdce9abd781ab487e246a /arch/x86/kvm/svm/svm.c
parentx86/percpu: Use %RIP-relative address in untagged_addr() (diff)
parentLinux 6.8-rc4 (diff)
downloadlinux-03c11eb3b16dc0058589751dfd91f254be2be613.tar.xz
linux-03c11eb3b16dc0058589751dfd91f254be2be613.zip
Merge tag 'v6.8-rc4' into x86/percpu, to resolve conflicts and refresh the branch
Conflicts: arch/x86/include/asm/percpu.h arch/x86/include/asm/text-patching.h Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/x86/kvm/svm/svm.c')
-rw-r--r--arch/x86/kvm/svm/svm.c432
1 files changed, 280 insertions, 152 deletions
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index d4bfdc607fe7..e90b429c84f1 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -39,10 +39,9 @@
#include <asm/spec-ctrl.h>
#include <asm/cpu_device_id.h>
#include <asm/traps.h>
+#include <asm/reboot.h>
#include <asm/fpu/api.h>
-#include <asm/virtext.h>
-
#include <trace/events/ipi.h>
#include "trace.h"
@@ -104,6 +103,7 @@ static const struct svm_direct_access_msrs {
{ .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
{ .index = MSR_IA32_LASTINTFROMIP, .always = false },
{ .index = MSR_IA32_LASTINTTOIP, .always = false },
+ { .index = MSR_IA32_XSS, .always = false },
{ .index = MSR_EFER, .always = false },
{ .index = MSR_IA32_CR_PAT, .always = false },
{ .index = MSR_AMD64_SEV_ES_GHCB, .always = true },
@@ -200,10 +200,10 @@ module_param_named(npt, npt_enabled, bool, 0444);
/* allow nested virtualization in KVM/SVM */
static int nested = true;
-module_param(nested, int, S_IRUGO);
+module_param(nested, int, 0444);
/* enable/disable Next RIP Save */
-static int nrips = true;
+int nrips = true;
module_param(nrips, int, 0444);
/* enable/disable Virtual VMLOAD VMSAVE */
@@ -517,50 +517,79 @@ static void svm_init_osvw(struct kvm_vcpu *vcpu)
vcpu->arch.osvw.status |= 1;
}
-static bool kvm_is_svm_supported(void)
+static bool __kvm_is_svm_supported(void)
{
- int cpu = raw_smp_processor_id();
- const char *msg;
- u64 vm_cr;
+ int cpu = smp_processor_id();
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
- if (!cpu_has_svm(&msg)) {
- pr_err("SVM not supported by CPU %d, %s\n", cpu, msg);
+ if (c->x86_vendor != X86_VENDOR_AMD &&
+ c->x86_vendor != X86_VENDOR_HYGON) {
+ pr_err("CPU %d isn't AMD or Hygon\n", cpu);
return false;
}
- if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
- pr_info("KVM is unsupported when running as an SEV guest\n");
+ if (!cpu_has(c, X86_FEATURE_SVM)) {
+ pr_err("SVM not supported by CPU %d\n", cpu);
return false;
}
- rdmsrl(MSR_VM_CR, vm_cr);
- if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) {
- pr_err("SVM disabled (by BIOS) in MSR_VM_CR on CPU %d\n", cpu);
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
+ pr_info("KVM is unsupported when running as an SEV guest\n");
return false;
}
return true;
}
+static bool kvm_is_svm_supported(void)
+{
+ bool supported;
+
+ migrate_disable();
+ supported = __kvm_is_svm_supported();
+ migrate_enable();
+
+ return supported;
+}
+
static int svm_check_processor_compat(void)
{
- if (!kvm_is_svm_supported())
+ if (!__kvm_is_svm_supported())
return -EIO;
return 0;
}
-void __svm_write_tsc_multiplier(u64 multiplier)
+static void __svm_write_tsc_multiplier(u64 multiplier)
{
- preempt_disable();
-
if (multiplier == __this_cpu_read(current_tsc_ratio))
- goto out;
+ return;
wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
__this_cpu_write(current_tsc_ratio, multiplier);
-out:
- preempt_enable();
+}
+
+static inline void kvm_cpu_svm_disable(void)
+{
+ uint64_t efer;
+
+ wrmsrl(MSR_VM_HSAVE_PA, 0);
+ rdmsrl(MSR_EFER, efer);
+ if (efer & EFER_SVME) {
+ /*
+ * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and
+ * NMI aren't blocked.
+ */
+ stgi();
+ wrmsrl(MSR_EFER, efer & ~EFER_SVME);
+ }
+}
+
+static void svm_emergency_disable(void)
+{
+ kvm_rebooting = true;
+
+ kvm_cpu_svm_disable();
}
static void svm_hardware_disable(void)
@@ -569,7 +598,7 @@ static void svm_hardware_disable(void)
if (tsc_scaling)
__svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
- cpu_svm_disable();
+ kvm_cpu_svm_disable();
amd_pmu_disable_virt();
}
@@ -637,6 +666,21 @@ static int svm_hardware_enable(void)
amd_pmu_enable_virt();
+ /*
+ * If TSC_AUX virtualization is supported, TSC_AUX becomes a swap type
+ * "B" field (see sev_es_prepare_switch_to_guest()) for SEV-ES guests.
+ * Since Linux does not change the value of TSC_AUX once set, prime the
+ * TSC_AUX field now to avoid a RDMSR on every vCPU run.
+ */
+ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) {
+ struct sev_es_save_area *hostsa;
+ u32 __maybe_unused msr_hi;
+
+ hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
+
+ rdmsr(MSR_TSC_AUX, hostsa->tsc_aux, msr_hi);
+ }
+
return 0;
}
@@ -677,6 +721,39 @@ free_save_area:
}
+static void set_dr_intercepts(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
+
+ recalc_intercepts(svm);
+}
+
+static void clr_dr_intercepts(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ vmcb->control.intercepts[INTERCEPT_DR] = 0;
+
+ recalc_intercepts(svm);
+}
+
static int direct_access_msr_slot(u32 msr)
{
u32 i;
@@ -819,8 +896,7 @@ void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
if (intercept == svm->x2avic_msrs_intercepted)
return;
- if (!x2avic_enabled ||
- !apic_x2apic_mode(svm->vcpu.arch.apic))
+ if (!x2avic_enabled)
return;
for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) {
@@ -947,50 +1023,24 @@ static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
}
-static int svm_get_lbr_msr(struct vcpu_svm *svm, u32 index)
+static struct vmcb *svm_get_lbr_vmcb(struct vcpu_svm *svm)
{
/*
- * If the LBR virtualization is disabled, the LBR msrs are always
- * kept in the vmcb01 to avoid copying them on nested guest entries.
- *
- * If nested, and the LBR virtualization is enabled/disabled, the msrs
- * are moved between the vmcb01 and vmcb02 as needed.
+ * If LBR virtualization is disabled, the LBR MSRs are always kept in
+ * vmcb01. If LBR virtualization is enabled and L1 is running VMs of
+ * its own, the MSRs are moved between vmcb01 and vmcb02 as needed.
*/
- struct vmcb *vmcb =
- (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) ?
- svm->vmcb : svm->vmcb01.ptr;
-
- switch (index) {
- case MSR_IA32_DEBUGCTLMSR:
- return vmcb->save.dbgctl;
- case MSR_IA32_LASTBRANCHFROMIP:
- return vmcb->save.br_from;
- case MSR_IA32_LASTBRANCHTOIP:
- return vmcb->save.br_to;
- case MSR_IA32_LASTINTFROMIP:
- return vmcb->save.last_excp_from;
- case MSR_IA32_LASTINTTOIP:
- return vmcb->save.last_excp_to;
- default:
- KVM_BUG(false, svm->vcpu.kvm,
- "%s: Unknown MSR 0x%x", __func__, index);
- return 0;
- }
+ return svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK ? svm->vmcb :
+ svm->vmcb01.ptr;
}
void svm_update_lbrv(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
-
- bool enable_lbrv = svm_get_lbr_msr(svm, MSR_IA32_DEBUGCTLMSR) &
- DEBUGCTLMSR_LBR;
-
- bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext &
- LBR_CTL_ENABLE_MASK);
-
- if (unlikely(is_guest_mode(vcpu) && svm->lbrv_enabled))
- if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))
- enable_lbrv = true;
+ bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK;
+ bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) ||
+ (is_guest_mode(vcpu) && guest_can_use(vcpu, X86_FEATURE_LBRV) &&
+ (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK));
if (enable_lbrv == current_enable_lbrv)
return;
@@ -1101,21 +1151,23 @@ static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
return svm->tsc_ratio_msr;
}
-static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+static void svm_write_tsc_offset(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
- svm->vmcb->control.tsc_offset = offset;
+ svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset;
vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
-static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
+void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu)
{
- __svm_write_tsc_multiplier(multiplier);
+ preempt_disable();
+ if (to_svm(vcpu)->guest_state_loaded)
+ __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
+ preempt_enable();
}
-
/* Evaluate instruction intercepts that depend on guest CPUID features. */
static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
struct vcpu_svm *svm)
@@ -1156,8 +1208,6 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
-
- svm->v_vmload_vmsave_enabled = false;
} else {
/*
* If hardware supports Virtual VMLOAD VMSAVE then enable it
@@ -1201,10 +1251,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
* Guest access to VMware backdoor ports could legitimately
* trigger #GP because of TSS I/O permission bitmap.
* We intercept those #GP and allow access to them anyway
- * as VMware does. Don't intercept #GP for SEV guests as KVM can't
- * decrypt guest memory to decode the faulting instruction.
+ * as VMware does.
*/
- if (enable_vmware_backdoor && !sev_guest(vcpu->kvm))
+ if (enable_vmware_backdoor)
set_exception_intercept(svm, GP_VECTOR);
svm_set_intercept(svm, INTERCEPT_INTR);
@@ -1480,7 +1529,14 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
if (tsc_scaling)
__svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
- if (likely(tsc_aux_uret_slot >= 0))
+ /*
+ * TSC_AUX is always virtualized for SEV-ES guests when the feature is
+ * available. The user return MSR support is not required in this case
+ * because TSC_AUX is restored on #VMEXIT from the host save area
+ * (which has been initialized in svm_hardware_enable()).
+ */
+ if (likely(tsc_aux_uret_slot >= 0) &&
+ (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm)))
kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
svm->guest_state_loaded = true;
@@ -1800,15 +1856,17 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
bool old_paging = is_paging(vcpu);
#ifdef CONFIG_X86_64
- if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
+ if (vcpu->arch.efer & EFER_LME) {
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
vcpu->arch.efer |= EFER_LMA;
- svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
+ if (!vcpu->arch.guest_state_protected)
+ svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
}
if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
vcpu->arch.efer &= ~EFER_LMA;
- svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
+ if (!vcpu->arch.guest_state_protected)
+ svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
}
}
#endif
@@ -1949,7 +2007,7 @@ static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (vcpu->arch.guest_state_protected)
+ if (WARN_ON_ONCE(sev_es_guest(vcpu->kvm)))
return;
get_debugreg(vcpu->arch.db[0], 0);
@@ -2129,12 +2187,6 @@ static int shutdown_interception(struct kvm_vcpu *vcpu)
struct kvm_run *kvm_run = vcpu->run;
struct vcpu_svm *svm = to_svm(vcpu);
- /*
- * The VM save area has already been encrypted so it
- * cannot be reinitialized - just terminate.
- */
- if (sev_es_guest(vcpu->kvm))
- return -EINVAL;
/*
* VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put
@@ -2143,9 +2195,14 @@ static int shutdown_interception(struct kvm_vcpu *vcpu)
* userspace. At a platform view, INIT is acceptable behavior as
* there exist bare metal platforms that automatically INIT the CPU
* in response to shutdown.
+ *
+ * The VM save area for SEV-ES guests has already been encrypted so it
+ * cannot be reinitialized, i.e. synthesizing INIT is futile.
*/
- clear_page(svm->vmcb);
- kvm_vcpu_reset(vcpu, true);
+ if (!sev_es_guest(vcpu->kvm)) {
+ clear_page(svm->vmcb);
+ kvm_vcpu_reset(vcpu, true);
+ }
kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
return 0;
@@ -2510,12 +2567,13 @@ static int iret_interception(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ WARN_ON_ONCE(sev_es_guest(vcpu->kvm));
+
++vcpu->stat.nmi_window_exits;
svm->awaiting_iret_completion = true;
svm_clr_iret_intercept(svm);
- if (!sev_es_guest(vcpu->kvm))
- svm->nmi_iret_rip = kvm_rip_read(vcpu);
+ svm->nmi_iret_rip = kvm_rip_read(vcpu);
kvm_make_request(KVM_REQ_EVENT, vcpu);
return 1;
@@ -2680,6 +2738,13 @@ static int dr_interception(struct kvm_vcpu *vcpu)
unsigned long val;
int err = 0;
+ /*
+ * SEV-ES intercepts DR7 only to disable guest debugging and the guest issues a VMGEXIT
+ * for DR7 write only. KVM cannot change DR7 (always swapped as type 'A') so return early.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return 1;
+
if (vcpu->guest_debug == 0) {
/*
* No more DR vmexits; force a reload of the debug registers
@@ -2764,7 +2829,8 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
switch (msr_info->index) {
case MSR_AMD64_TSC_RATIO:
- if (!msr_info->host_initiated && !svm->tsc_scaling_enabled)
+ if (!msr_info->host_initiated &&
+ !guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR))
return 1;
msr_info->data = svm->tsc_ratio_msr;
break;
@@ -2802,11 +2868,19 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = svm->tsc_aux;
break;
case MSR_IA32_DEBUGCTLMSR:
+ msr_info->data = svm_get_lbr_vmcb(svm)->save.dbgctl;
+ break;
case MSR_IA32_LASTBRANCHFROMIP:
+ msr_info->data = svm_get_lbr_vmcb(svm)->save.br_from;
+ break;
case MSR_IA32_LASTBRANCHTOIP:
+ msr_info->data = svm_get_lbr_vmcb(svm)->save.br_to;
+ break;
case MSR_IA32_LASTINTFROMIP:
+ msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_from;
+ break;
case MSR_IA32_LASTINTTOIP:
- msr_info->data = svm_get_lbr_msr(svm, msr_info->index);
+ msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_to;
break;
case MSR_VM_HSAVE_PA:
msr_info->data = svm->nested.hsave_msr;
@@ -2906,7 +2980,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
switch (ecx) {
case MSR_AMD64_TSC_RATIO:
- if (!svm->tsc_scaling_enabled) {
+ if (!guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) {
if (!msr->host_initiated)
return 1;
@@ -2928,7 +3002,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
svm->tsc_ratio_msr = data;
- if (svm->tsc_scaling_enabled && is_guest_mode(vcpu))
+ if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) &&
+ is_guest_mode(vcpu))
nested_svm_update_tsc_ratio_msr(vcpu);
break;
@@ -3017,6 +3092,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
break;
case MSR_TSC_AUX:
/*
+ * TSC_AUX is always virtualized for SEV-ES guests when the
+ * feature is available. The user return MSR support is not
+ * required in this case because TSC_AUX is restored on #VMEXIT
+ * from the host save area (which has been initialized in
+ * svm_hardware_enable()).
+ */
+ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm))
+ break;
+
+ /*
* TSC_AUX is usually changed only during boot and never read
* directly. Intercept TSC_AUX instead of exposing it to the
* guest via direct_access_msrs, and switch it via user return.
@@ -3037,13 +3122,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
if (data & DEBUGCTL_RESERVED_BITS)
return 1;
- if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK)
- svm->vmcb->save.dbgctl = data;
- else
- svm->vmcb01.ptr->save.dbgctl = data;
-
+ svm_get_lbr_vmcb(svm)->save.dbgctl = data;
svm_update_lbrv(vcpu);
-
break;
case MSR_VM_HSAVE_PA:
/*
@@ -3483,8 +3563,15 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
if (svm->nmi_l1_to_l2)
return;
- svm->nmi_masked = true;
- svm_set_iret_intercept(svm);
+ /*
+ * No need to manually track NMI masking when vNMI is enabled, hardware
+ * automatically sets V_NMI_BLOCKING_MASK as appropriate, including the
+ * case where software directly injects an NMI.
+ */
+ if (!is_vnmi_enabled(svm)) {
+ svm->nmi_masked = true;
+ svm_set_iret_intercept(svm);
+ }
++vcpu->stat.nmi_injections;
}
@@ -3769,6 +3856,19 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
if (svm_get_nmi_mask(vcpu) && !svm->awaiting_iret_completion)
return; /* IRET will cause a vm exit */
+ /*
+ * SEV-ES guests are responsible for signaling when a vCPU is ready to
+ * receive a new NMI, as SEV-ES guests can't be single-stepped, i.e.
+ * KVM can't intercept and single-step IRET to detect when NMIs are
+ * unblocked (architecturally speaking). See SVM_VMGEXIT_NMI_COMPLETE.
+ *
+ * Note, GIF is guaranteed to be '1' for SEV-ES guests as hardware
+ * ignores SEV-ES guest writes to EFER.SVME *and* CLGI/STGI are not
+ * supported NAEs in the GHCB protocol.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return;
+
if (!gif_set(svm)) {
if (vgif)
svm_set_intercept(svm, INTERCEPT_STGI);
@@ -3918,12 +4018,11 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
svm->soft_int_injected = false;
/*
- * If we've made progress since setting HF_IRET_MASK, we've
+ * If we've made progress since setting awaiting_iret_completion, we've
* executed an IRET and can allow NMI injection.
*/
if (svm->awaiting_iret_completion &&
- (sev_es_guest(vcpu->kvm) ||
- kvm_rip_read(vcpu) != svm->nmi_iret_rip)) {
+ kvm_rip_read(vcpu) != svm->nmi_iret_rip) {
svm->awaiting_iret_completion = false;
svm->nmi_masked = false;
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -4207,30 +4306,38 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct kvm_cpuid_entry2 *best;
-
- vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
- boot_cpu_has(X86_FEATURE_XSAVE) &&
- boot_cpu_has(X86_FEATURE_XSAVES);
-
- /* Update nrips enabled cache */
- svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
- guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
- svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);
- svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV);
-
- svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
-
- svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) &&
- guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
+ /*
+ * SVM doesn't provide a way to disable just XSAVES in the guest, KVM
+ * can only disable all variants of by disallowing CR4.OSXSAVE from
+ * being set. As a result, if the host has XSAVE and XSAVES, and the
+ * guest has XSAVE enabled, the guest can execute XSAVES without
+ * faulting. Treat XSAVES as enabled in this case regardless of
+ * whether it's advertised to the guest so that KVM context switches
+ * XSS on VM-Enter/VM-Exit. Failure to do so would effectively give
+ * the guest read/write access to the host's XSS.
+ */
+ if (boot_cpu_has(X86_FEATURE_XSAVE) &&
+ boot_cpu_has(X86_FEATURE_XSAVES) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_XSAVE))
+ kvm_governed_feature_set(vcpu, X86_FEATURE_XSAVES);
- svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) &&
- guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD);
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_NRIPS);
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_TSCRATEMSR);
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV);
- svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF);
+ /*
+ * Intercept VMLOAD if the vCPU mode is Intel in order to emulate that
+ * VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing
+ * SVM on Intel is bonkers and extremely unlikely to work).
+ */
+ if (!guest_cpuid_is_intel(vcpu))
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
- svm->vnmi_enabled = vnmi && guest_cpuid_has(vcpu, X86_FEATURE_VNMI);
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PAUSEFILTER);
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PFTHRESHOLD);
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VGIF);
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VNMI);
svm_recalc_instruction_intercepts(vcpu, svm);
@@ -4242,12 +4349,8 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0,
!!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
- /* For sev guests, the memory encryption bit is not reserved in CR3. */
- if (sev_guest(vcpu->kvm)) {
- best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
- if (best)
- vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
- }
+ if (sev_guest(vcpu->kvm))
+ sev_vcpu_after_set_cpuid(svm);
init_vmcb_after_set_cpuid(vcpu);
}
@@ -4615,15 +4718,15 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
}
#endif
-static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
- void *insn, int insn_len)
+static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
{
bool smep, smap, is_user;
u64 error_code;
/* Emulation is always possible when KVM has access to all guest state. */
if (!sev_guest(vcpu->kvm))
- return true;
+ return X86EMUL_CONTINUE;
/* #UD and #GP should never be intercepted for SEV guests. */
WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
@@ -4635,33 +4738,44 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
* to guest register state.
*/
if (sev_es_guest(vcpu->kvm))
- return false;
+ return X86EMUL_RETRY_INSTR;
/*
* Emulation is possible if the instruction is already decoded, e.g.
* when completing I/O after returning from userspace.
*/
if (emul_type & EMULTYPE_NO_DECODE)
- return true;
+ return X86EMUL_CONTINUE;
/*
* Emulation is possible for SEV guests if and only if a prefilled
* buffer containing the bytes of the intercepted instruction is
* available. SEV guest memory is encrypted with a guest specific key
- * and cannot be decrypted by KVM, i.e. KVM would read cyphertext and
+ * and cannot be decrypted by KVM, i.e. KVM would read ciphertext and
* decode garbage.
*
- * Inject #UD if KVM reached this point without an instruction buffer.
- * In practice, this path should never be hit by a well-behaved guest,
- * e.g. KVM doesn't intercept #UD or #GP for SEV guests, but this path
- * is still theoretically reachable, e.g. via unaccelerated fault-like
- * AVIC access, and needs to be handled by KVM to avoid putting the
- * guest into an infinite loop. Injecting #UD is somewhat arbitrary,
- * but its the least awful option given lack of insight into the guest.
+ * If KVM is NOT trying to simply skip an instruction, inject #UD if
+ * KVM reached this point without an instruction buffer. In practice,
+ * this path should never be hit by a well-behaved guest, e.g. KVM
+ * doesn't intercept #UD or #GP for SEV guests, but this path is still
+ * theoretically reachable, e.g. via unaccelerated fault-like AVIC
+ * access, and needs to be handled by KVM to avoid putting the guest
+ * into an infinite loop. Injecting #UD is somewhat arbitrary, but
+ * its the least awful option given lack of insight into the guest.
+ *
+ * If KVM is trying to skip an instruction, simply resume the guest.
+ * If a #NPF occurs while the guest is vectoring an INT3/INTO, then KVM
+ * will attempt to re-inject the INT3/INTO and skip the instruction.
+ * In that scenario, retrying the INT3/INTO and hoping the guest will
+ * make forward progress is the only option that has a chance of
+ * success (and in practice it will work the vast majority of the time).
*/
if (unlikely(!insn)) {
+ if (emul_type & EMULTYPE_SKIP)
+ return X86EMUL_UNHANDLEABLE;
+
kvm_queue_exception(vcpu, UD_VECTOR);
- return false;
+ return X86EMUL_PROPAGATE_FAULT;
}
/*
@@ -4672,7 +4786,7 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
* table used to translate CS:RIP resides in emulated MMIO.
*/
if (likely(insn_len))
- return true;
+ return X86EMUL_CONTINUE;
/*
* Detect and workaround Errata 1096 Fam_17h_00_0Fh.
@@ -4730,6 +4844,7 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
kvm_inject_gp(vcpu, 0);
else
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ return X86EMUL_PROPAGATE_FAULT;
}
resume_guest:
@@ -4747,7 +4862,7 @@ resume_guest:
* doesn't explicitly define "ignored", i.e. doing nothing and letting
* the guest spin is technically "ignoring" the access.
*/
- return false;
+ return X86EMUL_RETRY_INSTR;
}
static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
@@ -4907,7 +5022,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
.vm_move_enc_context_from = sev_vm_move_enc_context_from,
- .can_emulate_instruction = svm_can_emulate_instruction,
+ .check_emulate_instruction = svm_check_emulate_instruction,
.apic_init_signal_blocked = svm_apic_init_signal_blocked,
@@ -4971,6 +5086,13 @@ static __init void svm_set_cpu_caps(void)
kvm_cpu_cap_set(X86_FEATURE_SVM);
kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);
+ /*
+ * KVM currently flushes TLBs on *every* nested SVM transition,
+ * and so for all intents and purposes KVM supports flushing by
+ * ASID, i.e. KVM is guaranteed to honor every L1 ASID flush.
+ */
+ kvm_cpu_cap_set(X86_FEATURE_FLUSHBYASID);
+
if (nrips)
kvm_cpu_cap_set(X86_FEATURE_NRIPS);
@@ -5112,9 +5234,11 @@ static __init int svm_hardware_setup(void)
svm_adjust_mmio_mask();
+ nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS);
+
/*
* Note, SEV setup consumes npt_enabled and enable_mmio_caching (which
- * may be modified by svm_adjust_mmio_mask()).
+ * may be modified by svm_adjust_mmio_mask()), as well as nrips.
*/
sev_hardware_setup();
@@ -5126,11 +5250,6 @@ static __init int svm_hardware_setup(void)
goto err;
}
- if (nrips) {
- if (!boot_cpu_has(X86_FEATURE_NRIPS))
- nrips = false;
- }
-
enable_apicv = avic = avic && avic_hardware_setup();
if (!enable_apicv) {
@@ -5213,6 +5332,13 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = {
.pmu_ops = &amd_pmu_ops,
};
+static void __svm_exit(void)
+{
+ kvm_x86_vendor_exit();
+
+ cpu_emergency_unregister_virt_callback(svm_emergency_disable);
+}
+
static int __init svm_init(void)
{
int r;
@@ -5226,6 +5352,8 @@ static int __init svm_init(void)
if (r)
return r;
+ cpu_emergency_register_virt_callback(svm_emergency_disable);
+
/*
* Common KVM initialization _must_ come last, after this, /dev/kvm is
* exposed to userspace!
@@ -5238,14 +5366,14 @@ static int __init svm_init(void)
return 0;
err_kvm_init:
- kvm_x86_vendor_exit();
+ __svm_exit();
return r;
}
static void __exit svm_exit(void)
{
kvm_exit();
- kvm_x86_vendor_exit();
+ __svm_exit();
}
module_init(svm_init)