summaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/cpuid.c32
-rw-r--r--arch/x86/kvm/pmu.h26
-rw-r--r--arch/x86/kvm/svm/nested.c12
-rw-r--r--arch/x86/kvm/vmx/vmx.c21
-rw-r--r--arch/x86/kvm/x86.c46
-rw-r--r--arch/x86/kvm/xen.c90
6 files changed, 132 insertions, 95 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index b14653b61470..596061c1610e 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -770,16 +770,22 @@ struct kvm_cpuid_array {
int nent;
};
+static struct kvm_cpuid_entry2 *get_next_cpuid(struct kvm_cpuid_array *array)
+{
+ if (array->nent >= array->maxnent)
+ return NULL;
+
+ return &array->entries[array->nent++];
+}
+
static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array,
u32 function, u32 index)
{
- struct kvm_cpuid_entry2 *entry;
+ struct kvm_cpuid_entry2 *entry = get_next_cpuid(array);
- if (array->nent >= array->maxnent)
+ if (!entry)
return NULL;
- entry = &array->entries[array->nent++];
-
memset(entry, 0, sizeof(*entry));
entry->function = function;
entry->index = index;
@@ -956,22 +962,13 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->edx = edx.full;
break;
}
- /*
- * Per Intel's SDM, the 0x1f is a superset of 0xb,
- * thus they can be handled by common code.
- */
case 0x1f:
case 0xb:
/*
- * Populate entries until the level type (ECX[15:8]) of the
- * previous entry is zero. Note, CPUID EAX.{0x1f,0xb}.0 is
- * the starting entry, filled by the primary do_host_cpuid().
+ * No topology; a valid topology is indicated by the presence
+ * of subleaf 1.
*/
- for (i = 1; entry->ecx & 0xff00; ++i) {
- entry = do_host_cpuid(array, function, i);
- if (!entry)
- goto out;
- }
+ entry->eax = entry->ebx = entry->ecx = 0;
break;
case 0xd: {
u64 permitted_xcr0 = kvm_caps.supported_xcr0 & xstate_get_guest_group_perm();
@@ -1202,6 +1199,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->ebx = entry->ecx = entry->edx = 0;
break;
case 0x8000001e:
+ /* Do not return host topology information. */
+ entry->eax = entry->ebx = entry->ecx = 0;
+ entry->edx = 0; /* reserved */
break;
case 0x8000001F:
if (!kvm_cpu_cap_has(X86_FEATURE_SEV)) {
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index cdb91009701d..ee67ba625094 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -165,15 +165,27 @@ static inline void kvm_init_pmu_capability(void)
{
bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL;
- perf_get_x86_pmu_capability(&kvm_pmu_cap);
-
- /*
- * For Intel, only support guest architectural pmu
- * on a host with architectural pmu.
- */
- if ((is_intel && !kvm_pmu_cap.version) || !kvm_pmu_cap.num_counters_gp)
+ /*
+ * Hybrid PMUs don't play nice with virtualization without careful
+ * configuration by userspace, and KVM's APIs for reporting supported
+ * vPMU features do not account for hybrid PMUs. Disable vPMU support
+ * for hybrid PMUs until KVM gains a way to let userspace opt-in.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
enable_pmu = false;
+ if (enable_pmu) {
+ perf_get_x86_pmu_capability(&kvm_pmu_cap);
+
+ /*
+ * For Intel, only support guest architectural pmu
+ * on a host with architectural pmu.
+ */
+ if ((is_intel && !kvm_pmu_cap.version) ||
+ !kvm_pmu_cap.num_counters_gp)
+ enable_pmu = false;
+ }
+
if (!enable_pmu) {
memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap));
return;
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index bc9cd7086fa9..add65dd59756 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -138,15 +138,13 @@ void recalc_intercepts(struct vcpu_svm *svm)
c->intercepts[i] = h->intercepts[i];
if (g->int_ctl & V_INTR_MASKING_MASK) {
- /* We only want the cr8 intercept bits of L1 */
- vmcb_clr_intercept(c, INTERCEPT_CR8_READ);
- vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
-
/*
- * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not
- * affect any interrupt we may want to inject; therefore,
- * interrupt window vmexits are irrelevant to L0.
+ * Once running L2 with HF_VINTR_MASK, EFLAGS.IF and CR8
+ * does not affect any interrupt we may want to inject;
+ * therefore, writes to CR8 are irrelevant to L0, as are
+ * interrupt window vmexits.
*/
+ vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
vmcb_clr_intercept(c, INTERCEPT_VINTR);
}
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index fc9008dbed33..7eec0226d56a 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3440,18 +3440,15 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
{
u32 ar;
- if (var->unusable || !var->present)
- ar = 1 << 16;
- else {
- ar = var->type & 15;
- ar |= (var->s & 1) << 4;
- ar |= (var->dpl & 3) << 5;
- ar |= (var->present & 1) << 7;
- ar |= (var->avl & 1) << 12;
- ar |= (var->l & 1) << 13;
- ar |= (var->db & 1) << 14;
- ar |= (var->g & 1) << 15;
- }
+ ar = var->type & 15;
+ ar |= (var->s & 1) << 4;
+ ar |= (var->dpl & 3) << 5;
+ ar |= (var->present & 1) << 7;
+ ar |= (var->avl & 1) << 12;
+ ar |= (var->l & 1) << 13;
+ ar |= (var->db & 1) << 14;
+ ar |= (var->g & 1) << 15;
+ ar |= (var->unusable || !var->present) << 16;
return ar;
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index da4bbd043a7b..a2c299d47e69 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -191,6 +191,10 @@ module_param(enable_pmu, bool, 0444);
bool __read_mostly eager_page_split = true;
module_param(eager_page_split, bool, 0644);
+/* Enable/disable SMT_RSB bug mitigation */
+bool __read_mostly mitigate_smt_rsb;
+module_param(mitigate_smt_rsb, bool, 0444);
+
/*
* Restoring the host value for MSRs that are only consumed when running in
* usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
@@ -4448,10 +4452,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = KVM_CLOCK_VALID_FLAGS;
break;
case KVM_CAP_X86_DISABLE_EXITS:
- r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE |
- KVM_X86_DISABLE_EXITS_CSTATE;
- if(kvm_can_mwait_in_guest())
- r |= KVM_X86_DISABLE_EXITS_MWAIT;
+ r = KVM_X86_DISABLE_EXITS_PAUSE;
+
+ if (!mitigate_smt_rsb) {
+ r |= KVM_X86_DISABLE_EXITS_HLT |
+ KVM_X86_DISABLE_EXITS_CSTATE;
+
+ if (kvm_can_mwait_in_guest())
+ r |= KVM_X86_DISABLE_EXITS_MWAIT;
+ }
break;
case KVM_CAP_X86_SMM:
if (!IS_ENABLED(CONFIG_KVM_SMM))
@@ -5254,12 +5263,11 @@ static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
{
unsigned long val;
+ memset(dbgregs, 0, sizeof(*dbgregs));
memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
kvm_get_dr(vcpu, 6, &val);
dbgregs->dr6 = val;
dbgregs->dr7 = vcpu->arch.dr7;
- dbgregs->flags = 0;
- memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
}
static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
@@ -6227,15 +6235,26 @@ split_irqchip_unlock:
if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS)
break;
- if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) &&
- kvm_can_mwait_in_guest())
- kvm->arch.mwait_in_guest = true;
- if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
- kvm->arch.hlt_in_guest = true;
if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
kvm->arch.pause_in_guest = true;
- if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
- kvm->arch.cstate_in_guest = true;
+
+#define SMT_RSB_MSG "This processor is affected by the Cross-Thread Return Predictions vulnerability. " \
+ "KVM_CAP_X86_DISABLE_EXITS should only be used with SMT disabled or trusted guests."
+
+ if (!mitigate_smt_rsb) {
+ if (boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible() &&
+ (cap->args[0] & ~KVM_X86_DISABLE_EXITS_PAUSE))
+ pr_warn_once(SMT_RSB_MSG);
+
+ if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) &&
+ kvm_can_mwait_in_guest())
+ kvm->arch.mwait_in_guest = true;
+ if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
+ kvm->arch.hlt_in_guest = true;
+ if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
+ kvm->arch.cstate_in_guest = true;
+ }
+
r = 0;
break;
case KVM_CAP_MSR_PLATFORM_INFO:
@@ -13456,6 +13475,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit);
static int __init kvm_x86_init(void)
{
kvm_mmu_x86_module_init();
+ mitigate_smt_rsb &= boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible();
return 0;
}
module_init(kvm_x86_init);
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 2e29bdc2949c..8fd41f5deae3 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -271,7 +271,15 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
* Attempt to obtain the GPC lock on *both* (if there are two)
* gfn_to_pfn caches that cover the region.
*/
- read_lock_irqsave(&gpc1->lock, flags);
+ if (atomic) {
+ local_irq_save(flags);
+ if (!read_trylock(&gpc1->lock)) {
+ local_irq_restore(flags);
+ return;
+ }
+ } else {
+ read_lock_irqsave(&gpc1->lock, flags);
+ }
while (!kvm_gpc_check(gpc1, user_len1)) {
read_unlock_irqrestore(&gpc1->lock, flags);
@@ -304,9 +312,18 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
* The guest's runstate_info is split across two pages and we
* need to hold and validate both GPCs simultaneously. We can
* declare a lock ordering GPC1 > GPC2 because nothing else
- * takes them more than one at a time.
+ * takes them more than one at a time. Set a subclass on the
+ * gpc1 lock to make lockdep shut up about it.
*/
- read_lock(&gpc2->lock);
+ lock_set_subclass(&gpc1->lock.dep_map, 1, _THIS_IP_);
+ if (atomic) {
+ if (!read_trylock(&gpc2->lock)) {
+ read_unlock_irqrestore(&gpc1->lock, flags);
+ return;
+ }
+ } else {
+ read_lock(&gpc2->lock);
+ }
if (!kvm_gpc_check(gpc2, user_len2)) {
read_unlock(&gpc2->lock);
@@ -590,26 +607,26 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) {
r = -EINVAL;
} else {
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
kvm->arch.xen.long_mode = !!data->u.long_mode;
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
r = 0;
}
break;
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
break;
case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
if (data->u.vector && data->u.vector < 0x10)
r = -EINVAL;
else {
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
kvm->arch.xen.upcall_vector = data->u.vector;
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
r = 0;
}
break;
@@ -619,9 +636,9 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
break;
case KVM_XEN_ATTR_TYPE_XEN_VERSION:
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
kvm->arch.xen.xen_version = data->u.xen_version;
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
r = 0;
break;
@@ -630,9 +647,9 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
r = -EOPNOTSUPP;
break;
}
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
kvm->arch.xen.runstate_update_flag = !!data->u.runstate_update_flag;
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
r = 0;
break;
@@ -647,7 +664,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
{
int r = -ENOENT;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
switch (data->type) {
case KVM_XEN_ATTR_TYPE_LONG_MODE:
@@ -686,7 +703,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
break;
}
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
return r;
}
@@ -694,7 +711,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
{
int idx, r = -ENOENT;
- mutex_lock(&vcpu->kvm->lock);
+ mutex_lock(&vcpu->kvm->arch.xen.xen_lock);
idx = srcu_read_lock(&vcpu->kvm->srcu);
switch (data->type) {
@@ -922,7 +939,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
}
srcu_read_unlock(&vcpu->kvm->srcu, idx);
- mutex_unlock(&vcpu->kvm->lock);
+ mutex_unlock(&vcpu->kvm->arch.xen.xen_lock);
return r;
}
@@ -930,7 +947,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
{
int r = -ENOENT;
- mutex_lock(&vcpu->kvm->lock);
+ mutex_lock(&vcpu->kvm->arch.xen.xen_lock);
switch (data->type) {
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
@@ -1013,7 +1030,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
break;
}
- mutex_unlock(&vcpu->kvm->lock);
+ mutex_unlock(&vcpu->kvm->arch.xen.xen_lock);
return r;
}
@@ -1106,7 +1123,7 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
xhc->blob_size_32 || xhc->blob_size_64))
return -EINVAL;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
if (xhc->msr && !kvm->arch.xen_hvm_config.msr)
static_branch_inc(&kvm_xen_enabled.key);
@@ -1115,7 +1132,7 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc));
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
return 0;
}
@@ -1658,15 +1675,7 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
mm_borrowed = true;
}
- /*
- * For the irqfd workqueue, using the main kvm->lock mutex is
- * fine since this function is invoked from kvm_set_irq() with
- * no other lock held, no srcu. In future if it will be called
- * directly from a vCPU thread (e.g. on hypercall for an IPI)
- * then it may need to switch to using a leaf-node mutex for
- * serializing the shared_info mapping.
- */
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
/*
* It is theoretically possible for the page to be unmapped
@@ -1695,7 +1704,7 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
srcu_read_unlock(&kvm->srcu, idx);
} while(!rc);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
if (mm_borrowed)
kthread_unuse_mm(kvm->mm);
@@ -1811,7 +1820,7 @@ static int kvm_xen_eventfd_update(struct kvm *kvm,
int ret;
/* Protect writes to evtchnfd as well as the idr lookup. */
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, port);
ret = -ENOENT;
@@ -1842,7 +1851,7 @@ static int kvm_xen_eventfd_update(struct kvm *kvm,
}
ret = 0;
out_unlock:
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
return ret;
}
@@ -1905,10 +1914,10 @@ static int kvm_xen_eventfd_assign(struct kvm *kvm,
evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
}
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
ret = idr_alloc(&kvm->arch.xen.evtchn_ports, evtchnfd, port, port + 1,
GFP_KERNEL);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
if (ret >= 0)
return 0;
@@ -1926,9 +1935,9 @@ static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port)
{
struct evtchnfd *evtchnfd;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
evtchnfd = idr_remove(&kvm->arch.xen.evtchn_ports, port);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
if (!evtchnfd)
return -ENOENT;
@@ -1946,7 +1955,7 @@ static int kvm_xen_eventfd_reset(struct kvm *kvm)
int i;
int n = 0;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
/*
* Because synchronize_srcu() cannot be called inside the
@@ -1958,7 +1967,7 @@ static int kvm_xen_eventfd_reset(struct kvm *kvm)
all_evtchnfds = kmalloc_array(n, sizeof(struct evtchnfd *), GFP_KERNEL);
if (!all_evtchnfds) {
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
return -ENOMEM;
}
@@ -1967,7 +1976,7 @@ static int kvm_xen_eventfd_reset(struct kvm *kvm)
all_evtchnfds[n++] = evtchnfd;
idr_remove(&kvm->arch.xen.evtchn_ports, evtchnfd->send_port);
}
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
synchronize_srcu(&kvm->srcu);
@@ -2069,6 +2078,7 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
void kvm_xen_init_vm(struct kvm *kvm)
{
+ mutex_init(&kvm->arch.xen.xen_lock);
idr_init(&kvm->arch.xen.evtchn_ports);
kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm, NULL, KVM_HOST_USES_PFN);
}