diff options
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/cpuid.c | 104 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 18 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.h | 5 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 65 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 16 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/nested.c | 66 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/nested.h | 13 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/pmu_intel.c | 7 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 41 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 91 |
10 files changed, 245 insertions, 181 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 63316036f85a..f68c0c753c38 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -363,7 +363,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) /* cpuid 7.0.ecx*/ const u32 kvm_cpuid_7_0_ecx_x86_features = - F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | + F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) | F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/; @@ -485,6 +485,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 0x80000008.ebx */ const u32 kvm_cpuid_8000_0008_ebx_x86_features = + F(CLZERO) | F(XSAVEERPTR) | F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON); @@ -618,16 +619,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, */ case 0x1f: case 0xb: { - int i, level_type; + int i; - /* read more entries until level_type is zero */ - for (i = 1; ; ++i) { + /* + * We filled in entry[0] for CPUID(EAX=<function>, + * ECX=00H) above. If its level type (ECX[15:8]) is + * zero, then the leaf is unimplemented, and we're + * done. Otherwise, continue to populate entries + * until the level type (ECX[15:8]) of the previously + * added entry is zero. + */ + for (i = 1; entry[i - 1].ecx & 0xff00; ++i) { if (*nent >= maxnent) goto out; - level_type = entry[i - 1].ecx & 0xff00; - if (!level_type) - break; do_host_cpuid(&entry[i], function, i); ++*nent; } @@ -969,53 +974,66 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); /* - * If no match is found, check whether we exceed the vCPU's limit - * and return the content of the highest valid _standard_ leaf instead. - * This is to satisfy the CPUID specification. + * If the basic or extended CPUID leaf requested is higher than the + * maximum supported basic or extended leaf, respectively, then it is + * out of range. */ -static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu, - u32 function, u32 index) +static bool cpuid_function_in_range(struct kvm_vcpu *vcpu, u32 function) { - struct kvm_cpuid_entry2 *maxlevel; - - maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); - if (!maxlevel || maxlevel->eax >= function) - return NULL; - if (function & 0x80000000) { - maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0); - if (!maxlevel) - return NULL; - } - return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); + struct kvm_cpuid_entry2 *max; + + max = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); + return max && function <= max->eax; } bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit) { u32 function = *eax, index = *ecx; - struct kvm_cpuid_entry2 *best; - bool entry_found = true; - - best = kvm_find_cpuid_entry(vcpu, function, index); - - if (!best) { - entry_found = false; - if (!check_limit) - goto out; + struct kvm_cpuid_entry2 *entry; + struct kvm_cpuid_entry2 *max; + bool found; - best = check_cpuid_limit(vcpu, function, index); + entry = kvm_find_cpuid_entry(vcpu, function, index); + found = entry; + /* + * Intel CPUID semantics treats any query for an out-of-range + * leaf as if the highest basic leaf (i.e. CPUID.0H:EAX) were + * requested. AMD CPUID semantics returns all zeroes for any + * undefined leaf, whether or not the leaf is in range. + */ + if (!entry && check_limit && !guest_cpuid_is_amd(vcpu) && + !cpuid_function_in_range(vcpu, function)) { + max = kvm_find_cpuid_entry(vcpu, 0, 0); + if (max) { + function = max->eax; + entry = kvm_find_cpuid_entry(vcpu, function, index); + } } - -out: - if (best) { - *eax = best->eax; - *ebx = best->ebx; - *ecx = best->ecx; - *edx = best->edx; - } else + if (entry) { + *eax = entry->eax; + *ebx = entry->ebx; + *ecx = entry->ecx; + *edx = entry->edx; + } else { *eax = *ebx = *ecx = *edx = 0; - trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, entry_found); - return entry_found; + /* + * When leaf 0BH or 1FH is defined, CL is pass-through + * and EDX is always the x2APIC ID, even for undefined + * subleaves. Index 1 will exist iff the leaf is + * implemented, so we pass through CL iff leaf 1 + * exists. EDX can be copied from any existing index. + */ + if (function == 0xb || function == 0x1f) { + entry = kvm_find_cpuid_entry(vcpu, function, 1); + if (entry) { + *ecx = index & 0xff; + *edx = entry->edx; + } + } + } + trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, found); + return found; } EXPORT_SYMBOL_GPL(kvm_cpuid); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3a3a6854dcca..b29d00b661ff 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -66,9 +66,10 @@ #define X2APIC_BROADCAST 0xFFFFFFFFul static bool lapic_timer_advance_dynamic __read_mostly; -#define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 -#define LAPIC_TIMER_ADVANCE_ADJUST_MAX 5000 -#define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000 +#define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 /* clock cycles */ +#define LAPIC_TIMER_ADVANCE_ADJUST_MAX 10000 /* clock cycles */ +#define LAPIC_TIMER_ADVANCE_NS_INIT 1000 +#define LAPIC_TIMER_ADVANCE_NS_MAX 5000 /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 @@ -110,11 +111,6 @@ static inline int apic_enabled(struct kvm_lapic *apic) (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) -static inline u8 kvm_xapic_id(struct kvm_lapic *apic) -{ - return kvm_lapic_get_reg(apic, APIC_ID) >> 24; -} - static inline u32 kvm_x2apic_id(struct kvm_lapic *apic) { return apic->vcpu->vcpu_id; @@ -1504,8 +1500,8 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu, timer_advance_ns += ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP; } - if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_ADJUST_MAX)) - timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; + if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_NS_MAX)) + timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT; apic->lapic_timer.timer_advance_ns = timer_advance_ns; } @@ -2302,7 +2298,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) HRTIMER_MODE_ABS_HARD); apic->lapic_timer.timer.function = apic_timer_fn; if (timer_advance_ns == -1) { - apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; + apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT; lapic_timer_advance_dynamic = true; } else { apic->lapic_timer.timer_advance_ns = timer_advance_ns; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 2aad7e226fc0..1f5014852e20 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -242,4 +242,9 @@ static inline enum lapic_mode kvm_apic_mode(u64 apic_base) return apic_base & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE); } +static inline u8 kvm_xapic_id(struct kvm_lapic *apic) +{ + return kvm_lapic_get_reg(apic, APIC_ID) >> 24; +} + #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5269aa057dfa..24c23c66b226 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -83,7 +83,17 @@ module_param(dbg, bool, 0644); #define PTE_PREFETCH_NUM 8 #define PT_FIRST_AVAIL_BITS_SHIFT 10 -#define PT64_SECOND_AVAIL_BITS_SHIFT 52 +#define PT64_SECOND_AVAIL_BITS_SHIFT 54 + +/* + * The mask used to denote special SPTEs, which can be either MMIO SPTEs or + * Access Tracking SPTEs. + */ +#define SPTE_SPECIAL_MASK (3ULL << 52) +#define SPTE_AD_ENABLED_MASK (0ULL << 52) +#define SPTE_AD_DISABLED_MASK (1ULL << 52) +#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52) +#define SPTE_MMIO_MASK (3ULL << 52) #define PT64_LEVEL_BITS 9 @@ -219,12 +229,11 @@ static u64 __read_mostly shadow_present_mask; static u64 __read_mostly shadow_me_mask; /* - * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value. - * Non-present SPTEs with shadow_acc_track_value set are in place for access - * tracking. + * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK; + * shadow_acc_track_mask is the set of bits to be cleared in non-accessed + * pages. */ static u64 __read_mostly shadow_acc_track_mask; -static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK; /* * The mask/shift to use for saving the original R/X bits when marking the PTE @@ -304,7 +313,7 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask) { BUG_ON((u64)(unsigned)access_mask != access_mask); BUG_ON((mmio_mask & mmio_value) != mmio_value); - shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK; + shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK; shadow_mmio_access_mask = access_mask; } @@ -320,10 +329,27 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) return sp->role.ad_disabled; } +static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) +{ + /* + * When using the EPT page-modification log, the GPAs in the log + * would come from L2 rather than L1. Therefore, we need to rely + * on write protection to record dirty pages. This also bypasses + * PML, since writes now result in a vmexit. + */ + return vcpu->arch.mmu == &vcpu->arch.guest_mmu; +} + static inline bool spte_ad_enabled(u64 spte) { MMU_WARN_ON(is_mmio_spte(spte)); - return !(spte & shadow_acc_track_value); + return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK; +} + +static inline bool spte_ad_need_write_protect(u64 spte) +{ + MMU_WARN_ON(is_mmio_spte(spte)); + return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK; } static inline u64 spte_shadow_accessed_mask(u64 spte) @@ -461,7 +487,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, { BUG_ON(!dirty_mask != !accessed_mask); BUG_ON(!accessed_mask && !acc_track_mask); - BUG_ON(acc_track_mask & shadow_acc_track_value); + BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK); shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; @@ -1589,16 +1615,16 @@ static bool spte_clear_dirty(u64 *sptep) rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep); + MMU_WARN_ON(!spte_ad_enabled(spte)); spte &= ~shadow_dirty_mask; - return mmu_spte_update(sptep, spte); } -static bool wrprot_ad_disabled_spte(u64 *sptep) +static bool spte_wrprot_for_clear_dirty(u64 *sptep) { bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT, (unsigned long *)sptep); - if (was_writable) + if (was_writable && !spte_ad_enabled(*sptep)) kvm_set_pfn_dirty(spte_to_pfn(*sptep)); return was_writable; @@ -1617,10 +1643,10 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) - if (spte_ad_enabled(*sptep)) - flush |= spte_clear_dirty(sptep); + if (spte_ad_need_write_protect(*sptep)) + flush |= spte_wrprot_for_clear_dirty(sptep); else - flush |= wrprot_ad_disabled_spte(sptep); + flush |= spte_clear_dirty(sptep); return flush; } @@ -1631,6 +1657,11 @@ static bool spte_set_dirty(u64 *sptep) rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep); + /* + * Similar to the !kvm_x86_ops->slot_disable_log_dirty case, + * do not bother adding back write access to pages marked + * SPTE_AD_WRPROT_ONLY_MASK. + */ spte |= shadow_dirty_mask; return mmu_spte_update(sptep, spte); @@ -2622,7 +2653,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, shadow_user_mask | shadow_x_mask | shadow_me_mask; if (sp_ad_disabled(sp)) - spte |= shadow_acc_track_value; + spte |= SPTE_AD_DISABLED_MASK; else spte |= shadow_accessed_mask; @@ -2968,7 +2999,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, sp = page_header(__pa(sptep)); if (sp_ad_disabled(sp)) - spte |= shadow_acc_track_value; + spte |= SPTE_AD_DISABLED_MASK; + else if (kvm_vcpu_ad_need_write_protect(vcpu)) + spte |= SPTE_AD_WRPROT_ONLY_MASK; /* * For the EPT case, shadow_present_mask is 0 if hardware diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f8ecb6df5106..c5673bda4b66 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -734,8 +734,14 @@ static int get_npt_level(struct kvm_vcpu *vcpu) static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { vcpu->arch.efer = efer; - if (!npt_enabled && !(efer & EFER_LMA)) - efer &= ~EFER_LME; + + if (!npt_enabled) { + /* Shadow paging assumes NX to be available. */ + efer |= EFER_NX; + + if (!(efer & EFER_LMA)) + efer &= ~EFER_LME; + } to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); @@ -4591,6 +4597,7 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) int ret = 0; struct vcpu_svm *svm = to_svm(vcpu); u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR); + u32 id = kvm_xapic_id(vcpu->arch.apic); if (ldr == svm->ldr_reg) return 0; @@ -4598,7 +4605,7 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) avic_invalidate_logical_id_entry(vcpu); if (ldr) - ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr); + ret = avic_ldr_write(vcpu, id, ldr); if (!ret) svm->ldr_reg = ldr; @@ -4610,8 +4617,7 @@ static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu) { u64 *old, *new; struct vcpu_svm *svm = to_svm(vcpu); - u32 apic_id_reg = kvm_lapic_get_reg(vcpu->arch.apic, APIC_ID); - u32 id = (apic_id_reg >> 24) & 0xff; + u32 id = kvm_xapic_id(vcpu->arch.apic); if (vcpu->vcpu_id == id) return 0; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 41abc62c9a8a..0e7c9301fe86 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2610,7 +2610,7 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, /* VM-entry exception error code */ if (CC(has_error_code && - vmcs12->vm_entry_exception_error_code & GENMASK(31, 15))) + vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) return -EINVAL; /* VM-entry interruption-info field: reserved bits */ @@ -2917,7 +2917,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12); -static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) +static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2937,19 +2937,18 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) vmx->nested.apic_access_page = NULL; } page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); - /* - * If translation failed, no matter: This feature asks - * to exit when accessing the given address, and if it - * can never be accessed, this feature won't do - * anything anyway. - */ if (!is_error_page(page)) { vmx->nested.apic_access_page = page; hpa = page_to_phys(vmx->nested.apic_access_page); vmcs_write64(APIC_ACCESS_ADDR, hpa); } else { - secondary_exec_controls_clearbit(vmx, - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); + pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n", + __func__); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = + KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return false; } } @@ -2994,6 +2993,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); else exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); + return true; } /* @@ -3032,13 +3032,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, /* * If from_vmentry is false, this is being called from state restore (either RSM * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. -+ * -+ * Returns: -+ * 0 - success, i.e. proceed with actual VMEnter -+ * 1 - consistency check VMExit -+ * -1 - consistency check VMFail + * + * Returns: + * NVMX_ENTRY_SUCCESS: Entered VMX non-root mode + * NVMX_ENTRY_VMFAIL: Consistency check VMFail + * NVMX_ENTRY_VMEXIT: Consistency check VMExit + * NVMX_ENTRY_KVM_INTERNAL_ERROR: KVM internal error */ -int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) +enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, + bool from_vmentry) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -3081,11 +3083,12 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) prepare_vmcs02_early(vmx, vmcs12); if (from_vmentry) { - nested_get_vmcs12_pages(vcpu); + if (unlikely(!nested_get_vmcs12_pages(vcpu))) + return NVMX_VMENTRY_KVM_INTERNAL_ERROR; if (nested_vmx_check_vmentry_hw(vcpu)) { vmx_switch_vmcs(vcpu, &vmx->vmcs01); - return -1; + return NVMX_VMENTRY_VMFAIL; } if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual)) @@ -3149,7 +3152,7 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) * returned as far as L1 is concerned. It will only return (and set * the success flag) when L2 exits (see nested_vmx_vmexit()). */ - return 0; + return NVMX_VMENTRY_SUCCESS; /* * A failed consistency check that leads to a VMExit during L1's @@ -3165,14 +3168,14 @@ vmentry_fail_vmexit: vmx_switch_vmcs(vcpu, &vmx->vmcs01); if (!from_vmentry) - return 1; + return NVMX_VMENTRY_VMEXIT; load_vmcs12_host_state(vcpu, vmcs12); vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY; vmcs12->exit_qualification = exit_qual; if (enable_shadow_vmcs || vmx->nested.hv_evmcs) vmx->nested.need_vmcs12_to_shadow_sync = true; - return 1; + return NVMX_VMENTRY_VMEXIT; } /* @@ -3182,9 +3185,9 @@ vmentry_fail_vmexit: static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) { struct vmcs12 *vmcs12; + enum nvmx_vmentry_status status; struct vcpu_vmx *vmx = to_vmx(vcpu); u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); - int ret; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -3244,13 +3247,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * the nested entry. */ vmx->nested.nested_run_pending = 1; - ret = nested_vmx_enter_non_root_mode(vcpu, true); - vmx->nested.nested_run_pending = !ret; - if (ret > 0) - return 1; - else if (ret) - return nested_vmx_failValid(vcpu, - VMXERR_ENTRY_INVALID_CONTROL_FIELD); + status = nested_vmx_enter_non_root_mode(vcpu, true); + if (unlikely(status != NVMX_VMENTRY_SUCCESS)) + goto vmentry_failed; /* Hide L1D cache contents from the nested guest. */ vmx->vcpu.arch.l1tf_flush_l1d = true; @@ -3281,6 +3280,15 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return kvm_vcpu_halt(vcpu); } return 1; + +vmentry_failed: + vmx->nested.nested_run_pending = 0; + if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) + return 0; + if (status == NVMX_VMENTRY_VMEXIT) + return 1; + WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); + return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); } /* diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 187d39bf0bf1..6280f33e5fa6 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -6,6 +6,16 @@ #include "vmcs12.h" #include "vmx.h" +/* + * Status returned by nested_vmx_enter_non_root_mode(): + */ +enum nvmx_vmentry_status { + NVMX_VMENTRY_SUCCESS, /* Entered VMX non-root mode */ + NVMX_VMENTRY_VMFAIL, /* Consistency check VMFail */ + NVMX_VMENTRY_VMEXIT, /* Consistency check VMExit */ + NVMX_VMENTRY_KVM_INTERNAL_ERROR,/* KVM internal error */ +}; + void vmx_leave_nested(struct kvm_vcpu *vcpu); void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, bool apicv); @@ -13,7 +23,8 @@ void nested_vmx_hardware_unsetup(void); __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)); void nested_vmx_vcpu_setup(void); void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu); -int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry); +enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, + bool from_vmentry); bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason); void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, u32 exit_intr_info, unsigned long exit_qualification); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 4dea0e0e7e39..3e9c059099e9 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -262,6 +262,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static void intel_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct x86_pmu_capability x86_pmu; struct kvm_cpuid_entry2 *entry; union cpuid10_eax eax; union cpuid10_edx edx; @@ -283,8 +284,10 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (!pmu->version) return; + perf_get_x86_pmu_capability(&x86_pmu); + pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, - INTEL_PMC_MAX_GENERIC); + x86_pmu.num_counters_gp); pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; pmu->available_event_types = ~entry->ebx & ((1ull << eax.split.mask_length) - 1); @@ -294,7 +297,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) } else { pmu->nr_arch_fixed_counters = min_t(int, edx.split.num_counters_fixed, - INTEL_PMC_MAX_FIXED); + x86_pmu.num_counters_fixed); pmu->counter_bitmask[KVM_PMC_FIXED] = ((u64)1 << edx.split.bit_width_fixed) - 1; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d4575ffb3cec..5d21a4ab28cf 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -209,6 +209,11 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) struct page *page; unsigned int i; + if (!boot_cpu_has_bug(X86_BUG_L1TF)) { + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; + return 0; + } + if (!enable_ept) { l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; return 0; @@ -964,17 +969,9 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) u64 guest_efer = vmx->vcpu.arch.efer; u64 ignore_bits = 0; - if (!enable_ept) { - /* - * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing - * host CPUID is more efficient than testing guest CPUID - * or CR4. Host SMEP is anyway a requirement for guest SMEP. - */ - if (boot_cpu_has(X86_FEATURE_SMEP)) - guest_efer |= EFER_NX; - else if (!(guest_efer & EFER_NX)) - ignore_bits |= EFER_NX; - } + /* Shadow paging assumes NX to be available. */ + if (!enable_ept) + guest_efer |= EFER_NX; /* * LMA and LME handled by hardware; SCE meaningless outside long mode. @@ -5538,14 +5535,6 @@ static int handle_encls(struct kvm_vcpu *vcpu) return 1; } -static int handle_unexpected_vmexit(struct kvm_vcpu *vcpu) -{ - kvm_skip_emulated_instruction(vcpu); - WARN_ONCE(1, "Unexpected VM-Exit Reason = 0x%x", - vmcs_read32(VM_EXIT_REASON)); - return 1; -} - /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -5597,15 +5586,11 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_INVVPID] = handle_vmx_instruction, [EXIT_REASON_RDRAND] = handle_invalid_op, [EXIT_REASON_RDSEED] = handle_invalid_op, - [EXIT_REASON_XSAVES] = handle_unexpected_vmexit, - [EXIT_REASON_XRSTORS] = handle_unexpected_vmexit, [EXIT_REASON_PML_FULL] = handle_pml_full, [EXIT_REASON_INVPCID] = handle_invpcid, [EXIT_REASON_VMFUNC] = handle_vmx_instruction, [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, [EXIT_REASON_ENCLS] = handle_encls, - [EXIT_REASON_UMWAIT] = handle_unexpected_vmexit, - [EXIT_REASON_TPAUSE] = handle_unexpected_vmexit, }; static const int kvm_vmx_max_exit_handlers = @@ -7995,12 +7980,10 @@ static int __init vmx_init(void) * contain 'auto' which will be turned into the default 'cond' * mitigation mode. */ - if (boot_cpu_has(X86_BUG_L1TF)) { - r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); - if (r) { - vmx_exit(); - return r; - } + r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); + if (r) { + vmx_exit(); + return r; } #ifdef CONFIG_KEXEC_CORE diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0ed07d8d2caa..ff395f812719 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -92,8 +92,8 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #endif -#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM -#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU +#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ +#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) @@ -212,7 +212,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, { "mmu_unsync", VM_STAT(mmu_unsync) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, - { "largepages", VM_STAT(lpages) }, + { "largepages", VM_STAT(lpages, .mode = 0444) }, { "max_mmu_page_hash_collisions", VM_STAT(max_mmu_page_hash_collisions) }, { NULL } @@ -360,8 +360,7 @@ EXPORT_SYMBOL_GPL(kvm_set_apic_base); asmlinkage __visible void kvm_spurious_fault(void) { /* Fault while not rebooting. We want the trace. */ - if (!kvm_rebooting) - BUG(); + BUG_ON(!kvm_rebooting); } EXPORT_SYMBOL_GPL(kvm_spurious_fault); @@ -885,34 +884,42 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) } EXPORT_SYMBOL_GPL(kvm_set_xcr); -int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long old_cr4 = kvm_read_cr4(vcpu); - unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | - X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE; - if (cr4 & CR4_RESERVED_BITS) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP)) + return -EINVAL; + + return 0; +} + +int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + unsigned long old_cr4 = kvm_read_cr4(vcpu); + unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | + X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE; + + if (kvm_valid_cr4(vcpu, cr4)) return 1; if (is_long_mode(vcpu)) { @@ -1161,13 +1168,6 @@ static u32 msrs_to_save[] = { MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13, MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15, MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17, - MSR_ARCH_PERFMON_PERFCTR0 + 18, MSR_ARCH_PERFMON_PERFCTR0 + 19, - MSR_ARCH_PERFMON_PERFCTR0 + 20, MSR_ARCH_PERFMON_PERFCTR0 + 21, - MSR_ARCH_PERFMON_PERFCTR0 + 22, MSR_ARCH_PERFMON_PERFCTR0 + 23, - MSR_ARCH_PERFMON_PERFCTR0 + 24, MSR_ARCH_PERFMON_PERFCTR0 + 25, - MSR_ARCH_PERFMON_PERFCTR0 + 26, MSR_ARCH_PERFMON_PERFCTR0 + 27, - MSR_ARCH_PERFMON_PERFCTR0 + 28, MSR_ARCH_PERFMON_PERFCTR0 + 29, - MSR_ARCH_PERFMON_PERFCTR0 + 30, MSR_ARCH_PERFMON_PERFCTR0 + 31, MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, @@ -1177,13 +1177,6 @@ static u32 msrs_to_save[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, - MSR_ARCH_PERFMON_EVENTSEL0 + 18, MSR_ARCH_PERFMON_EVENTSEL0 + 19, - MSR_ARCH_PERFMON_EVENTSEL0 + 20, MSR_ARCH_PERFMON_EVENTSEL0 + 21, - MSR_ARCH_PERFMON_EVENTSEL0 + 22, MSR_ARCH_PERFMON_EVENTSEL0 + 23, - MSR_ARCH_PERFMON_EVENTSEL0 + 24, MSR_ARCH_PERFMON_EVENTSEL0 + 25, - MSR_ARCH_PERFMON_EVENTSEL0 + 26, MSR_ARCH_PERFMON_EVENTSEL0 + 27, - MSR_ARCH_PERFMON_EVENTSEL0 + 28, MSR_ARCH_PERFMON_EVENTSEL0 + 29, - MSR_ARCH_PERFMON_EVENTSEL0 + 30, MSR_ARCH_PERFMON_EVENTSEL0 + 31, }; static unsigned num_msrs_to_save; @@ -2543,6 +2536,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) static void kvmclock_reset(struct kvm_vcpu *vcpu) { vcpu->arch.pv_time_enabled = false; + vcpu->arch.time = 0; } static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) @@ -2708,8 +2702,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_KVM_SYSTEM_TIME: { struct kvm_arch *ka = &vcpu->kvm->arch; - kvmclock_reset(vcpu); - if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) { bool tmp = (msr == MSR_KVM_SYSTEM_TIME); @@ -2723,14 +2715,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); /* we verify if the enable bit is set... */ + vcpu->arch.pv_time_enabled = false; if (!(data & 1)) break; - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, + if (!kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_time, data & ~1ULL, sizeof(struct pvclock_vcpu_time_info))) - vcpu->arch.pv_time_enabled = false; - else vcpu->arch.pv_time_enabled = true; break; @@ -5097,13 +5088,14 @@ out: static void kvm_init_msr_list(void) { + struct x86_pmu_capability x86_pmu; u32 dummy[2]; unsigned i, j; BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4, "Please update the fixed PMCs in msrs_to_save[]"); - BUILD_BUG_ON_MSG(INTEL_PMC_MAX_GENERIC != 32, - "Please update the generic perfctr/eventsel MSRs in msrs_to_save[]"); + + perf_get_x86_pmu_capability(&x86_pmu); for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) @@ -5145,6 +5137,15 @@ static void kvm_init_msr_list(void) intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) continue; break; + case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17: + if (msrs_to_save[i] - MSR_ARCH_PERFMON_PERFCTR0 >= + min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) + continue; + break; + case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17: + if (msrs_to_save[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= + min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) + continue; } default: break; @@ -7937,8 +7938,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_immediate_exit = false; if (kvm_request_pending(vcpu)) { - if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) - kvm_x86_ops->get_vmcs12_pages(vcpu); + if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) { + if (unlikely(!kvm_x86_ops->get_vmcs12_pages(vcpu))) { + r = 0; + goto out; + } + } if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) kvm_mmu_unload(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) @@ -8714,10 +8719,6 @@ EXPORT_SYMBOL_GPL(kvm_task_switch); static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { - if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - (sregs->cr4 & X86_CR4_OSXSAVE)) - return -EINVAL; - if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) { /* * When EFER.LME and CR0.PG are set, the processor is in @@ -8736,7 +8737,7 @@ static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) return -EINVAL; } - return 0; + return kvm_valid_cr4(vcpu, sregs->cr4); } static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) |