diff options
Diffstat (limited to 'arch/x86/kvm/vmx.c')
-rw-r--r-- | arch/x86/kvm/vmx.c | 1259 |
1 files changed, 830 insertions, 429 deletions
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 92496b9b5f2b..40aa29204baf 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -51,10 +51,12 @@ #include <asm/apic.h> #include <asm/irq_remapping.h> #include <asm/mmu_context.h> -#include <asm/nospec-branch.h> +#include <asm/spec-ctrl.h> +#include <asm/mshyperv.h> #include "trace.h" #include "pmu.h" +#include "vmx_evmcs.h" #define __ex(x) __kvm_handle_fault_on_reboot(x) #define __ex_clear(x, reg) \ @@ -130,13 +132,15 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); #endif #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) -#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) -#define KVM_VM_CR0_ALWAYS_ON \ - (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE +#define KVM_VM_CR0_ALWAYS_ON \ + (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \ + X86_CR0_WP | X86_CR0_PG | X86_CR0_PE) #define KVM_CR4_GUEST_OWNED_BITS \ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD) +#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) @@ -165,34 +169,33 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); * Time is measured based on a counter that runs at the same rate as the TSC, * refer SDM volume 3b section 21.6.13 & 22.1.3. */ -#define KVM_VMX_DEFAULT_PLE_GAP 128 -#define KVM_VMX_DEFAULT_PLE_WINDOW 4096 -#define KVM_VMX_DEFAULT_PLE_WINDOW_GROW 2 -#define KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK 0 -#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX \ - INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW +static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP; -static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; -module_param(ple_gap, int, S_IRUGO); - -static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; -module_param(ple_window, int, S_IRUGO); +static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; +module_param(ple_window, uint, 0444); /* Default doubles per-vcpu window every exit. */ -static int ple_window_grow = KVM_VMX_DEFAULT_PLE_WINDOW_GROW; -module_param(ple_window_grow, int, S_IRUGO); +static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW; +module_param(ple_window_grow, uint, 0444); /* Default resets per-vcpu window every exit to ple_window. */ -static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK; -module_param(ple_window_shrink, int, S_IRUGO); +static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK; +module_param(ple_window_shrink, uint, 0444); /* Default is to compute the maximum so we can never overflow. */ -static int ple_window_actual_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; -static int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; -module_param(ple_window_max, int, S_IRUGO); +static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; +module_param(ple_window_max, uint, 0444); extern const ulong vmx_return; +struct kvm_vmx { + struct kvm kvm; + + unsigned int tss_addr; + bool ept_identity_pagetable_done; + gpa_t ept_identity_map_addr; +}; + #define NR_AUTOLOAD_MSRS 8 struct vmcs { @@ -424,6 +427,35 @@ struct __packed vmcs12 { */ #define VMCS12_MAX_FIELD_INDEX 0x17 +struct nested_vmx_msrs { + /* + * We only store the "true" versions of the VMX capability MSRs. We + * generate the "non-true" versions by setting the must-be-1 bits + * according to the SDM. + */ + u32 procbased_ctls_low; + u32 procbased_ctls_high; + u32 secondary_ctls_low; + u32 secondary_ctls_high; + u32 pinbased_ctls_low; + u32 pinbased_ctls_high; + u32 exit_ctls_low; + u32 exit_ctls_high; + u32 entry_ctls_low; + u32 entry_ctls_high; + u32 misc_low; + u32 misc_high; + u32 ept_caps; + u32 vpid_caps; + u64 basic; + u64 cr0_fixed0; + u64 cr0_fixed1; + u64 cr4_fixed0; + u64 cr4_fixed1; + u64 vmcs_enum; + u64 vmfunc_controls; +}; + /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need * for correct emulation of VMX (i.e., nested VMX) on this vcpu. @@ -475,32 +507,7 @@ struct nested_vmx { u16 vpid02; u16 last_vpid; - /* - * We only store the "true" versions of the VMX capability MSRs. We - * generate the "non-true" versions by setting the must-be-1 bits - * according to the SDM. - */ - u32 nested_vmx_procbased_ctls_low; - u32 nested_vmx_procbased_ctls_high; - u32 nested_vmx_secondary_ctls_low; - u32 nested_vmx_secondary_ctls_high; - u32 nested_vmx_pinbased_ctls_low; - u32 nested_vmx_pinbased_ctls_high; - u32 nested_vmx_exit_ctls_low; - u32 nested_vmx_exit_ctls_high; - u32 nested_vmx_entry_ctls_low; - u32 nested_vmx_entry_ctls_high; - u32 nested_vmx_misc_low; - u32 nested_vmx_misc_high; - u32 nested_vmx_ept_caps; - u32 nested_vmx_vpid_caps; - u64 nested_vmx_basic; - u64 nested_vmx_cr0_fixed0; - u64 nested_vmx_cr0_fixed1; - u64 nested_vmx_cr4_fixed0; - u64 nested_vmx_cr4_fixed1; - u64 nested_vmx_vmcs_enum; - u64 nested_vmx_vmfunc_controls; + struct nested_vmx_msrs msrs; /* SMM related state */ struct { @@ -691,6 +698,11 @@ enum segment_cache_field { SEG_FIELD_NR = 4 }; +static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) +{ + return container_of(kvm, struct kvm_vmx, kvm); +} + static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) { return container_of(vcpu, struct vcpu_vmx, vcpu); @@ -953,6 +965,7 @@ static struct vmcs_config { u32 cpu_based_2nd_exec_ctrl; u32 vmexit_ctrl; u32 vmentry_ctrl; + struct nested_vmx_msrs nested; } vmcs_config; static struct vmx_capability { @@ -999,6 +1012,169 @@ static const u32 vmx_msr_index[] = { MSR_EFER, MSR_TSC_AUX, MSR_STAR, }; +DEFINE_STATIC_KEY_FALSE(enable_evmcs); + +#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs)) + +#define KVM_EVMCS_VERSION 1 + +#if IS_ENABLED(CONFIG_HYPERV) +static bool __read_mostly enlightened_vmcs = true; +module_param(enlightened_vmcs, bool, 0444); + +static inline void evmcs_write64(unsigned long field, u64 value) +{ + u16 clean_field; + int offset = get_evmcs_offset(field, &clean_field); + + if (offset < 0) + return; + + *(u64 *)((char *)current_evmcs + offset) = value; + + current_evmcs->hv_clean_fields &= ~clean_field; +} + +static inline void evmcs_write32(unsigned long field, u32 value) +{ + u16 clean_field; + int offset = get_evmcs_offset(field, &clean_field); + + if (offset < 0) + return; + + *(u32 *)((char *)current_evmcs + offset) = value; + current_evmcs->hv_clean_fields &= ~clean_field; +} + +static inline void evmcs_write16(unsigned long field, u16 value) +{ + u16 clean_field; + int offset = get_evmcs_offset(field, &clean_field); + + if (offset < 0) + return; + + *(u16 *)((char *)current_evmcs + offset) = value; + current_evmcs->hv_clean_fields &= ~clean_field; +} + +static inline u64 evmcs_read64(unsigned long field) +{ + int offset = get_evmcs_offset(field, NULL); + + if (offset < 0) + return 0; + + return *(u64 *)((char *)current_evmcs + offset); +} + +static inline u32 evmcs_read32(unsigned long field) +{ + int offset = get_evmcs_offset(field, NULL); + + if (offset < 0) + return 0; + + return *(u32 *)((char *)current_evmcs + offset); +} + +static inline u16 evmcs_read16(unsigned long field) +{ + int offset = get_evmcs_offset(field, NULL); + + if (offset < 0) + return 0; + + return *(u16 *)((char *)current_evmcs + offset); +} + +static void evmcs_load(u64 phys_addr) +{ + struct hv_vp_assist_page *vp_ap = + hv_get_vp_assist_page(smp_processor_id()); + + vp_ap->current_nested_vmcs = phys_addr; + vp_ap->enlighten_vmentry = 1; +} + +static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) +{ + /* + * Enlightened VMCSv1 doesn't support these: + * + * POSTED_INTR_NV = 0x00000002, + * GUEST_INTR_STATUS = 0x00000810, + * APIC_ACCESS_ADDR = 0x00002014, + * POSTED_INTR_DESC_ADDR = 0x00002016, + * EOI_EXIT_BITMAP0 = 0x0000201c, + * EOI_EXIT_BITMAP1 = 0x0000201e, + * EOI_EXIT_BITMAP2 = 0x00002020, + * EOI_EXIT_BITMAP3 = 0x00002022, + */ + vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; + vmcs_conf->cpu_based_2nd_exec_ctrl &= + ~SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; + vmcs_conf->cpu_based_2nd_exec_ctrl &= + ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + vmcs_conf->cpu_based_2nd_exec_ctrl &= + ~SECONDARY_EXEC_APIC_REGISTER_VIRT; + + /* + * GUEST_PML_INDEX = 0x00000812, + * PML_ADDRESS = 0x0000200e, + */ + vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_PML; + + /* VM_FUNCTION_CONTROL = 0x00002018, */ + vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_VMFUNC; + + /* + * EPTP_LIST_ADDRESS = 0x00002024, + * VMREAD_BITMAP = 0x00002026, + * VMWRITE_BITMAP = 0x00002028, + */ + vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_SHADOW_VMCS; + + /* + * TSC_MULTIPLIER = 0x00002032, + */ + vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_TSC_SCALING; + + /* + * PLE_GAP = 0x00004020, + * PLE_WINDOW = 0x00004022, + */ + vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; + + /* + * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, + */ + vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + + /* + * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808, + * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04, + */ + vmcs_conf->vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + vmcs_conf->vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + + /* + * Currently unsupported in KVM: + * GUEST_IA32_RTIT_CTL = 0x00002814, + */ +} +#else /* !IS_ENABLED(CONFIG_HYPERV) */ +static inline void evmcs_write64(unsigned long field, u64 value) {} +static inline void evmcs_write32(unsigned long field, u32 value) {} +static inline void evmcs_write16(unsigned long field, u16 value) {} +static inline u64 evmcs_read64(unsigned long field) { return 0; } +static inline u32 evmcs_read32(unsigned long field) { return 0; } +static inline u16 evmcs_read16(unsigned long field) { return 0; } +static inline void evmcs_load(u64 phys_addr) {} +static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {} +#endif /* IS_ENABLED(CONFIG_HYPERV) */ + static inline bool is_exception_n(u32 intr_info, u8 vector) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | @@ -1031,6 +1207,11 @@ static inline bool is_invalid_opcode(u32 intr_info) return is_exception_n(intr_info, UD_VECTOR); } +static inline bool is_gp_fault(u32 intr_info) +{ + return is_exception_n(intr_info, GP_VECTOR); +} + static inline bool is_external_interrupt(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -1313,6 +1494,12 @@ static inline bool cpu_has_vmx_vmfunc(void) SECONDARY_EXEC_ENABLE_VMFUNC; } +static bool vmx_umip_emulated(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_DESC; +} + static inline bool report_flexpriority(void) { return flexpriority_enabled; @@ -1320,7 +1507,7 @@ static inline bool report_flexpriority(void) static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu) { - return vmx_misc_cr3_count(to_vmx(vcpu)->nested.nested_vmx_misc_low); + return vmx_misc_cr3_count(to_vmx(vcpu)->nested.msrs.misc_low); } static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) @@ -1341,6 +1528,16 @@ static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12) PIN_BASED_VMX_PREEMPTION_TIMER; } +static inline bool nested_cpu_has_nmi_exiting(struct vmcs12 *vmcs12) +{ + return vmcs12->pin_based_vm_exec_control & PIN_BASED_NMI_EXITING; +} + +static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) +{ + return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; +} + static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); @@ -1479,6 +1676,9 @@ static void vmcs_load(struct vmcs *vmcs) u64 phys_addr = __pa(vmcs); u8 error; + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_load(phys_addr); + asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) : "cc", "memory"); @@ -1652,18 +1852,24 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) static __always_inline u16 vmcs_read16(unsigned long field) { vmcs_check16(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_read16(field); return __vmcs_readl(field); } static __always_inline u32 vmcs_read32(unsigned long field) { vmcs_check32(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_read32(field); return __vmcs_readl(field); } static __always_inline u64 vmcs_read64(unsigned long field) { vmcs_check64(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_read64(field); #ifdef CONFIG_X86_64 return __vmcs_readl(field); #else @@ -1674,6 +1880,8 @@ static __always_inline u64 vmcs_read64(unsigned long field) static __always_inline unsigned long vmcs_readl(unsigned long field) { vmcs_checkl(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_read64(field); return __vmcs_readl(field); } @@ -1697,18 +1905,27 @@ static __always_inline void __vmcs_writel(unsigned long field, unsigned long val static __always_inline void vmcs_write16(unsigned long field, u16 value) { vmcs_check16(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write16(field, value); + __vmcs_writel(field, value); } static __always_inline void vmcs_write32(unsigned long field, u32 value) { vmcs_check32(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write32(field, value); + __vmcs_writel(field, value); } static __always_inline void vmcs_write64(unsigned long field, u64 value) { vmcs_check64(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write64(field, value); + __vmcs_writel(field, value); #ifndef CONFIG_X86_64 asm volatile (""); @@ -1719,6 +1936,9 @@ static __always_inline void vmcs_write64(unsigned long field, u64 value) static __always_inline void vmcs_writel(unsigned long field, unsigned long value) { vmcs_checkl(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write64(field, value); + __vmcs_writel(field, value); } @@ -1726,6 +1946,9 @@ static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask) { BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, "vmcs_clear_bits does not support 64-bit fields"); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write32(field, evmcs_read32(field) & ~mask); + __vmcs_writel(field, __vmcs_readl(field) & ~mask); } @@ -1733,6 +1956,9 @@ static __always_inline void vmcs_set_bits(unsigned long field, u32 mask) { BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, "vmcs_set_bits does not support 64-bit fields"); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write32(field, evmcs_read32(field) | mask); + __vmcs_writel(field, __vmcs_readl(field) | mask); } @@ -1864,6 +2090,14 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR); + /* + * Guest access to VMware backdoor ports could legitimately + * trigger #GP because of TSS I/O permission bitmap. + * We intercept those #GP and allow access to them anyway + * as VMware does. + */ + if (enable_vmware_backdoor) + eb |= (1u << GP_VECTOR); if ((vcpu->guest_debug & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) @@ -2129,6 +2363,9 @@ static unsigned long segment_base(u16 selector) static void vmx_save_host_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); +#ifdef CONFIG_X86_64 + int cpu = raw_smp_processor_id(); +#endif int i; if (vmx->host_state.loaded) @@ -2141,7 +2378,15 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) */ vmx->host_state.ldt_sel = kvm_read_ldt(); vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; + +#ifdef CONFIG_X86_64 + save_fsgs_for_kvm(); + vmx->host_state.fs_sel = current->thread.fsindex; + vmx->host_state.gs_sel = current->thread.gsindex; +#else savesegment(fs, vmx->host_state.fs_sel); + savesegment(gs, vmx->host_state.gs_sel); +#endif if (!(vmx->host_state.fs_sel & 7)) { vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); vmx->host_state.fs_reload_needed = 0; @@ -2149,7 +2394,6 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) vmcs_write16(HOST_FS_SELECTOR, 0); vmx->host_state.fs_reload_needed = 1; } - savesegment(gs, vmx->host_state.gs_sel); if (!(vmx->host_state.gs_sel & 7)) vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); else { @@ -2160,20 +2404,16 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 savesegment(ds, vmx->host_state.ds_sel); savesegment(es, vmx->host_state.es_sel); -#endif -#ifdef CONFIG_X86_64 - vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); - vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); -#else - vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); - vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); -#endif + vmcs_writel(HOST_FS_BASE, current->thread.fsbase); + vmcs_writel(HOST_GS_BASE, cpu_kernelmode_gs_base(cpu)); -#ifdef CONFIG_X86_64 - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); + vmx->msr_host_kernel_gs_base = current->thread.gsbase; if (is_long_mode(&vmx->vcpu)) wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); +#else + vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); + vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); #endif if (boot_cpu_has(X86_FEATURE_MPX)) rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); @@ -2532,6 +2772,19 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit return 0; } +static void vmx_clear_hlt(struct kvm_vcpu *vcpu) +{ + /* + * Ensure that we clear the HLT state in the VMCS. We don't need to + * explicitly skip the instruction because if the HLT state is set, + * then the instruction is already executing and RIP has already been + * advanced. + */ + if (kvm_hlt_in_guest(vcpu->kvm) && + vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) + vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); +} + static void vmx_queue_exception(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2554,6 +2807,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu) return; } + WARN_ON_ONCE(vmx->emulation_required); + if (kvm_exception_is_soft(nr)) { vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, vmx->vcpu.arch.event_exit_inst_len); @@ -2562,6 +2817,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu) intr_info |= INTR_TYPE_HARD_EXCEPTION; vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); + + vmx_clear_hlt(vcpu); } static bool vmx_rdtscp_supported(void) @@ -2629,18 +2886,15 @@ static void setup_msrs(struct vcpu_vmx *vmx) vmx_update_msr_bitmap(&vmx->vcpu); } -/* - * reads and returns guest's timestamp counter "register" - * guest_tsc = (host_tsc * tsc multiplier) >> 48 + tsc_offset - * -- Intel TSC Scaling for Virtualization White Paper, sec 1.3 - */ -static u64 guest_read_tsc(struct kvm_vcpu *vcpu) +static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu) { - u64 host_tsc, tsc_offset; + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - host_tsc = rdtsc(); - tsc_offset = vmcs_read64(TSC_OFFSET); - return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset; + if (is_guest_mode(vcpu) && + (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)) + return vcpu->arch.tsc_offset - vmcs12->tsc_offset; + + return vcpu->arch.tsc_offset; } /* @@ -2689,8 +2943,13 @@ static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) * bit in the high half is on if the corresponding bit in the control field * may be on. See also vmx_control_verify(). */ -static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) +static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) { + if (!nested) { + memset(msrs, 0, sizeof(*msrs)); + return; + } + /* * Note that as a general rule, the high half of the MSRs (bits in * the control fields which may be 1) should be initialized by the @@ -2708,70 +2967,68 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) /* pin-based controls */ rdmsr(MSR_IA32_VMX_PINBASED_CTLS, - vmx->nested.nested_vmx_pinbased_ctls_low, - vmx->nested.nested_vmx_pinbased_ctls_high); - vmx->nested.nested_vmx_pinbased_ctls_low |= + msrs->pinbased_ctls_low, + msrs->pinbased_ctls_high); + msrs->pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; - vmx->nested.nested_vmx_pinbased_ctls_high &= + msrs->pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | - PIN_BASED_VIRTUAL_NMIS; - vmx->nested.nested_vmx_pinbased_ctls_high |= + PIN_BASED_VIRTUAL_NMIS | + (apicv ? PIN_BASED_POSTED_INTR : 0); + msrs->pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; - if (kvm_vcpu_apicv_active(&vmx->vcpu)) - vmx->nested.nested_vmx_pinbased_ctls_high |= - PIN_BASED_POSTED_INTR; /* exit controls */ rdmsr(MSR_IA32_VMX_EXIT_CTLS, - vmx->nested.nested_vmx_exit_ctls_low, - vmx->nested.nested_vmx_exit_ctls_high); - vmx->nested.nested_vmx_exit_ctls_low = + msrs->exit_ctls_low, + msrs->exit_ctls_high); + msrs->exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; - vmx->nested.nested_vmx_exit_ctls_high &= + msrs->exit_ctls_high &= #ifdef CONFIG_X86_64 VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; - vmx->nested.nested_vmx_exit_ctls_high |= + msrs->exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; if (kvm_mpx_supported()) - vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; + msrs->exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; /* We support free control of debug control saving. */ - vmx->nested.nested_vmx_exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; + msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, - vmx->nested.nested_vmx_entry_ctls_low, - vmx->nested.nested_vmx_entry_ctls_high); - vmx->nested.nested_vmx_entry_ctls_low = + msrs->entry_ctls_low, + msrs->entry_ctls_high); + msrs->entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; - vmx->nested.nested_vmx_entry_ctls_high &= + msrs->entry_ctls_high &= #ifdef CONFIG_X86_64 VM_ENTRY_IA32E_MODE | #endif VM_ENTRY_LOAD_IA32_PAT; - vmx->nested.nested_vmx_entry_ctls_high |= + msrs->entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); if (kvm_mpx_supported()) - vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; + msrs->entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; /* We support free control of debug control loading. */ - vmx->nested.nested_vmx_entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; + msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, - vmx->nested.nested_vmx_procbased_ctls_low, - vmx->nested.nested_vmx_procbased_ctls_high); - vmx->nested.nested_vmx_procbased_ctls_low = + msrs->procbased_ctls_low, + msrs->procbased_ctls_high); + msrs->procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; - vmx->nested.nested_vmx_procbased_ctls_high &= + msrs->procbased_ctls_high &= CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | @@ -2791,12 +3048,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) * can use it to avoid exits to L1 - even when L0 runs L2 * without MSR bitmaps. */ - vmx->nested.nested_vmx_procbased_ctls_high |= + msrs->procbased_ctls_high |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | CPU_BASED_USE_MSR_BITMAPS; /* We support free control of CR3 access interception. */ - vmx->nested.nested_vmx_procbased_ctls_low &= + msrs->procbased_ctls_low &= ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); /* @@ -2804,10 +3061,10 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) * depend on CPUID bits, they are added later by vmx_cpuid_update. */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, - vmx->nested.nested_vmx_secondary_ctls_low, - vmx->nested.nested_vmx_secondary_ctls_high); - vmx->nested.nested_vmx_secondary_ctls_low = 0; - vmx->nested.nested_vmx_secondary_ctls_high &= + msrs->secondary_ctls_low, + msrs->secondary_ctls_high); + msrs->secondary_ctls_low = 0; + msrs->secondary_ctls_high &= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_DESC | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | @@ -2817,33 +3074,33 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ - vmx->nested.nested_vmx_secondary_ctls_high |= + msrs->secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; - vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | + msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT | VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; if (cpu_has_vmx_ept_execute_only()) - vmx->nested.nested_vmx_ept_caps |= + msrs->ept_caps |= VMX_EPT_EXECUTE_ONLY_BIT; - vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept; - vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | + msrs->ept_caps &= vmx_capability.ept; + msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | VMX_EPT_1GB_PAGE_BIT; if (enable_ept_ad_bits) { - vmx->nested.nested_vmx_secondary_ctls_high |= + msrs->secondary_ctls_high |= SECONDARY_EXEC_ENABLE_PML; - vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT; + msrs->ept_caps |= VMX_EPT_AD_BIT; } } if (cpu_has_vmx_vmfunc()) { - vmx->nested.nested_vmx_secondary_ctls_high |= + msrs->secondary_ctls_high |= SECONDARY_EXEC_ENABLE_VMFUNC; /* * Advertise EPTP switching unconditionally * since we emulate it */ if (enable_ept) - vmx->nested.nested_vmx_vmfunc_controls = + msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING; } @@ -2854,25 +3111,25 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) * not failing the single-context invvpid, and it is worse. */ if (enable_vpid) { - vmx->nested.nested_vmx_secondary_ctls_high |= + msrs->secondary_ctls_high |= SECONDARY_EXEC_ENABLE_VPID; - vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | + msrs->vpid_caps = VMX_VPID_INVVPID_BIT | VMX_VPID_EXTENT_SUPPORTED_MASK; } if (enable_unrestricted_guest) - vmx->nested.nested_vmx_secondary_ctls_high |= + msrs->secondary_ctls_high |= SECONDARY_EXEC_UNRESTRICTED_GUEST; /* miscellaneous data */ rdmsr(MSR_IA32_VMX_MISC, - vmx->nested.nested_vmx_misc_low, - vmx->nested.nested_vmx_misc_high); - vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; - vmx->nested.nested_vmx_misc_low |= + msrs->misc_low, + msrs->misc_high); + msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA; + msrs->misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | VMX_MISC_ACTIVITY_HLT; - vmx->nested.nested_vmx_misc_high = 0; + msrs->misc_high = 0; /* * This MSR reports some information about VMX support. We @@ -2880,14 +3137,14 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) * guest, and the VMCS structure we give it - not about the * VMX support of the underlying hardware. */ - vmx->nested.nested_vmx_basic = + msrs->basic = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS | ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); if (cpu_has_vmx_basic_inout()) - vmx->nested.nested_vmx_basic |= VMX_BASIC_INOUT; + msrs->basic |= VMX_BASIC_INOUT; /* * These MSRs specify bits which the guest must keep fixed on @@ -2896,15 +3153,15 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) */ #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) #define VMXON_CR4_ALWAYSON X86_CR4_VMXE - vmx->nested.nested_vmx_cr0_fixed0 = VMXON_CR0_ALWAYSON; - vmx->nested.nested_vmx_cr4_fixed0 = VMXON_CR4_ALWAYSON; + msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; + msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; /* These MSRs specify bits which the guest must keep fixed off. */ - rdmsrl(MSR_IA32_VMX_CR0_FIXED1, vmx->nested.nested_vmx_cr0_fixed1); - rdmsrl(MSR_IA32_VMX_CR4_FIXED1, vmx->nested.nested_vmx_cr4_fixed1); + rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); + rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); /* highest index: VMX_PREEMPTION_TIMER_VALUE */ - vmx->nested.nested_vmx_vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1; + msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1; } /* @@ -2941,7 +3198,7 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | /* reserved */ BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); - u64 vmx_basic = vmx->nested.nested_vmx_basic; + u64 vmx_basic = vmx->nested.msrs.basic; if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) return -EINVAL; @@ -2960,7 +3217,7 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) return -EINVAL; - vmx->nested.nested_vmx_basic = data; + vmx->nested.msrs.basic = data; return 0; } @@ -2972,24 +3229,24 @@ vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) switch (msr_index) { case MSR_IA32_VMX_TRUE_PINBASED_CTLS: - lowp = &vmx->nested.nested_vmx_pinbased_ctls_low; - highp = &vmx->nested.nested_vmx_pinbased_ctls_high; + lowp = &vmx->nested.msrs.pinbased_ctls_low; + highp = &vmx->nested.msrs.pinbased_ctls_high; break; case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: - lowp = &vmx->nested.nested_vmx_procbased_ctls_low; - highp = &vmx->nested.nested_vmx_procbased_ctls_high; + lowp = &vmx->nested.msrs.procbased_ctls_low; + highp = &vmx->nested.msrs.procbased_ctls_high; break; case MSR_IA32_VMX_TRUE_EXIT_CTLS: - lowp = &vmx->nested.nested_vmx_exit_ctls_low; - highp = &vmx->nested.nested_vmx_exit_ctls_high; + lowp = &vmx->nested.msrs.exit_ctls_low; + highp = &vmx->nested.msrs.exit_ctls_high; break; case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - lowp = &vmx->nested.nested_vmx_entry_ctls_low; - highp = &vmx->nested.nested_vmx_entry_ctls_high; + lowp = &vmx->nested.msrs.entry_ctls_low; + highp = &vmx->nested.msrs.entry_ctls_high; break; case MSR_IA32_VMX_PROCBASED_CTLS2: - lowp = &vmx->nested.nested_vmx_secondary_ctls_low; - highp = &vmx->nested.nested_vmx_secondary_ctls_high; + lowp = &vmx->nested.msrs.secondary_ctls_low; + highp = &vmx->nested.msrs.secondary_ctls_high; break; default: BUG(); @@ -3020,13 +3277,13 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) GENMASK_ULL(13, 9) | BIT_ULL(31); u64 vmx_misc; - vmx_misc = vmx_control_msr(vmx->nested.nested_vmx_misc_low, - vmx->nested.nested_vmx_misc_high); + vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, + vmx->nested.msrs.misc_high); if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) return -EINVAL; - if ((vmx->nested.nested_vmx_pinbased_ctls_high & + if ((vmx->nested.msrs.pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER) && vmx_misc_preemption_timer_rate(data) != vmx_misc_preemption_timer_rate(vmx_misc)) @@ -3041,8 +3298,8 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) return -EINVAL; - vmx->nested.nested_vmx_misc_low = data; - vmx->nested.nested_vmx_misc_high = data >> 32; + vmx->nested.msrs.misc_low = data; + vmx->nested.msrs.misc_high = data >> 32; return 0; } @@ -3050,15 +3307,15 @@ static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) { u64 vmx_ept_vpid_cap; - vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.nested_vmx_ept_caps, - vmx->nested.nested_vmx_vpid_caps); + vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps, + vmx->nested.msrs.vpid_caps); /* Every bit is either reserved or a feature bit. */ if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) return -EINVAL; - vmx->nested.nested_vmx_ept_caps = data; - vmx->nested.nested_vmx_vpid_caps = data >> 32; + vmx->nested.msrs.ept_caps = data; + vmx->nested.msrs.vpid_caps = data >> 32; return 0; } @@ -3068,10 +3325,10 @@ static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) switch (msr_index) { case MSR_IA32_VMX_CR0_FIXED0: - msr = &vmx->nested.nested_vmx_cr0_fixed0; + msr = &vmx->nested.msrs.cr0_fixed0; break; case MSR_IA32_VMX_CR4_FIXED0: - msr = &vmx->nested.nested_vmx_cr4_fixed0; + msr = &vmx->nested.msrs.cr4_fixed0; break; default: BUG(); @@ -3135,7 +3392,7 @@ static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) case MSR_IA32_VMX_EPT_VPID_CAP: return vmx_restore_vmx_ept_vpid_cap(vmx, data); case MSR_IA32_VMX_VMCS_ENUM: - vmx->nested.nested_vmx_vmcs_enum = data; + vmx->nested.msrs.vmcs_enum = data; return 0; default: /* @@ -3146,77 +3403,75 @@ static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) } /* Returns 0 on success, non-0 otherwise. */ -static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +static int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - switch (msr_index) { case MSR_IA32_VMX_BASIC: - *pdata = vmx->nested.nested_vmx_basic; + *pdata = msrs->basic; break; case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_PINBASED_CTLS: *pdata = vmx_control_msr( - vmx->nested.nested_vmx_pinbased_ctls_low, - vmx->nested.nested_vmx_pinbased_ctls_high); + msrs->pinbased_ctls_low, + msrs->pinbased_ctls_high); if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: case MSR_IA32_VMX_PROCBASED_CTLS: *pdata = vmx_control_msr( - vmx->nested.nested_vmx_procbased_ctls_low, - vmx->nested.nested_vmx_procbased_ctls_high); + msrs->procbased_ctls_low, + msrs->procbased_ctls_high); if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_EXIT_CTLS: case MSR_IA32_VMX_EXIT_CTLS: *pdata = vmx_control_msr( - vmx->nested.nested_vmx_exit_ctls_low, - vmx->nested.nested_vmx_exit_ctls_high); + msrs->exit_ctls_low, + msrs->exit_ctls_high); if (msr_index == MSR_IA32_VMX_EXIT_CTLS) *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_ENTRY_CTLS: case MSR_IA32_VMX_ENTRY_CTLS: *pdata = vmx_control_msr( - vmx->nested.nested_vmx_entry_ctls_low, - vmx->nested.nested_vmx_entry_ctls_high); + msrs->entry_ctls_low, + msrs->entry_ctls_high); if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_MISC: *pdata = vmx_control_msr( - vmx->nested.nested_vmx_misc_low, - vmx->nested.nested_vmx_misc_high); + msrs->misc_low, + msrs->misc_high); break; case MSR_IA32_VMX_CR0_FIXED0: - *pdata = vmx->nested.nested_vmx_cr0_fixed0; + *pdata = msrs->cr0_fixed0; break; case MSR_IA32_VMX_CR0_FIXED1: - *pdata = vmx->nested.nested_vmx_cr0_fixed1; + *pdata = msrs->cr0_fixed1; break; case MSR_IA32_VMX_CR4_FIXED0: - *pdata = vmx->nested.nested_vmx_cr4_fixed0; + *pdata = msrs->cr4_fixed0; break; case MSR_IA32_VMX_CR4_FIXED1: - *pdata = vmx->nested.nested_vmx_cr4_fixed1; + *pdata = msrs->cr4_fixed1; break; case MSR_IA32_VMX_VMCS_ENUM: - *pdata = vmx->nested.nested_vmx_vmcs_enum; + *pdata = msrs->vmcs_enum; break; case MSR_IA32_VMX_PROCBASED_CTLS2: *pdata = vmx_control_msr( - vmx->nested.nested_vmx_secondary_ctls_low, - vmx->nested.nested_vmx_secondary_ctls_high); + msrs->secondary_ctls_low, + msrs->secondary_ctls_high); break; case MSR_IA32_VMX_EPT_VPID_CAP: - *pdata = vmx->nested.nested_vmx_ept_caps | - ((u64)vmx->nested.nested_vmx_vpid_caps << 32); + *pdata = msrs->ept_caps | + ((u64)msrs->vpid_caps << 32); break; case MSR_IA32_VMX_VMFUNC: - *pdata = vmx->nested.nested_vmx_vmfunc_controls; + *pdata = msrs->vmfunc_controls; break; default: return 1; @@ -3235,7 +3490,16 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, static int vmx_get_msr_feature(struct kvm_msr_entry *msr) { - return 1; + switch (msr->index) { + case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + if (!nested) + return 1; + return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); + default: + return 1; + } + + return 0; } /* @@ -3263,12 +3527,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) #endif case MSR_EFER: return kvm_get_msr_common(vcpu, msr_info); - case MSR_IA32_TSC: - msr_info->data = guest_read_tsc(vcpu); - break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) && !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) return 1; @@ -3309,7 +3569,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: if (!nested_vmx_allowed(vcpu)) return 1; - return vmx_get_vmx_msr(vcpu, msr_info->index, &msr_info->data); + return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, + &msr_info->data); case MSR_IA32_XSS: if (!vmx_xsaves_supported()) return 1; @@ -3384,17 +3645,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vmcs_write64(GUEST_BNDCFGS, data); break; - case MSR_IA32_TSC: - kvm_write_tsc(vcpu, msr_info); - break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) && !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) return 1; /* The STIBP bit doesn't fault even if it's not advertised */ - if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP)) + if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD)) return 1; vmx->spec_ctrl = data; @@ -3420,7 +3677,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_PRED_CMD: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_IBPB) && !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) return 1; @@ -3602,6 +3858,14 @@ static int hardware_enable(void) if (cr4_read_shadow() & X86_CR4_VMXE) return -EBUSY; + /* + * This can happen if we hot-added a CPU but failed to allocate + * VP assist page for it. + */ + if (static_branch_unlikely(&enable_evmcs) && + !hv_get_vp_assist_page(cpu)) + return -EFAULT; + INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); @@ -3700,6 +3964,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) u32 _vmexit_control = 0; u32 _vmentry_control = 0; + memset(vmcs_conf, 0, sizeof(*vmcs_conf)); min = CPU_BASED_HLT_EXITING | #ifdef CONFIG_X86_64 CPU_BASED_CR8_LOAD_EXITING | @@ -3710,13 +3975,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_UNCOND_IO_EXITING | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_MWAIT_EXITING | + CPU_BASED_MONITOR_EXITING | CPU_BASED_INVLPG_EXITING | CPU_BASED_RDPMC_EXITING; - if (!kvm_mwait_in_guest()) - min |= CPU_BASED_MWAIT_EXITING | - CPU_BASED_MONITOR_EXITING; - opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; @@ -3835,7 +4098,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) vmcs_conf->size = vmx_msr_high & 0x1fff; vmcs_conf->order = get_order(vmcs_conf->size); vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; - vmcs_conf->revision_id = vmx_msr_low; + + /* KVM supports Enlightened VMCS v1 only */ + if (static_branch_unlikely(&enable_evmcs)) + vmcs_conf->revision_id = KVM_EVMCS_VERSION; + else + vmcs_conf->revision_id = vmx_msr_low; vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; @@ -3843,6 +4111,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) vmcs_conf->vmexit_ctrl = _vmexit_control; vmcs_conf->vmentry_ctrl = _vmentry_control; + if (static_branch_unlikely(&enable_evmcs)) + evmcs_sanitize_exec_ctrls(vmcs_conf); + cpu_has_load_ia32_efer = allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, VM_ENTRY_LOAD_IA32_EFER) @@ -4162,6 +4433,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) { unsigned long flags; struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); @@ -4177,13 +4449,13 @@ static void enter_rmode(struct kvm_vcpu *vcpu) * Very old userspace does not call KVM_SET_TSS_ADDR before entering * vcpu. Warn the user that an update is overdue. */ - if (!vcpu->kvm->arch.tss_addr) + if (!kvm_vmx->tss_addr) printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " "called before entering vcpu\n"); vmx_segment_cache_clear(vmx); - vmcs_writel(GUEST_TR_BASE, vcpu->kvm->arch.tss_addr); + vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr); vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); @@ -4275,12 +4547,6 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa); } -static void vmx_flush_tlb_ept_only(struct kvm_vcpu *vcpu) -{ - if (enable_ept) - vmx_flush_tlb(vcpu, true); -} - static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) { ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; @@ -4291,7 +4557,7 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) static void vmx_decache_cr3(struct kvm_vcpu *vcpu) { - if (enable_ept && is_paging(vcpu)) + if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu))) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); } @@ -4339,11 +4605,11 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu) static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) { - u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0; - u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1; + u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0; + u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1; struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high & + if (to_vmx(vcpu)->nested.msrs.secondary_ctls_high & SECONDARY_EXEC_UNRESTRICTED_GUEST && nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) fixed0 &= ~(X86_CR0_PE | X86_CR0_PG); @@ -4353,16 +4619,16 @@ static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) { - u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0; - u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1; + u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0; + u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1; return fixed_bits_valid(val, fixed0, fixed1); } static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) { - u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed0; - u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed1; + u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0; + u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1; return fixed_bits_valid(val, fixed0, fixed1); } @@ -4428,7 +4694,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } #endif - if (enable_ept) + if (enable_ept && !enable_unrestricted_guest) ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); vmcs_writel(CR0_READ_SHADOW, cr0); @@ -4469,10 +4735,11 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (enable_ept) { eptp = construct_eptp(vcpu, cr3); vmcs_write64(EPT_POINTER, eptp); - if (is_paging(vcpu) || is_guest_mode(vcpu)) + if (enable_unrestricted_guest || is_paging(vcpu) || + is_guest_mode(vcpu)) guest_cr3 = kvm_read_cr3(vcpu); else - guest_cr3 = vcpu->kvm->arch.ept_identity_map_addr; + guest_cr3 = to_kvm_vmx(vcpu->kvm)->ept_identity_map_addr; ept_load_pdptrs(vcpu); } @@ -4487,20 +4754,26 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) * is in force while we are in guest mode. Do not let guests control * this bit, even if host CR4.MCE == 0. */ - unsigned long hw_cr4 = - (cr4_read_shadow() & X86_CR4_MCE) | - (cr4 & ~X86_CR4_MCE) | - (to_vmx(vcpu)->rmode.vm86_active ? - KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); + unsigned long hw_cr4; - if ((cr4 & X86_CR4_UMIP) && !boot_cpu_has(X86_FEATURE_UMIP)) { - vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, - SECONDARY_EXEC_DESC); - hw_cr4 &= ~X86_CR4_UMIP; - } else if (!is_guest_mode(vcpu) || - !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); + if (enable_unrestricted_guest) + hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; + else if (to_vmx(vcpu)->rmode.vm86_active) + hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; + else + hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON; + + if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) { + if (cr4 & X86_CR4_UMIP) { + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_DESC); + hw_cr4 &= ~X86_CR4_UMIP; + } else if (!is_guest_mode(vcpu) || + !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_DESC); + } if (cr4 & X86_CR4_VMXE) { /* @@ -4517,16 +4790,17 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; vcpu->arch.cr4 = cr4; - if (enable_ept) { - if (!is_paging(vcpu)) { - hw_cr4 &= ~X86_CR4_PAE; - hw_cr4 |= X86_CR4_PSE; - } else if (!(cr4 & X86_CR4_PAE)) { - hw_cr4 &= ~X86_CR4_PAE; + + if (!enable_unrestricted_guest) { + if (enable_ept) { + if (!is_paging(vcpu)) { + hw_cr4 &= ~X86_CR4_PAE; + hw_cr4 |= X86_CR4_PSE; + } else if (!(cr4 & X86_CR4_PAE)) { + hw_cr4 &= ~X86_CR4_PAE; + } } - } - if (!enable_unrestricted_guest && !is_paging(vcpu)) /* * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in * hardware. To emulate this behavior, SMEP/SMAP/PKU needs @@ -4538,7 +4812,9 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) * If enable_unrestricted_guest, the CPU automatically * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0. */ - hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); + if (!is_paging(vcpu)) + hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); + } vmcs_writel(CR4_READ_SHADOW, cr4); vmcs_writel(GUEST_CR4, hw_cr4); @@ -4906,7 +5182,7 @@ static int init_rmode_tss(struct kvm *kvm) int idx, r; idx = srcu_read_lock(&kvm->srcu); - fn = kvm->arch.tss_addr >> PAGE_SHIFT; + fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT; r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); if (r < 0) goto out; @@ -4932,22 +5208,23 @@ out: static int init_rmode_identity_map(struct kvm *kvm) { + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); int i, idx, r = 0; kvm_pfn_t identity_map_pfn; u32 tmp; - /* Protect kvm->arch.ept_identity_pagetable_done. */ + /* Protect kvm_vmx->ept_identity_pagetable_done. */ mutex_lock(&kvm->slots_lock); - if (likely(kvm->arch.ept_identity_pagetable_done)) + if (likely(kvm_vmx->ept_identity_pagetable_done)) goto out2; - if (!kvm->arch.ept_identity_map_addr) - kvm->arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; - identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; + if (!kvm_vmx->ept_identity_map_addr) + kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; + identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT; r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, - kvm->arch.ept_identity_map_addr, PAGE_SIZE); + kvm_vmx->ept_identity_map_addr, PAGE_SIZE); if (r < 0) goto out2; @@ -4964,7 +5241,7 @@ static int init_rmode_identity_map(struct kvm *kvm) if (r < 0) goto out; } - kvm->arch.ept_identity_pagetable_done = true; + kvm_vmx->ept_identity_pagetable_done = true; out: srcu_read_unlock(&kvm->srcu, idx); @@ -5500,6 +5777,11 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) exec_control |= CPU_BASED_CR3_STORE_EXITING | CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_INVLPG_EXITING; + if (kvm_mwait_in_guest(vmx->vcpu.kvm)) + exec_control &= ~(CPU_BASED_MWAIT_EXITING | + CPU_BASED_MONITOR_EXITING); + if (kvm_hlt_in_guest(vmx->vcpu.kvm)) + exec_control &= ~CPU_BASED_HLT_EXITING; return exec_control; } @@ -5533,7 +5815,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) } if (!enable_unrestricted_guest) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; - if (!ple_gap) + if (kvm_pause_in_guest(vmx->vcpu.kvm)) exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; if (!kvm_vcpu_apicv_active(vcpu)) exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | @@ -5565,10 +5847,10 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (nested) { if (xsaves_enabled) - vmx->nested.nested_vmx_secondary_ctls_high |= + vmx->nested.msrs.secondary_ctls_high |= SECONDARY_EXEC_XSAVES; else - vmx->nested.nested_vmx_secondary_ctls_high &= + vmx->nested.msrs.secondary_ctls_high &= ~SECONDARY_EXEC_XSAVES; } } @@ -5580,10 +5862,10 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (nested) { if (rdtscp_enabled) - vmx->nested.nested_vmx_secondary_ctls_high |= + vmx->nested.msrs.secondary_ctls_high |= SECONDARY_EXEC_RDTSCP; else - vmx->nested.nested_vmx_secondary_ctls_high &= + vmx->nested.msrs.secondary_ctls_high &= ~SECONDARY_EXEC_RDTSCP; } } @@ -5601,10 +5883,10 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (nested) { if (invpcid_enabled) - vmx->nested.nested_vmx_secondary_ctls_high |= + vmx->nested.msrs.secondary_ctls_high |= SECONDARY_EXEC_ENABLE_INVPCID; else - vmx->nested.nested_vmx_secondary_ctls_high &= + vmx->nested.msrs.secondary_ctls_high &= ~SECONDARY_EXEC_ENABLE_INVPCID; } } @@ -5616,10 +5898,10 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (nested) { if (rdrand_enabled) - vmx->nested.nested_vmx_secondary_ctls_high |= + vmx->nested.msrs.secondary_ctls_high |= SECONDARY_EXEC_RDRAND_EXITING; else - vmx->nested.nested_vmx_secondary_ctls_high &= + vmx->nested.msrs.secondary_ctls_high &= ~SECONDARY_EXEC_RDRAND_EXITING; } } @@ -5631,10 +5913,10 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (nested) { if (rdseed_enabled) - vmx->nested.nested_vmx_secondary_ctls_high |= + vmx->nested.msrs.secondary_ctls_high |= SECONDARY_EXEC_RDSEED_EXITING; else - vmx->nested.nested_vmx_secondary_ctls_high &= + vmx->nested.msrs.secondary_ctls_high &= ~SECONDARY_EXEC_RDSEED_EXITING; } } @@ -5696,7 +5978,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); } - if (ple_gap) { + if (!kvm_pause_in_guest(vmx->vcpu.kvm)) { vmcs_write32(PLE_GAP, ple_gap); vmx->ple_window = ple_window; vmx->ple_window_dirty = true; @@ -5861,6 +6143,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) update_exception_bitmap(vcpu); vpid_sync_context(vmx->vpid); + if (init_event) + vmx_clear_hlt(vcpu); } /* @@ -5885,8 +6169,7 @@ static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) { - return get_vmcs12(vcpu)->pin_based_vm_exec_control & - PIN_BASED_NMI_EXITING; + return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu)); } static void enable_irq_window(struct kvm_vcpu *vcpu) @@ -5932,6 +6215,8 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) } else intr |= INTR_TYPE_EXT_INTR; vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); + + vmx_clear_hlt(vcpu); } static void vmx_inject_nmi(struct kvm_vcpu *vcpu) @@ -5962,6 +6247,8 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); + + vmx_clear_hlt(vcpu); } static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) @@ -6024,14 +6311,23 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) { int ret; + if (enable_unrestricted_guest) + return 0; + ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, PAGE_SIZE * 3); if (ret) return ret; - kvm->arch.tss_addr = addr; + to_kvm_vmx(kvm)->tss_addr = addr; return init_rmode_tss(kvm); } +static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) +{ + to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr; + return 0; +} + static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) { switch (vec) { @@ -6134,19 +6430,24 @@ static int handle_exception(struct kvm_vcpu *vcpu) if (is_nmi(intr_info)) return 1; /* already handled by vmx_vcpu_run() */ - if (is_invalid_opcode(intr_info)) { - er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD); - if (er == EMULATE_USER_EXIT) - return 0; - if (er != EMULATE_DONE) - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } + if (is_invalid_opcode(intr_info)) + return handle_ud(vcpu); error_code = 0; if (intr_info & INTR_INFO_DELIVER_CODE_MASK) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { + WARN_ON_ONCE(!enable_vmware_backdoor); + er = emulate_instruction(vcpu, + EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL); + if (er == EMULATE_USER_EXIT) + return 0; + else if (er != EMULATE_DONE) + kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); + return 1; + } + /* * The #PF with PFEC.RSVD = 1 indicates the guest is accessing * MMIO, it is better to report an internal error. @@ -6232,28 +6533,22 @@ static int handle_triple_fault(struct kvm_vcpu *vcpu) static int handle_io(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; - int size, in, string, ret; + int size, in, string; unsigned port; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); string = (exit_qualification & 16) != 0; - in = (exit_qualification & 8) != 0; ++vcpu->stat.io_exits; - if (string || in) + if (string) return emulate_instruction(vcpu, 0) == EMULATE_DONE; port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; + in = (exit_qualification & 8) != 0; - ret = kvm_skip_emulated_instruction(vcpu); - - /* - * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered - * KVM_EXIT_DEBUG here. - */ - return kvm_fast_pio_out(vcpu, size, port) && ret; + return kvm_fast_pio(vcpu, size, port, in); } static void @@ -6344,6 +6639,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) err = handle_set_cr0(vcpu, val); return kvm_complete_insn_gp(vcpu, err); case 3: + WARN_ON_ONCE(enable_unrestricted_guest); err = kvm_set_cr3(vcpu, val); return kvm_complete_insn_gp(vcpu, err); case 4: @@ -6376,6 +6672,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) case 1: /*mov from cr*/ switch (cr) { case 3: + WARN_ON_ONCE(enable_unrestricted_guest); val = kvm_read_cr3(vcpu); kvm_register_write(vcpu, reg, val); trace_kvm_cr_read(cr, val); @@ -6769,7 +7066,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) static int handle_ept_misconfig(struct kvm_vcpu *vcpu) { - int ret; gpa_t gpa; /* @@ -6797,17 +7093,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) NULL, 0) == EMULATE_DONE; } - ret = kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); - if (ret >= 0) - return ret; - - /* It is the real ept misconfig */ - WARN_ON(1); - - vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; - vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; - - return 0; + return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); } static int handle_nmi_window(struct kvm_vcpu *vcpu) @@ -6830,6 +7116,13 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) bool intr_window_requested; unsigned count = 130; + /* + * We should never reach the point where we are emulating L2 + * due to invalid guest state as that means we incorrectly + * allowed a nested VMEntry with an invalid vmcs12. + */ + WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending); + cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; @@ -6848,12 +7141,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) goto out; } - if (err != EMULATE_DONE) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - return 0; - } + if (err != EMULATE_DONE) + goto emulation_error; + + if (vmx->emulation_required && !vmx->rmode.vm86_active && + vcpu->arch.exception.pending) + goto emulation_error; if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; @@ -6869,34 +7162,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) out: return ret; -} - -static int __grow_ple_window(int val) -{ - if (ple_window_grow < 1) - return ple_window; - - val = min(val, ple_window_actual_max); - - if (ple_window_grow < ple_window) - val *= ple_window_grow; - else - val += ple_window_grow; - - return val; -} -static int __shrink_ple_window(int val, int modifier, int minimum) -{ - if (modifier < 1) - return ple_window; - - if (modifier < ple_window) - val /= modifier; - else - val -= modifier; - - return max(val, minimum); +emulation_error: + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; } static void grow_ple_window(struct kvm_vcpu *vcpu) @@ -6904,7 +7175,9 @@ static void grow_ple_window(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); int old = vmx->ple_window; - vmx->ple_window = __grow_ple_window(old); + vmx->ple_window = __grow_ple_window(old, ple_window, + ple_window_grow, + ple_window_max); if (vmx->ple_window != old) vmx->ple_window_dirty = true; @@ -6917,8 +7190,9 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); int old = vmx->ple_window; - vmx->ple_window = __shrink_ple_window(old, - ple_window_shrink, ple_window); + vmx->ple_window = __shrink_ple_window(old, ple_window, + ple_window_shrink, + ple_window); if (vmx->ple_window != old) vmx->ple_window_dirty = true; @@ -6927,21 +7201,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) } /* - * ple_window_actual_max is computed to be one grow_ple_window() below - * ple_window_max. (See __grow_ple_window for the reason.) - * This prevents overflows, because ple_window_max is int. - * ple_window_max effectively rounded down to a multiple of ple_window_grow in - * this process. - * ple_window_max is also prevented from setting vmx->ple_window < ple_window. - */ -static void update_ple_window_actual_max(void) -{ - ple_window_actual_max = - __shrink_ple_window(max(ple_window_max, ple_window), - ple_window_grow, INT_MIN); -} - -/* * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. */ static void wakeup_handler(void) @@ -6960,7 +7219,7 @@ static void wakeup_handler(void) spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); } -void vmx_enable_tdp(void) +static void vmx_enable_tdp(void) { kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull, @@ -7061,8 +7320,6 @@ static __init int hardware_setup(void) else kvm_disable_tdp(); - update_ple_window_actual_max(); - /* * Only enable PML when hardware supports PML feature, and both EPT * and EPT A/D bit features are enabled -- PML depends on them to work. @@ -7094,6 +7351,7 @@ static __init int hardware_setup(void) init_vmcs_shadow_fields(); kvm_set_posted_intr_wakeup_handler(wakeup_handler); + nested_vmx_setup_ctls_msrs(&vmcs_config.nested, enable_apicv); kvm_mce_cap_supported |= MCG_LMCE_P; @@ -7122,7 +7380,7 @@ static __exit void hardware_unsetup(void) */ static int handle_pause(struct kvm_vcpu *vcpu) { - if (ple_gap) + if (!kvm_pause_in_guest(vcpu->kvm)) grow_ple_window(vcpu); /* @@ -7954,9 +8212,9 @@ static int handle_invept(struct kvm_vcpu *vcpu) u64 eptp, gpa; } operand; - if (!(vmx->nested.nested_vmx_secondary_ctls_high & + if (!(vmx->nested.msrs.secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || - !(vmx->nested.nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { + !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -7967,7 +8225,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); - types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; + types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; if (type >= 32 || !(types & (1 << type))) { nested_vmx_failValid(vcpu, @@ -8018,9 +8276,9 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) u64 gla; } operand; - if (!(vmx->nested.nested_vmx_secondary_ctls_high & + if (!(vmx->nested.msrs.secondary_ctls_high & SECONDARY_EXEC_ENABLE_VPID) || - !(vmx->nested.nested_vmx_vpid_caps & VMX_VPID_INVVPID_BIT)) { + !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -8031,7 +8289,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); - types = (vmx->nested.nested_vmx_vpid_caps & + types = (vmx->nested.msrs.vpid_caps & VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; if (type >= 32 || !(types & (1 << type))) { @@ -8125,11 +8383,11 @@ static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address) /* Check for memory type validity */ switch (address & VMX_EPTP_MT_MASK) { case VMX_EPTP_MT_UC: - if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_UC_BIT)) + if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)) return false; break; case VMX_EPTP_MT_WB: - if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_WB_BIT)) + if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)) return false; break; default: @@ -8146,7 +8404,7 @@ static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address) /* AD, if set, should be supported */ if (address & VMX_EPTP_AD_ENABLE_BIT) { - if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPT_AD_BIT)) + if (!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)) return false; } @@ -8790,7 +9048,8 @@ static void dump_vmcs(void) pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", vmcs_read64(GUEST_IA32_DEBUGCTL), vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); - if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) + if (cpu_has_load_perf_global_ctrl && + vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) pr_err("PerfGlobCtl = 0x%016llx\n", vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) @@ -8826,7 +9085,8 @@ static void dump_vmcs(void) pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_EFER), vmcs_read64(HOST_IA32_PAT)); - if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) + if (cpu_has_load_perf_global_ctrl && + vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) pr_err("PerfGlobCtl = 0x%016llx\n", vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); @@ -9017,7 +9277,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) } else { sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - vmx_flush_tlb_ept_only(vcpu); + vmx_flush_tlb(vcpu, true); } vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); @@ -9045,7 +9305,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) !nested_cpu_has2(get_vmcs12(&vmx->vcpu), SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { vmcs_write64(APIC_ACCESS_ADDR, hpa); - vmx_flush_tlb_ept_only(vcpu); + vmx_flush_tlb(vcpu, true); } } @@ -9178,9 +9438,9 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) /* We need to handle NMIs before interrupts are enabled */ if (is_nmi(exit_intr_info)) { - kvm_before_handle_nmi(&vmx->vcpu); + kvm_before_interrupt(&vmx->vcpu); asm("int $2"); - kvm_after_handle_nmi(&vmx->vcpu); + kvm_after_interrupt(&vmx->vcpu); } } @@ -9225,9 +9485,21 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) } STACK_FRAME_NON_STANDARD(vmx_handle_external_intr); -static bool vmx_has_high_real_mode_segbase(void) +static bool vmx_has_emulated_msr(int index) { - return enable_unrestricted_guest || emulate_invalid_guest_state; + switch (index) { + case MSR_IA32_SMBASE: + /* + * We cannot do SMM unless we can run the guest in big + * real mode. + */ + return enable_unrestricted_guest || emulate_invalid_guest_state; + case MSR_AMD64_VIRT_SPEC_CTRL: + /* This is AMD only. */ + return false; + default: + return true; + } } static bool vmx_mpx_supported(void) @@ -9242,12 +9514,6 @@ static bool vmx_xsaves_supported(void) SECONDARY_EXEC_XSAVES; } -static bool vmx_umip_emulated(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_DESC; -} - static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -9403,7 +9669,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long cr3, cr4; + unsigned long cr3, cr4, evmcs_rsp; /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!enable_vnmi && @@ -9465,10 +9731,13 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) * is no need to worry about the conditional branch over the wrmsr * being speculatively taken. */ - if (vmx->spec_ctrl) - native_wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); + x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); vmx->__launched = vmx->loaded_vmcs->launched; + + evmcs_rsp = static_branch_unlikely(&enable_evmcs) ? + (unsigned long)¤t_evmcs->host_rsp : 0; + asm( /* Store host registers */ "push %%" _ASM_DX "; push %%" _ASM_BP ";" @@ -9477,15 +9746,21 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t" "je 1f \n\t" "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t" + /* Avoid VMWRITE when Enlightened VMCS is in use */ + "test %%" _ASM_SI ", %%" _ASM_SI " \n\t" + "jz 2f \n\t" + "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t" + "jmp 1f \n\t" + "2: \n\t" __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" "1: \n\t" /* Reload cr2 if changed */ "mov %c[cr2](%0), %%" _ASM_AX " \n\t" "mov %%cr2, %%" _ASM_DX " \n\t" "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t" - "je 2f \n\t" + "je 3f \n\t" "mov %%" _ASM_AX", %%cr2 \n\t" - "2: \n\t" + "3: \n\t" /* Check if vmlaunch of vmresume is needed */ "cmpl $0, %c[launched](%0) \n\t" /* Load guest registers. Don't clobber flags. */ @@ -9554,7 +9829,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) ".global vmx_return \n\t" "vmx_return: " _ASM_PTR " 2b \n\t" ".popsection" - : : "c"(vmx), "d"((unsigned long)HOST_RSP), + : : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp), [launched]"i"(offsetof(struct vcpu_vmx, __launched)), [fail]"i"(offsetof(struct vcpu_vmx, fail)), [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), @@ -9579,10 +9854,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) [wordsize]"i"(sizeof(ulong)) : "cc", "memory" #ifdef CONFIG_X86_64 - , "rax", "rbx", "rdi", "rsi" + , "rax", "rbx", "rdi" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" #else - , "eax", "ebx", "edi", "esi" + , "eax", "ebx", "edi" #endif ); @@ -9604,12 +9879,16 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); - if (vmx->spec_ctrl) - native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); + x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); /* Eliminate branch target predictions from guest mode */ vmexit_fill_RSB(); + /* All fields are clean at this point */ + if (static_branch_unlikely(&enable_evmcs)) + current_evmcs->hv_clean_fields |= + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ if (vmx->host_debugctlmsr) update_debugctlmsr(vmx->host_debugctlmsr); @@ -9646,14 +9925,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) __write_pkru(vmx->host_pkru); } - /* - * the KVM_REQ_EVENT optimization bit is only on for one entry, and if - * we did not inject a still-pending event to L1 now because of - * nested_run_pending, we need to re-enable this bit. - */ - if (vmx->nested.nested_run_pending) - kvm_make_request(KVM_REQ_EVENT, vcpu); - vmx->nested.nested_run_pending = 0; vmx->idt_vectoring_info = 0; @@ -9670,6 +9941,17 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) } STACK_FRAME_NON_STANDARD(vmx_vcpu_run); +static struct kvm *vmx_vm_alloc(void) +{ + struct kvm_vmx *kvm_vmx = kzalloc(sizeof(struct kvm_vmx), GFP_KERNEL); + return &kvm_vmx->kvm; +} + +static void vmx_vm_free(struct kvm *kvm) +{ + kfree(to_kvm_vmx(kvm)); +} + static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -9777,14 +10059,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_vmcs; } - if (enable_ept) { + if (enable_ept && !enable_unrestricted_guest) { err = init_rmode_identity_map(kvm); if (err) goto free_vmcs; } if (nested) { - nested_vmx_setup_ctls_msrs(vmx); + nested_vmx_setup_ctls_msrs(&vmx->nested.msrs, + kvm_vcpu_apicv_active(&vmx->vcpu)); vmx->nested.vpid02 = allocate_vpid(); } @@ -9817,6 +10100,13 @@ free_vcpu: return ERR_PTR(err); } +static int vmx_vm_init(struct kvm *kvm) +{ + if (!ple_gap) + kvm->arch.pause_in_guest = true; + return 0; +} + static void __init vmx_check_processor_compat(void *rtn) { struct vmcs_config vmcs_conf; @@ -9824,6 +10114,7 @@ static void __init vmx_check_processor_compat(void *rtn) *(int *)rtn = 0; if (setup_vmcs_config(&vmcs_conf) < 0) *(int *)rtn = -EIO; + nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, enable_apicv); if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", smp_processor_id()); @@ -9911,12 +10202,12 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_cpuid_entry2 *entry; - vmx->nested.nested_vmx_cr0_fixed1 = 0xffffffff; - vmx->nested.nested_vmx_cr4_fixed1 = X86_CR4_PCE; + vmx->nested.msrs.cr0_fixed1 = 0xffffffff; + vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE; #define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \ if (entry && (entry->_reg & (_cpuid_mask))) \ - vmx->nested.nested_vmx_cr4_fixed1 |= (_cr4_mask); \ + vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \ } while (0) entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); @@ -10013,7 +10304,7 @@ static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) kvm_mmu_unload(vcpu); kvm_init_shadow_ept_mmu(vcpu, - to_vmx(vcpu)->nested.nested_vmx_ept_caps & + to_vmx(vcpu)->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT, nested_ept_ad_enabled(vcpu)); vcpu->arch.mmu.set_cr3 = vmx_set_cr3; @@ -10311,6 +10602,16 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, return true; } +static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && + !page_address_valid(vcpu, vmcs12->apic_access_addr)) + return -EINVAL; + else + return 0; +} + static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -10879,11 +11180,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); } - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) - vmcs_write64(TSC_OFFSET, - vcpu->arch.tsc_offset + vmcs12->tsc_offset); - else - vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); + vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); + if (kvm_has_tsc_control) decache_tsc_multiplier(vmx); @@ -10925,7 +11223,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } } else if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { - vmx_flush_tlb_ept_only(vcpu); + vmx_flush_tlb(vcpu, true); } /* @@ -10952,6 +11250,16 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ vmx_set_efer(vcpu, vcpu->arch.efer); + /* + * Guest state is invalid and unrestricted guest is disabled, + * which means L1 attempted VMEntry to L2 with invalid state. + * Fail the VMEntry. + */ + if (vmx->emulation_required) { + *entry_failure_code = ENTRY_FAIL_DEFAULT; + return 1; + } + /* Shadow page tables on either EPT or shadow page tables. */ if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), entry_failure_code)) @@ -10965,6 +11273,19 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, return 0; } +static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has_nmi_exiting(vmcs12) && + nested_cpu_has_virtual_nmis(vmcs12)) + return -EINVAL; + + if (!nested_cpu_has_virtual_nmis(vmcs12) && + nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING)) + return -EINVAL; + + return 0; +} + static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -10979,6 +11300,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_apic_access_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; @@ -10992,26 +11316,29 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, - vmx->nested.nested_vmx_procbased_ctls_low, - vmx->nested.nested_vmx_procbased_ctls_high) || + vmx->nested.msrs.procbased_ctls_low, + vmx->nested.msrs.procbased_ctls_high) || (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && !vmx_control_verify(vmcs12->secondary_vm_exec_control, - vmx->nested.nested_vmx_secondary_ctls_low, - vmx->nested.nested_vmx_secondary_ctls_high)) || + vmx->nested.msrs.secondary_ctls_low, + vmx->nested.msrs.secondary_ctls_high)) || !vmx_control_verify(vmcs12->pin_based_vm_exec_control, - vmx->nested.nested_vmx_pinbased_ctls_low, - vmx->nested.nested_vmx_pinbased_ctls_high) || + vmx->nested.msrs.pinbased_ctls_low, + vmx->nested.msrs.pinbased_ctls_high) || !vmx_control_verify(vmcs12->vm_exit_controls, - vmx->nested.nested_vmx_exit_ctls_low, - vmx->nested.nested_vmx_exit_ctls_high) || + vmx->nested.msrs.exit_ctls_low, + vmx->nested.msrs.exit_ctls_high) || !vmx_control_verify(vmcs12->vm_entry_controls, - vmx->nested.nested_vmx_entry_ctls_low, - vmx->nested.nested_vmx_entry_ctls_high)) + vmx->nested.msrs.entry_ctls_low, + vmx->nested.msrs.entry_ctls_high)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_vmx_check_nmi_controls(vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; if (nested_cpu_has_vmfunc(vmcs12)) { if (vmcs12->vm_function_control & - ~vmx->nested.nested_vmx_vmfunc_controls) + ~vmx->nested.msrs.vmfunc_controls) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; if (nested_cpu_has_eptp_switching(vmcs12)) { @@ -11097,6 +11424,7 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) struct vmcs12 *vmcs12 = get_vmcs12(vcpu); u32 msr_entry_idx; u32 exit_qual; + int r; enter_guest_mode(vcpu); @@ -11106,26 +11434,21 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); vmx_segment_cache_clear(vmx); - if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) { - leave_guest_mode(vcpu); - vmx_switch_vmcs(vcpu, &vmx->vmcs01); - nested_vmx_entry_failure(vcpu, vmcs12, - EXIT_REASON_INVALID_STATE, exit_qual); - return 1; - } + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) + vcpu->arch.tsc_offset += vmcs12->tsc_offset; + + r = EXIT_REASON_INVALID_STATE; + if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) + goto fail; nested_get_vmcs12_pages(vcpu, vmcs12); + r = EXIT_REASON_MSR_LOAD_FAIL; msr_entry_idx = nested_vmx_load_msr(vcpu, vmcs12->vm_entry_msr_load_addr, vmcs12->vm_entry_msr_load_count); - if (msr_entry_idx) { - leave_guest_mode(vcpu); - vmx_switch_vmcs(vcpu, &vmx->vmcs01); - nested_vmx_entry_failure(vcpu, vmcs12, - EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx); - return 1; - } + if (msr_entry_idx) + goto fail; /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point @@ -11134,6 +11457,14 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) * the success flag) when L2 exits (see nested_vmx_vmexit()). */ return 0; + +fail: + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) + vcpu->arch.tsc_offset -= vmcs12->tsc_offset; + leave_guest_mode(vcpu); + vmx_switch_vmcs(vcpu, &vmx->vmcs01); + nested_vmx_entry_failure(vcpu, vmcs12, r, exit_qual); + return 1; } /* @@ -11293,7 +11624,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, } else if (vcpu->arch.nmi_injected) { vmcs12->idt_vectoring_info_field = INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; - } else if (vcpu->arch.interrupt.pending) { + } else if (vcpu->arch.interrupt.injected) { nr = vcpu->arch.interrupt.nr; idt_vectoring = nr | VECTORING_INFO_VALID_MASK; @@ -11705,6 +12036,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, leave_guest_mode(vcpu); + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) + vcpu->arch.tsc_offset -= vmcs12->tsc_offset; + if (likely(!vmx->fail)) { if (exit_reason == -1) sync_vmcs12(vcpu, vmcs12); @@ -11742,7 +12076,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, } else if (!nested_cpu_has_ept(vmcs12) && nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { - vmx_flush_tlb_ept_only(vcpu); + vmx_flush_tlb(vcpu, true); } /* This is needed for same reason as it was needed in prepare_vmcs02 */ @@ -11901,10 +12235,16 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift, static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 tscl = rdtsc(); - u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl); - u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; + struct vcpu_vmx *vmx; + u64 tscl, guest_tscl, delta_tsc; + + if (kvm_mwait_in_guest(vcpu->kvm)) + return -EOPNOTSUPP; + + vmx = to_vmx(vcpu); + tscl = rdtsc(); + guest_tscl = kvm_read_l1_tsc(vcpu, tscl); + delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; /* Convert to host delta tsc if tsc scaling is enabled */ if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && @@ -11941,7 +12281,7 @@ static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) { - if (ple_gap) + if (!kvm_pause_in_guest(vcpu->kvm)) shrink_ple_window(vcpu); } @@ -12210,7 +12550,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); vcpu_info.vector = irq.vector; - trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi, + trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, vcpu_info.vector, vcpu_info.pi_desc_addr, set); if (set) @@ -12259,6 +12599,7 @@ static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) vmx->nested.smm.vmxon = vmx->nested.vmxon; vmx->nested.vmxon = false; + vmx_clear_hlt(vcpu); return 0; } @@ -12298,7 +12639,11 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .hardware_enable = hardware_enable, .hardware_disable = hardware_disable, .cpu_has_accelerated_tpr = report_flexpriority, - .cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase, + .has_emulated_msr = vmx_has_emulated_msr, + + .vm_init = vmx_vm_init, + .vm_alloc = vmx_vm_alloc, + .vm_free = vmx_vm_free, .vcpu_create = vmx_create_vcpu, .vcpu_free = vmx_free_vcpu, @@ -12367,6 +12712,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .deliver_posted_interrupt = vmx_deliver_posted_interrupt, .set_tss_addr = vmx_set_tss_addr, + .set_identity_map_addr = vmx_set_identity_map_addr, .get_tdp_level = get_ept_level, .get_mt_mask = vmx_get_mt_mask, @@ -12383,6 +12729,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, + .read_l1_tsc_offset = vmx_read_l1_tsc_offset, .write_tsc_offset = vmx_write_tsc_offset, .set_tdp_cr3 = vmx_set_cr3, @@ -12425,7 +12772,38 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { static int __init vmx_init(void) { - int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), + int r; + +#if IS_ENABLED(CONFIG_HYPERV) + /* + * Enlightened VMCS usage should be recommended and the host needs + * to support eVMCS v1 or above. We can also disable eVMCS support + * with module parameter. + */ + if (enlightened_vmcs && + ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && + (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= + KVM_EVMCS_VERSION) { + int cpu; + + /* Check that we have assist pages on all online CPUs */ + for_each_online_cpu(cpu) { + if (!hv_get_vp_assist_page(cpu)) { + enlightened_vmcs = false; + break; + } + } + + if (enlightened_vmcs) { + pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n"); + static_branch_enable(&enable_evmcs); + } + } else { + enlightened_vmcs = false; + } +#endif + + r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), THIS_MODULE); if (r) return r; @@ -12446,6 +12824,29 @@ static void __exit vmx_exit(void) #endif kvm_exit(); + +#if IS_ENABLED(CONFIG_HYPERV) + if (static_branch_unlikely(&enable_evmcs)) { + int cpu; + struct hv_vp_assist_page *vp_ap; + /* + * Reset everything to support using non-enlightened VMCS + * access later (e.g. when we reload the module with + * enlightened_vmcs=0) + */ + for_each_online_cpu(cpu) { + vp_ap = hv_get_vp_assist_page(cpu); + + if (!vp_ap) + continue; + + vp_ap->current_nested_vmcs = 0; + vp_ap->enlighten_vmentry = 0; + } + + static_branch_disable(&enable_evmcs); + } +#endif } module_init(vmx_init) |