diff options
Diffstat (limited to 'arch/x86/kvm/vmx.c')
-rw-r--r-- | arch/x86/kvm/vmx.c | 347 |
1 files changed, 235 insertions, 112 deletions
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a06f101ef64b..1f68c5831924 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -31,6 +31,7 @@ #include <linux/ftrace_event.h> #include <linux/slab.h> #include <linux/tboot.h> +#include <linux/hrtimer.h> #include "kvm_cache_regs.h" #include "x86.h" @@ -42,6 +43,7 @@ #include <asm/i387.h> #include <asm/xcr.h> #include <asm/perf_event.h> +#include <asm/debugreg.h> #include <asm/kexec.h> #include "trace.h" @@ -110,6 +112,8 @@ module_param(nested, bool, S_IRUGO); #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) +#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 + /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive @@ -202,6 +206,7 @@ struct __packed vmcs12 { u64 guest_pdptr1; u64 guest_pdptr2; u64 guest_pdptr3; + u64 guest_bndcfgs; u64 host_ia32_pat; u64 host_ia32_efer; u64 host_ia32_perf_global_ctrl; @@ -374,6 +379,9 @@ struct nested_vmx { */ struct page *apic_access_page; u64 msr_ia32_feature_control; + + struct hrtimer preemption_timer; + bool preemption_timer_expired; }; #define POSTED_INTR_ON 0 @@ -441,6 +449,7 @@ struct vcpu_vmx { #endif int gs_ldt_reload_needed; int fs_reload_needed; + u64 msr_host_bndcfgs; } host_state; struct { int vm86_active; @@ -533,6 +542,7 @@ static const unsigned long shadow_read_write_fields[] = { GUEST_CS_LIMIT, GUEST_CS_BASE, GUEST_ES_BASE, + GUEST_BNDCFGS, CR0_GUEST_HOST_MASK, CR0_READ_SHADOW, CR4_READ_SHADOW, @@ -588,6 +598,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD64(GUEST_PDPTR1, guest_pdptr1), FIELD64(GUEST_PDPTR2, guest_pdptr2), FIELD64(GUEST_PDPTR3, guest_pdptr3), + FIELD64(GUEST_BNDCFGS, guest_bndcfgs), FIELD64(HOST_IA32_PAT, host_ia32_pat), FIELD64(HOST_IA32_EFER, host_ia32_efer), FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl), @@ -718,6 +729,7 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); +static bool vmx_mpx_supported(void); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -728,6 +740,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var); static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu); static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); +static bool vmx_mpx_supported(void); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -1047,6 +1060,12 @@ static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; } +static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12) +{ + return vmcs12->pin_based_vm_exec_control & + PIN_BASED_VMX_PREEMPTION_TIMER; +} + static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); @@ -1710,6 +1729,8 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) if (is_long_mode(&vmx->vcpu)) wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif + if (boot_cpu_has(X86_FEATURE_MPX)) + rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); for (i = 0; i < vmx->save_nmsrs; ++i) kvm_set_shared_msr(vmx->guest_msrs[i].index, vmx->guest_msrs[i].data, @@ -1747,6 +1768,8 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif + if (vmx->host_state.msr_host_bndcfgs) + wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); /* * If the FPU is not active (through the host task or * the guest vcpu), then restore the cr0.TS bit. @@ -2248,9 +2271,9 @@ static __init void nested_vmx_setup_ctls_msrs(void) */ nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | - PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS | + PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS; + nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; - nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; /* * Exit controls @@ -2265,15 +2288,12 @@ static __init void nested_vmx_setup_ctls_msrs(void) #ifdef CONFIG_X86_64 VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif - VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | + VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; + nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | + VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; - if (!(nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER) || - !(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) { - nested_vmx_exit_ctls_high &= ~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; - nested_vmx_pinbased_ctls_high &= ~PIN_BASED_VMX_PREEMPTION_TIMER; - } - nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | - VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER); + if (vmx_mpx_supported()) + nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, @@ -2287,6 +2307,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) VM_ENTRY_LOAD_IA32_PAT; nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); + if (vmx_mpx_supported()) + nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, @@ -2342,9 +2364,9 @@ static __init void nested_vmx_setup_ctls_msrs(void) /* miscellaneous data */ rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); - nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK | - VMX_MISC_SAVE_EFER_LMA; - nested_vmx_misc_low |= VMX_MISC_ACTIVITY_HLT; + nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; + nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | + VMX_MISC_ACTIVITY_HLT; nested_vmx_misc_high = 0; } @@ -2479,6 +2501,11 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) case MSR_IA32_SYSENTER_ESP: data = vmcs_readl(GUEST_SYSENTER_ESP); break; + case MSR_IA32_BNDCFGS: + if (!vmx_mpx_supported()) + return 1; + data = vmcs_read64(GUEST_BNDCFGS); + break; case MSR_IA32_FEATURE_CONTROL: if (!nested_vmx_allowed(vcpu)) return 1; @@ -2547,6 +2574,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_SYSENTER_ESP: vmcs_writel(GUEST_SYSENTER_ESP, data); break; + case MSR_IA32_BNDCFGS: + if (!vmx_mpx_supported()) + return 1; + vmcs_write64(GUEST_BNDCFGS, data); + break; case MSR_IA32_TSC: kvm_write_tsc(vcpu, msr_info); break; @@ -2832,12 +2864,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) vmx_capability.ept, vmx_capability.vpid); } - min = 0; + min = VM_EXIT_SAVE_DEBUG_CONTROLS; #ifdef CONFIG_X86_64 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | - VM_EXIT_ACK_INTR_ON_EXIT; + VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, &_vmexit_control) < 0) return -EIO; @@ -2853,8 +2885,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT)) _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; - min = 0; - opt = VM_ENTRY_LOAD_IA32_PAT; + min = VM_ENTRY_LOAD_DEBUG_CONTROLS; + opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, &_vmentry_control) < 0) return -EIO; @@ -3452,13 +3484,14 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) hw_cr4 &= ~X86_CR4_PAE; hw_cr4 |= X86_CR4_PSE; /* - * SMEP is disabled if CPU is in non-paging mode in - * hardware. However KVM always uses paging mode to + * SMEP/SMAP is disabled if CPU is in non-paging mode + * in hardware. However KVM always uses paging mode to * emulate guest non-paging mode with TDP. - * To emulate this behavior, SMEP needs to be manually - * disabled when guest switches to non-paging mode. + * To emulate this behavior, SMEP/SMAP needs to be + * manually disabled when guest switches to non-paging + * mode. */ - hw_cr4 &= ~X86_CR4_SMEP; + hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); } else if (!(cr4 & X86_CR4_PAE)) { hw_cr4 &= ~X86_CR4_PAE; } @@ -4223,6 +4256,10 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) static u32 vmx_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_exec_ctrl; + + if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) + exec_control &= ~CPU_BASED_MOV_DR_EXITING; + if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { exec_control &= ~CPU_BASED_TPR_SHADOW; #ifdef CONFIG_X86_64 @@ -4496,39 +4533,28 @@ static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) PIN_BASED_NMI_EXITING; } -static int enable_irq_window(struct kvm_vcpu *vcpu) +static void enable_irq_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; - if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) - /* - * We get here if vmx_interrupt_allowed() said we can't - * inject to L1 now because L2 must run. The caller will have - * to make L2 exit right after entry, so we can inject to L1 - * more promptly. - */ - return -EBUSY; - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); - return 0; } -static int enable_nmi_window(struct kvm_vcpu *vcpu) +static void enable_nmi_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; - if (!cpu_has_virtual_nmis()) - return enable_irq_window(vcpu); - - if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) - return enable_irq_window(vcpu); + if (!cpu_has_virtual_nmis() || + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { + enable_irq_window(vcpu); + return; + } cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); - return 0; } static void vmx_inject_irq(struct kvm_vcpu *vcpu) @@ -4620,22 +4646,8 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu)) { - if (to_vmx(vcpu)->nested.nested_run_pending) - return 0; - if (nested_exit_on_nmi(vcpu)) { - nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, - NMI_VECTOR | INTR_TYPE_NMI_INTR | - INTR_INFO_VALID_MASK, 0); - /* - * The NMI-triggered VM exit counts as injection: - * clear this one and block further NMIs. - */ - vcpu->arch.nmi_pending = 0; - vmx_set_nmi_mask(vcpu, true); - return 0; - } - } + if (to_vmx(vcpu)->nested.nested_run_pending) + return 0; if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) return 0; @@ -4647,19 +4659,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu)) { - if (to_vmx(vcpu)->nested.nested_run_pending) - return 0; - if (nested_exit_on_intr(vcpu)) { - nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, - 0, 0); - /* - * fall through to normal code, but now in L1, not L2 - */ - } - } - - return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && + return (!to_vmx(vcpu)->nested.nested_run_pending && + vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); } @@ -5102,6 +5103,22 @@ static int handle_dr(struct kvm_vcpu *vcpu) } } + if (vcpu->guest_debug == 0) { + u32 cpu_based_vm_exec_control; + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + + /* + * No more DR vmexits; force a reload of the debug registers + * and reenter on this instruction. The next vmexit will + * retrieve the full state of the debug registers. + */ + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; + return 1; + } + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); dr = exit_qualification & DEBUG_REG_ACCESS_NUM; reg = DEBUG_REG_ACCESS_REG(exit_qualification); @@ -5128,6 +5145,24 @@ static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) { } +static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) +{ + u32 cpu_based_vm_exec_control; + + get_debugreg(vcpu->arch.db[0], 0); + get_debugreg(vcpu->arch.db[1], 1); + get_debugreg(vcpu->arch.db[2], 2); + get_debugreg(vcpu->arch.db[3], 3); + get_debugreg(vcpu->arch.dr6, 6); + vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); + + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); +} + static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) { vmcs_writel(GUEST_DR7, val); @@ -5727,6 +5762,18 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu, */ } +static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) +{ + struct vcpu_vmx *vmx = + container_of(timer, struct vcpu_vmx, nested.preemption_timer); + + vmx->nested.preemption_timer_expired = true; + kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); + kvm_vcpu_kick(&vmx->vcpu); + + return HRTIMER_NORESTART; +} + /* * Emulate the VMXON instruction. * Currently, we just remember that VMX is active, and do not save or even @@ -5791,6 +5838,10 @@ static int handle_vmon(struct kvm_vcpu *vcpu) INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); vmx->nested.vmcs02_num = 0; + hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; + vmx->nested.vmxon = true; skip_emulated_instruction(vcpu); @@ -6688,7 +6739,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) else if (is_page_fault(intr_info)) return enable_ept; else if (is_no_device(intr_info) && - !(nested_read_cr0(vmcs12) & X86_CR0_TS)) + !(vmcs12->guest_cr0 & X86_CR0_TS)) return 0; return vmcs12->exception_bitmap & (1u << (intr_info & INTR_INFO_VECTOR_MASK)); @@ -6767,9 +6818,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) * table is L0's fault. */ return 0; - case EXIT_REASON_PREEMPTION_TIMER: - return vmcs12->pin_based_vm_exec_control & - PIN_BASED_VMX_PREEMPTION_TIMER; case EXIT_REASON_WBINVD: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); case EXIT_REASON_XSETBV: @@ -6785,27 +6833,6 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) *info2 = vmcs_read32(VM_EXIT_INTR_INFO); } -static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu) -{ - u64 delta_tsc_l1; - u32 preempt_val_l1, preempt_val_l2, preempt_scale; - - if (!(get_vmcs12(vcpu)->pin_based_vm_exec_control & - PIN_BASED_VMX_PREEMPTION_TIMER)) - return; - preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) & - MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE; - preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE); - delta_tsc_l1 = vmx_read_l1_tsc(vcpu, native_read_tsc()) - - vcpu->arch.last_guest_tsc; - preempt_val_l1 = delta_tsc_l1 >> preempt_scale; - if (preempt_val_l2 <= preempt_val_l1) - preempt_val_l2 = 0; - else - preempt_val_l2 -= preempt_val_l1; - vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2); -} - /* * The guest has exited. See if we can fix it or if we need userspace * assistance. @@ -7052,6 +7079,12 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) local_irq_enable(); } +static bool vmx_mpx_supported(void) +{ + return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && + (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); +} + static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -7218,8 +7251,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) atomic_switch_perf_msrs(vmx); debugctlmsr = get_debugctlmsr(); - if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) - nested_adjust_preemption_timer(vcpu); vmx->__launched = vmx->loaded_vmcs->launched; asm( /* Store host registers */ @@ -7616,6 +7647,28 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, kvm_inject_page_fault(vcpu, fault); } +static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) +{ + u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (vcpu->arch.virtual_tsc_khz == 0) + return; + + /* Make sure short timeouts reliably trigger an immediate vmexit. + * hrtimer_start does not guarantee this. */ + if (preemption_timeout <= 1) { + vmx_preemption_timer_fn(&vmx->nested.preemption_timer); + return; + } + + preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; + preemption_timeout *= 1000000; + do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); + hrtimer_start(&vmx->nested.preemption_timer, + ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); +} + /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it @@ -7629,7 +7682,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exec_control; - u32 exit_control; vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); @@ -7687,13 +7739,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write64(VMCS_LINK_POINTER, -1ull); - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, - (vmcs_config.pin_based_exec_ctrl | - vmcs12->pin_based_vm_exec_control)); + exec_control = vmcs12->pin_based_vm_exec_control; + exec_control |= vmcs_config.pin_based_exec_ctrl; + exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); - if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) - vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, - vmcs12->vmx_preemption_timer_value); + vmx->nested.preemption_timer_expired = false; + if (nested_cpu_has_preemption_timer(vmcs12)) + vmx_start_preemption_timer(vcpu); /* * Whether page-faults are trapped is determined by a combination of @@ -7721,7 +7774,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) enable_ept ? vmcs12->page_fault_error_code_match : 0); if (cpu_has_secondary_exec_ctrls()) { - u32 exec_control = vmx_secondary_exec_control(vmx); + exec_control = vmx_secondary_exec_control(vmx); if (!vmx->rdtscp_enabled) exec_control &= ~SECONDARY_EXEC_RDTSCP; /* Take the following fields only from vmcs12 */ @@ -7808,10 +7861,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER * bits are further modified by vmx_set_efer() below. */ - exit_control = vmcs_config.vmexit_ctrl; - if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) - exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; - vm_exit_controls_init(vmx, exit_control); + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are * emulated by vmx_set_efer(), below. @@ -7830,6 +7880,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) set_cr4_guest_host_mask(vmx); + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) + vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); @@ -8155,6 +8208,58 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, } } +static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && + vmx->nested.preemption_timer_expired) { + if (vmx->nested.nested_run_pending) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); + return 0; + } + + if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { + if (vmx->nested.nested_run_pending || + vcpu->arch.interrupt.pending) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, + NMI_VECTOR | INTR_TYPE_NMI_INTR | + INTR_INFO_VALID_MASK, 0); + /* + * The NMI-triggered VM exit counts as injection: + * clear this one and block further NMIs. + */ + vcpu->arch.nmi_pending = 0; + vmx_set_nmi_mask(vcpu, true); + return 0; + } + + if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && + nested_exit_on_intr(vcpu)) { + if (vmx->nested.nested_run_pending) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); + } + + return 0; +} + +static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) +{ + ktime_t remaining = + hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); + u64 value; + + if (ktime_to_ns(remaining) <= 0) + return 0; + + value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; + do_div(value, 1000000); + return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; +} + /* * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), @@ -8225,10 +8330,13 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, else vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; - if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) && - (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) - vmcs12->vmx_preemption_timer_value = - vmcs_read32(VMX_PREEMPTION_TIMER_VALUE); + if (nested_cpu_has_preemption_timer(vmcs12)) { + if (vmcs12->vm_exit_controls & + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) + vmcs12->vmx_preemption_timer_value = + vmx_get_preemption_timer_value(vcpu); + hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); + } /* * In some cases (usually, nested EPT), L2 is allowed to change its @@ -8260,6 +8368,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); + if (vmx_mpx_supported()) + vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); /* update exit information fields: */ @@ -8369,6 +8479,10 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); + /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ + if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) + vmcs_write64(GUEST_BNDCFGS, 0); + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); vcpu->arch.pat = vmcs12->host_ia32_pat; @@ -8495,6 +8609,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, nested_vmx_succeed(vcpu); if (enable_shadow_vmcs) vmx->nested.sync_shadow_vmcs = true; + + /* in case we halted in L2 */ + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } /* @@ -8573,6 +8690,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .get_dr6 = vmx_get_dr6, .set_dr6 = vmx_set_dr6, .set_dr7 = vmx_set_dr7, + .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, @@ -8634,6 +8752,9 @@ static struct kvm_x86_ops vmx_x86_ops = { .check_intercept = vmx_check_intercept, .handle_external_intr = vmx_handle_external_intr, + .mpx_supported = vmx_mpx_supported, + + .check_nested_events = vmx_check_nested_events, }; static int __init vmx_init(void) @@ -8721,6 +8842,8 @@ static int __init vmx_init(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); + vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); + memcpy(vmx_msr_bitmap_legacy_x2apic, vmx_msr_bitmap_legacy, PAGE_SIZE); memcpy(vmx_msr_bitmap_longmode_x2apic, |