Diffstat (limited to 'arch/x86/kvm/svm/nested.c')
-rw-r--r-- | arch/x86/kvm/svm/nested.c | 573
1 file changed, 339 insertions(+), 234 deletions(-)
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index fb204eaa8bb3..540d43ba2cf4 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -29,6 +29,8 @@ #include "lapic.h" #include "svm.h" +#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK + static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, struct x86_exception *fault) { @@ -92,12 +94,12 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu) static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb *hsave = svm->nested.hsave; WARN_ON(mmu_is_nested(vcpu)); vcpu->arch.mmu = &vcpu->arch.guest_mmu; - kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer, + kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01.ptr->save.cr4, + svm->vmcb01.ptr->save.efer, svm->nested.ctl.nested_cr3); vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3; vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr; @@ -123,7 +125,7 @@ void recalc_intercepts(struct vcpu_svm *svm) return; c = &svm->vmcb->control; - h = &svm->nested.hsave->control; + h = &svm->vmcb01.ptr->control; g = &svm->nested.ctl; for (i = 0; i < MAX_INTERCEPT; i++) @@ -213,44 +215,64 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) return true; } -static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) +/* + * Bits 11:0 of bitmap address are ignored by hardware + */ +static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size) { - struct vcpu_svm *svm = to_svm(vcpu); + u64 addr = PAGE_ALIGN(pa); - if (WARN_ON(!is_guest_mode(vcpu))) - return true; - - if (!nested_svm_vmrun_msrpm(svm)) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = - KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - return false; - } - - return true; + return kvm_vcpu_is_legal_gpa(vcpu, addr) && + kvm_vcpu_is_legal_gpa(vcpu, addr + size - 1); } -static bool nested_vmcb_check_controls(struct vmcb_control_area *control) +static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu, + struct vmcb_control_area *control) { - if ((vmcb_is_intercept(control, INTERCEPT_VMRUN)) == 0) + if (CC(!vmcb_is_intercept(control, INTERCEPT_VMRUN))) return false; - if (control->asid == 0) + if (CC(control->asid == 0)) return false; - if ((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && - !npt_enabled) + if (CC((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && !npt_enabled)) + return false; + + if (CC(!nested_svm_check_bitmap_pa(vcpu, control->msrpm_base_pa, + MSRPM_SIZE))) + return false; + if (CC(!nested_svm_check_bitmap_pa(vcpu, control->iopm_base_pa, + IOPM_SIZE))) return false; return true; } -static bool nested_vmcb_check_save(struct vcpu_svm *svm, struct vmcb *vmcb12) +static bool nested_vmcb_check_cr3_cr4(struct kvm_vcpu *vcpu, + struct vmcb_save_area *save) { - struct kvm_vcpu *vcpu = &svm->vcpu; - bool vmcb12_lma; + /* + * These checks are also performed by KVM_SET_SREGS, + * except that EFER.LMA is not checked by SVM against + * CR0.PG && EFER.LME. + */ + if ((save->efer & EFER_LME) && (save->cr0 & X86_CR0_PG)) { + if (CC(!(save->cr4 & X86_CR4_PAE)) || + CC(!(save->cr0 & X86_CR0_PE)) || + CC(kvm_vcpu_is_illegal_gpa(vcpu, save->cr3))) + return false; + } + + if (CC(!kvm_is_valid_cr4(vcpu, save->cr4))) + return false; + + return true; +} +/* Common checks that apply to both L1 and L2 state. 
*/ +static bool nested_vmcb_valid_sregs(struct kvm_vcpu *vcpu, + struct vmcb_save_area *save) +{ /* * FIXME: these should be done after copying the fields, * to avoid TOC/TOU races. For these save area checks @@ -258,31 +280,27 @@ static bool nested_vmcb_check_save(struct vcpu_svm *svm, struct vmcb *vmcb12) * kvm_set_cr4 handle failure; EFER_SVME is an exception * so it is force-set later in nested_prepare_vmcb_save. */ - if ((vmcb12->save.efer & EFER_SVME) == 0) + if (CC(!(save->efer & EFER_SVME))) return false; - if (((vmcb12->save.cr0 & X86_CR0_CD) == 0) && (vmcb12->save.cr0 & X86_CR0_NW)) + if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) || + CC(save->cr0 & ~0xffffffffULL)) return false; - if (!kvm_dr6_valid(vmcb12->save.dr6) || !kvm_dr7_valid(vmcb12->save.dr7)) + if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7))) return false; - vmcb12_lma = (vmcb12->save.efer & EFER_LME) && (vmcb12->save.cr0 & X86_CR0_PG); + if (!nested_vmcb_check_cr3_cr4(vcpu, save)) + return false; - if (vmcb12_lma) { - if (!(vmcb12->save.cr4 & X86_CR4_PAE) || - !(vmcb12->save.cr0 & X86_CR0_PE) || - kvm_vcpu_is_illegal_gpa(vcpu, vmcb12->save.cr3)) - return false; - } - if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4)) + if (CC(!kvm_valid_efer(vcpu, save->efer))) return false; return true; } -static void load_nested_vmcb_control(struct vcpu_svm *svm, - struct vmcb_control_area *control) +static void nested_load_control_from_vmcb12(struct vcpu_svm *svm, + struct vmcb_control_area *control) { copy_vmcb_control_area(&svm->nested.ctl, control); @@ -294,9 +312,9 @@ static void load_nested_vmcb_control(struct vcpu_svm *svm, /* * Synchronize fields that are written by the processor, so that - * they can be copied back into the nested_vmcb. + * they can be copied back into the vmcb12. */ -void sync_nested_vmcb_control(struct vcpu_svm *svm) +void nested_sync_control_from_vmcb02(struct vcpu_svm *svm) { u32 mask; svm->nested.ctl.event_inj = svm->vmcb->control.event_inj; @@ -324,8 +342,8 @@ void sync_nested_vmcb_control(struct vcpu_svm *svm) * Transfer any event that L0 or L1 wanted to inject into L2 to * EXIT_INT_INFO. */ -static void nested_vmcb_save_pending_event(struct vcpu_svm *svm, - struct vmcb *vmcb12) +static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm, + struct vmcb *vmcb12) { struct kvm_vcpu *vcpu = &svm->vcpu; u32 exit_int_info = 0; @@ -369,12 +387,12 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm) static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_npt) { - if (kvm_vcpu_is_illegal_gpa(vcpu, cr3)) + if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) return -EINVAL; if (!nested_npt && is_pae_paging(vcpu) && (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) { - if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) + if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) return -EINVAL; } @@ -393,15 +411,42 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, return 0; } -static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12) +void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm) { + if (!svm->nested.vmcb02.ptr) + return; + + /* FIXME: merge g_pat from vmcb01 and vmcb12. 
*/ + svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat; +} + +static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12) +{ + bool new_vmcb12 = false; + + nested_vmcb02_compute_g_pat(svm); + /* Load the nested guest state */ - svm->vmcb->save.es = vmcb12->save.es; - svm->vmcb->save.cs = vmcb12->save.cs; - svm->vmcb->save.ss = vmcb12->save.ss; - svm->vmcb->save.ds = vmcb12->save.ds; - svm->vmcb->save.gdtr = vmcb12->save.gdtr; - svm->vmcb->save.idtr = vmcb12->save.idtr; + if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) { + new_vmcb12 = true; + svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa; + } + + if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) { + svm->vmcb->save.es = vmcb12->save.es; + svm->vmcb->save.cs = vmcb12->save.cs; + svm->vmcb->save.ss = vmcb12->save.ss; + svm->vmcb->save.ds = vmcb12->save.ds; + svm->vmcb->save.cpl = vmcb12->save.cpl; + vmcb_mark_dirty(svm->vmcb, VMCB_SEG); + } + + if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) { + svm->vmcb->save.gdtr = vmcb12->save.gdtr; + svm->vmcb->save.idtr = vmcb12->save.idtr; + vmcb_mark_dirty(svm->vmcb, VMCB_DT); + } + kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); /* @@ -413,7 +458,9 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12) svm_set_cr0(&svm->vcpu, vmcb12->save.cr0); svm_set_cr4(&svm->vcpu, vmcb12->save.cr4); - svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2; + + svm->vcpu.arch.cr2 = vmcb12->save.cr2; + kvm_rax_write(&svm->vcpu, vmcb12->save.rax); kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp); kvm_rip_write(&svm->vcpu, vmcb12->save.rip); @@ -422,15 +469,41 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12) svm->vmcb->save.rax = vmcb12->save.rax; svm->vmcb->save.rsp = vmcb12->save.rsp; svm->vmcb->save.rip = vmcb12->save.rip; - svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1; - svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_ACTIVE_LOW; - svm->vmcb->save.cpl = vmcb12->save.cpl; + + /* These bits will be set properly on the first execution when new_vmc12 is true */ + if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) { + svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1; + svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_ACTIVE_LOW; + vmcb_mark_dirty(svm->vmcb, VMCB_DR); + } } -static void nested_prepare_vmcb_control(struct vcpu_svm *svm) +static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) { const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK; + /* + * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2, + * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes. + */ + + /* + * Also covers avic_vapic_bar, avic_backing_page, avic_logical_id, + * avic_physical_id. + */ + WARN_ON(svm->vmcb01.ptr->control.int_ctl & AVIC_ENABLE_MASK); + + /* Copied from vmcb01. msrpm_base can be overwritten later. */ + svm->vmcb->control.nested_ctl = svm->vmcb01.ptr->control.nested_ctl; + svm->vmcb->control.iopm_base_pa = svm->vmcb01.ptr->control.iopm_base_pa; + svm->vmcb->control.msrpm_base_pa = svm->vmcb01.ptr->control.msrpm_base_pa; + + /* Done at vmrun: asid. */ + + /* Also overwritten later if necessary. */ + svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; + + /* nested_cr3. 
*/ if (nested_npt_enabled(svm)) nested_svm_init_mmu_context(&svm->vcpu); @@ -439,7 +512,7 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm) svm->vmcb->control.int_ctl = (svm->nested.ctl.int_ctl & ~mask) | - (svm->nested.hsave->control.int_ctl & mask); + (svm->vmcb01.ptr->control.int_ctl & mask); svm->vmcb->control.virt_ext = svm->nested.ctl.virt_ext; svm->vmcb->control.int_vector = svm->nested.ctl.int_vector; @@ -454,17 +527,28 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm) enter_guest_mode(&svm->vcpu); /* - * Merge guest and host intercepts - must be called with vcpu in - * guest-mode to take affect here + * Merge guest and host intercepts - must be called with vcpu in + * guest-mode to take effect. */ recalc_intercepts(svm); +} - vmcb_mark_all_dirty(svm->vmcb); +static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to_vmcb) +{ + /* + * Some VMCB state is shared between L1 and L2 and thus has to be + * moved at the time of nested vmrun and vmexit. + * + * VMLOAD/VMSAVE state would also belong in this category, but KVM + * always performs VMLOAD and VMSAVE from the VMCB01. + */ + to_vmcb->save.spec_ctrl = from_vmcb->save.spec_ctrl; } -int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa, +int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, struct vmcb *vmcb12) { + struct vcpu_svm *svm = to_svm(vcpu); int ret; trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa, @@ -482,8 +566,14 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa, svm->nested.vmcb12_gpa = vmcb12_gpa; - nested_prepare_vmcb_control(svm); - nested_prepare_vmcb_save(svm, vmcb12); + + WARN_ON(svm->vmcb == svm->nested.vmcb02.ptr); + + nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr); + + svm_switch_vmcb(svm, &svm->nested.vmcb02); + nested_vmcb02_prepare_control(svm); + nested_vmcb02_prepare_save(svm, vmcb12); ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3, nested_npt_enabled(svm)); @@ -491,47 +581,48 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa, return ret; if (!npt_enabled) - svm->vcpu.arch.mmu->inject_page_fault = svm_inject_page_fault_nested; + vcpu->arch.mmu->inject_page_fault = svm_inject_page_fault_nested; svm_set_gif(svm, true); return 0; } -int nested_svm_vmrun(struct vcpu_svm *svm) +int nested_svm_vmrun(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); int ret; struct vmcb *vmcb12; - struct vmcb *hsave = svm->nested.hsave; - struct vmcb *vmcb = svm->vmcb; struct kvm_host_map map; u64 vmcb12_gpa; - if (is_smm(&svm->vcpu)) { - kvm_queue_exception(&svm->vcpu, UD_VECTOR); + ++vcpu->stat.nested_run; + + if (is_smm(vcpu)) { + kvm_queue_exception(vcpu, UD_VECTOR); return 1; } vmcb12_gpa = svm->vmcb->save.rax; - ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb12_gpa), &map); + ret = kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map); if (ret == -EINVAL) { - kvm_inject_gp(&svm->vcpu, 0); + kvm_inject_gp(vcpu, 0); return 1; } else if (ret) { - return kvm_skip_emulated_instruction(&svm->vcpu); + return kvm_skip_emulated_instruction(vcpu); } - ret = kvm_skip_emulated_instruction(&svm->vcpu); + ret = kvm_skip_emulated_instruction(vcpu); vmcb12 = map.hva; if (WARN_ON_ONCE(!svm->nested.initialized)) return -EINVAL; - load_nested_vmcb_control(svm, &vmcb12->control); + nested_load_control_from_vmcb12(svm, &vmcb12->control); - if (!nested_vmcb_check_save(svm, vmcb12) || - !nested_vmcb_check_controls(&svm->nested.ctl)) { + if (!nested_vmcb_valid_sregs(vcpu, &vmcb12->save) || + 
!nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) { vmcb12->control.exit_code = SVM_EXIT_ERR; vmcb12->control.exit_code_hi = 0; vmcb12->control.exit_info_1 = 0; @@ -541,36 +632,25 @@ int nested_svm_vmrun(struct vcpu_svm *svm) /* Clear internal status */ - kvm_clear_exception_queue(&svm->vcpu); - kvm_clear_interrupt_queue(&svm->vcpu); + kvm_clear_exception_queue(vcpu); + kvm_clear_interrupt_queue(vcpu); /* - * Save the old vmcb, so we don't need to pick what we save, but can - * restore everything when a VMEXIT occurs + * Since vmcb01 is not in use, we can use it to store some of the L1 + * state. */ - hsave->save.es = vmcb->save.es; - hsave->save.cs = vmcb->save.cs; - hsave->save.ss = vmcb->save.ss; - hsave->save.ds = vmcb->save.ds; - hsave->save.gdtr = vmcb->save.gdtr; - hsave->save.idtr = vmcb->save.idtr; - hsave->save.efer = svm->vcpu.arch.efer; - hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); - hsave->save.cr4 = svm->vcpu.arch.cr4; - hsave->save.rflags = kvm_get_rflags(&svm->vcpu); - hsave->save.rip = kvm_rip_read(&svm->vcpu); - hsave->save.rsp = vmcb->save.rsp; - hsave->save.rax = vmcb->save.rax; - if (npt_enabled) - hsave->save.cr3 = vmcb->save.cr3; - else - hsave->save.cr3 = kvm_read_cr3(&svm->vcpu); - - copy_vmcb_control_area(&hsave->control, &vmcb->control); + svm->vmcb01.ptr->save.efer = vcpu->arch.efer; + svm->vmcb01.ptr->save.cr0 = kvm_read_cr0(vcpu); + svm->vmcb01.ptr->save.cr4 = vcpu->arch.cr4; + svm->vmcb01.ptr->save.rflags = kvm_get_rflags(vcpu); + svm->vmcb01.ptr->save.rip = kvm_rip_read(vcpu); + + if (!npt_enabled) + svm->vmcb01.ptr->save.cr3 = kvm_read_cr3(vcpu); svm->nested.nested_run_pending = 1; - if (enter_svm_guest_mode(svm, vmcb12_gpa, vmcb12)) + if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12)) goto out_exit_err; if (nested_svm_vmrun_msrpm(svm)) @@ -587,7 +667,7 @@ out_exit_err: nested_svm_vmexit(svm); out: - kvm_vcpu_unmap(&svm->vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map, true); return ret; } @@ -610,27 +690,30 @@ void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) int nested_svm_vmexit(struct vcpu_svm *svm) { - int rc; + struct kvm_vcpu *vcpu = &svm->vcpu; struct vmcb *vmcb12; - struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; struct kvm_host_map map; + int rc; - rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map); + /* Triple faults in L2 should never escape. 
*/ + WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)); + + rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map); if (rc) { if (rc == -EINVAL) - kvm_inject_gp(&svm->vcpu, 0); + kvm_inject_gp(vcpu, 0); return 1; } vmcb12 = map.hva; /* Exit Guest-Mode */ - leave_guest_mode(&svm->vcpu); + leave_guest_mode(vcpu); svm->nested.vmcb12_gpa = 0; WARN_ON_ONCE(svm->nested.nested_run_pending); - kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu); + kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); /* in case we halted in L2 */ svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -644,14 +727,14 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->save.gdtr = vmcb->save.gdtr; vmcb12->save.idtr = vmcb->save.idtr; vmcb12->save.efer = svm->vcpu.arch.efer; - vmcb12->save.cr0 = kvm_read_cr0(&svm->vcpu); - vmcb12->save.cr3 = kvm_read_cr3(&svm->vcpu); + vmcb12->save.cr0 = kvm_read_cr0(vcpu); + vmcb12->save.cr3 = kvm_read_cr3(vcpu); vmcb12->save.cr2 = vmcb->save.cr2; vmcb12->save.cr4 = svm->vcpu.arch.cr4; - vmcb12->save.rflags = kvm_get_rflags(&svm->vcpu); - vmcb12->save.rip = kvm_rip_read(&svm->vcpu); - vmcb12->save.rsp = kvm_rsp_read(&svm->vcpu); - vmcb12->save.rax = kvm_rax_read(&svm->vcpu); + vmcb12->save.rflags = kvm_get_rflags(vcpu); + vmcb12->save.rip = kvm_rip_read(vcpu); + vmcb12->save.rsp = kvm_rsp_read(vcpu); + vmcb12->save.rax = kvm_rax_read(vcpu); vmcb12->save.dr7 = vmcb->save.dr7; vmcb12->save.dr6 = svm->vcpu.arch.dr6; vmcb12->save.cpl = vmcb->save.cpl; @@ -663,7 +746,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.exit_info_2 = vmcb->control.exit_info_2; if (vmcb12->control.exit_code != SVM_EXIT_ERR) - nested_vmcb_save_pending_event(svm, vmcb12); + nested_save_pending_event_to_vmcb12(svm, vmcb12); if (svm->nrips_enabled) vmcb12->control.next_rip = vmcb->control.next_rip; @@ -678,37 +761,39 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.pause_filter_thresh = svm->vmcb->control.pause_filter_thresh; - /* Restore the original control entries */ - copy_vmcb_control_area(&vmcb->control, &hsave->control); + nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr); + + svm_switch_vmcb(svm, &svm->vmcb01); + WARN_ON_ONCE(svm->vmcb->control.exit_code != SVM_EXIT_VMRUN); - /* On vmexit the GIF is set to false */ + /* + * On vmexit the GIF is set to false and + * no event can be injected in L1. 
+ */ svm_set_gif(svm, false); + svm->vmcb->control.exit_int_info = 0; - svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset = - svm->vcpu.arch.l1_tsc_offset; + svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset; + if (svm->vmcb->control.tsc_offset != svm->vcpu.arch.tsc_offset) { + svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset; + vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + } svm->nested.ctl.nested_cr3 = 0; - /* Restore selected save entries */ - svm->vmcb->save.es = hsave->save.es; - svm->vmcb->save.cs = hsave->save.cs; - svm->vmcb->save.ss = hsave->save.ss; - svm->vmcb->save.ds = hsave->save.ds; - svm->vmcb->save.gdtr = hsave->save.gdtr; - svm->vmcb->save.idtr = hsave->save.idtr; - kvm_set_rflags(&svm->vcpu, hsave->save.rflags); - kvm_set_rflags(&svm->vcpu, hsave->save.rflags | X86_EFLAGS_FIXED); - svm_set_efer(&svm->vcpu, hsave->save.efer); - svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE); - svm_set_cr4(&svm->vcpu, hsave->save.cr4); - kvm_rax_write(&svm->vcpu, hsave->save.rax); - kvm_rsp_write(&svm->vcpu, hsave->save.rsp); - kvm_rip_write(&svm->vcpu, hsave->save.rip); - svm->vmcb->save.dr7 = DR7_FIXED_1; - svm->vmcb->save.cpl = 0; - svm->vmcb->control.exit_int_info = 0; + /* + * Restore processor state that had been saved in vmcb01 + */ + kvm_set_rflags(vcpu, svm->vmcb->save.rflags); + svm_set_efer(vcpu, svm->vmcb->save.efer); + svm_set_cr0(vcpu, svm->vmcb->save.cr0 | X86_CR0_PE); + svm_set_cr4(vcpu, svm->vmcb->save.cr4); + kvm_rax_write(vcpu, svm->vmcb->save.rax); + kvm_rsp_write(vcpu, svm->vmcb->save.rsp); + kvm_rip_write(vcpu, svm->vmcb->save.rip); - vmcb_mark_all_dirty(svm->vmcb); + svm->vcpu.arch.dr7 = DR7_FIXED_1; + kvm_update_dr7(&svm->vcpu); trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code, vmcb12->control.exit_info_1, @@ -717,50 +802,62 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.exit_int_info_err, KVM_ISA_SVM); - kvm_vcpu_unmap(&svm->vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map, true); - nested_svm_uninit_mmu_context(&svm->vcpu); + nested_svm_uninit_mmu_context(vcpu); - rc = nested_svm_load_cr3(&svm->vcpu, hsave->save.cr3, false); + rc = nested_svm_load_cr3(vcpu, svm->vmcb->save.cr3, false); if (rc) return 1; - if (npt_enabled) - svm->vmcb->save.cr3 = hsave->save.cr3; - /* * Drop what we picked up for L2 via svm_complete_interrupts() so it * doesn't end up in L1. */ svm->vcpu.arch.nmi_injected = false; - kvm_clear_exception_queue(&svm->vcpu); - kvm_clear_interrupt_queue(&svm->vcpu); + kvm_clear_exception_queue(vcpu); + kvm_clear_interrupt_queue(vcpu); + + /* + * If we are here following the completion of a VMRUN that + * is being single-stepped, queue the pending #DB intercept + * right now so that it an be accounted for before we execute + * L1's next instruction. 
+ */ + if (unlikely(svm->vmcb->save.rflags & X86_EFLAGS_TF)) + kvm_queue_exception(&(svm->vcpu), DB_VECTOR); return 0; } +static void nested_svm_triple_fault(struct kvm_vcpu *vcpu) +{ + nested_svm_simple_vmexit(to_svm(vcpu), SVM_EXIT_SHUTDOWN); +} + int svm_allocate_nested(struct vcpu_svm *svm) { - struct page *hsave_page; + struct page *vmcb02_page; if (svm->nested.initialized) return 0; - hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!hsave_page) + vmcb02_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!vmcb02_page) return -ENOMEM; - svm->nested.hsave = page_address(hsave_page); + svm->nested.vmcb02.ptr = page_address(vmcb02_page); + svm->nested.vmcb02.pa = __sme_set(page_to_pfn(vmcb02_page) << PAGE_SHIFT); svm->nested.msrpm = svm_vcpu_alloc_msrpm(); if (!svm->nested.msrpm) - goto err_free_hsave; + goto err_free_vmcb02; svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm); svm->nested.initialized = true; return 0; -err_free_hsave: - __free_page(hsave_page); +err_free_vmcb02: + __free_page(vmcb02_page); return -ENOMEM; } @@ -772,8 +869,8 @@ void svm_free_nested(struct vcpu_svm *svm) svm_vcpu_free_msrpm(svm->nested.msrpm); svm->nested.msrpm = NULL; - __free_page(virt_to_page(svm->nested.hsave)); - svm->nested.hsave = NULL; + __free_page(virt_to_page(svm->nested.vmcb02.ptr)); + svm->nested.vmcb02.ptr = NULL; svm->nested.initialized = false; } @@ -783,18 +880,19 @@ void svm_free_nested(struct vcpu_svm *svm) */ void svm_leave_nested(struct vcpu_svm *svm) { - if (is_guest_mode(&svm->vcpu)) { - struct vmcb *hsave = svm->nested.hsave; - struct vmcb *vmcb = svm->vmcb; + struct kvm_vcpu *vcpu = &svm->vcpu; + if (is_guest_mode(vcpu)) { svm->nested.nested_run_pending = 0; - leave_guest_mode(&svm->vcpu); - copy_vmcb_control_area(&vmcb->control, &hsave->control); - nested_svm_uninit_mmu_context(&svm->vcpu); + leave_guest_mode(vcpu); + + svm_switch_vmcb(svm, &svm->nested.vmcb02); + + nested_svm_uninit_mmu_context(vcpu); vmcb_mark_all_dirty(svm->vmcb); } - kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu); + kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); } static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) @@ -903,16 +1001,15 @@ int nested_svm_exit_handled(struct vcpu_svm *svm) return vmexit; } -int nested_svm_check_permissions(struct vcpu_svm *svm) +int nested_svm_check_permissions(struct kvm_vcpu *vcpu) { - if (!(svm->vcpu.arch.efer & EFER_SVME) || - !is_paging(&svm->vcpu)) { - kvm_queue_exception(&svm->vcpu, UD_VECTOR); + if (!(vcpu->arch.efer & EFER_SVME) || !is_paging(vcpu)) { + kvm_queue_exception(vcpu, UD_VECTOR); return 1; } - if (svm->vmcb->save.cpl) { - kvm_inject_gp(&svm->vcpu, 0); + if (to_svm(vcpu)->vmcb->save.cpl) { + kvm_inject_gp(vcpu, 0); return 1; } @@ -960,50 +1057,11 @@ static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm) nested_svm_vmexit(svm); } -static void nested_svm_smi(struct vcpu_svm *svm) -{ - svm->vmcb->control.exit_code = SVM_EXIT_SMI; - svm->vmcb->control.exit_info_1 = 0; - svm->vmcb->control.exit_info_2 = 0; - - nested_svm_vmexit(svm); -} - -static void nested_svm_nmi(struct vcpu_svm *svm) -{ - svm->vmcb->control.exit_code = SVM_EXIT_NMI; - svm->vmcb->control.exit_info_1 = 0; - svm->vmcb->control.exit_info_2 = 0; - - nested_svm_vmexit(svm); -} - -static void nested_svm_intr(struct vcpu_svm *svm) -{ - trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); - - svm->vmcb->control.exit_code = SVM_EXIT_INTR; - svm->vmcb->control.exit_info_1 = 0; - svm->vmcb->control.exit_info_2 = 0; - - 
nested_svm_vmexit(svm); -} - static inline bool nested_exit_on_init(struct vcpu_svm *svm) { return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT); } -static void nested_svm_init(struct vcpu_svm *svm) -{ - svm->vmcb->control.exit_code = SVM_EXIT_INIT; - svm->vmcb->control.exit_info_1 = 0; - svm->vmcb->control.exit_info_2 = 0; - - nested_svm_vmexit(svm); -} - - static int svm_check_nested_events(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1017,12 +1075,18 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu) return -EBUSY; if (!nested_exit_on_init(svm)) return 0; - nested_svm_init(svm); + nested_svm_simple_vmexit(svm, SVM_EXIT_INIT); return 0; } if (vcpu->arch.exception.pending) { - if (block_nested_events) + /* + * Only a pending nested run can block a pending exception. + * Otherwise an injected NMI/interrupt should either be + * lost or delivered to the nested hypervisor in the EXITINTINFO + * vmcb field, while delivering the pending exception. + */ + if (svm->nested.nested_run_pending) return -EBUSY; if (!nested_exit_on_exception(svm)) return 0; @@ -1035,7 +1099,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu) return -EBUSY; if (!nested_exit_on_smi(svm)) return 0; - nested_svm_smi(svm); + nested_svm_simple_vmexit(svm, SVM_EXIT_SMI); return 0; } @@ -1044,7 +1108,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu) return -EBUSY; if (!nested_exit_on_nmi(svm)) return 0; - nested_svm_nmi(svm); + nested_svm_simple_vmexit(svm, SVM_EXIT_NMI); return 0; } @@ -1053,7 +1117,8 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu) return -EBUSY; if (!nested_exit_on_intr(svm)) return 0; - nested_svm_intr(svm); + trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); + nested_svm_simple_vmexit(svm, SVM_EXIT_INTR); return 0; } @@ -1072,8 +1137,8 @@ int nested_svm_exit_special(struct vcpu_svm *svm) case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); - if (get_host_vmcb(svm)->control.intercepts[INTERCEPT_EXCEPTION] & - excp_bits) + if (svm->vmcb01.ptr->control.intercepts[INTERCEPT_EXCEPTION] & + excp_bits) return NESTED_EXIT_HOST; else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR && svm->vcpu.arch.apf.host_apf_flags) @@ -1137,10 +1202,9 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu, if (copy_to_user(&user_vmcb->control, &svm->nested.ctl, sizeof(user_vmcb->control))) return -EFAULT; - if (copy_to_user(&user_vmcb->save, &svm->nested.hsave->save, + if (copy_to_user(&user_vmcb->save, &svm->vmcb01.ptr->save, sizeof(user_vmcb->save))) return -EFAULT; - out: return kvm_state.size; } @@ -1150,7 +1214,6 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state *kvm_state) { struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb *hsave = svm->nested.hsave; struct vmcb __user *user_vmcb = (struct vmcb __user *) &user_kvm_nested_state->data.svm[0]; struct vmcb_control_area *ctl; @@ -1195,8 +1258,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, return -EINVAL; ret = -ENOMEM; - ctl = kzalloc(sizeof(*ctl), GFP_KERNEL); - save = kzalloc(sizeof(*save), GFP_KERNEL); + ctl = kzalloc(sizeof(*ctl), GFP_KERNEL_ACCOUNT); + save = kzalloc(sizeof(*save), GFP_KERNEL_ACCOUNT); if (!ctl || !save) goto out_free; @@ -1207,12 +1270,12 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, goto out_free; ret = -EINVAL; - if (!nested_vmcb_check_controls(ctl)) + if (!nested_vmcb_check_controls(vcpu, ctl)) goto out_free; /* * Processor state contains L2 state. 
Check that it is - * valid for guest mode (see nested_vmcb_checks). + * valid for guest mode (see nested_vmcb_check_save). */ cr0 = kvm_read_cr0(vcpu); if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW)) @@ -1221,29 +1284,48 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, /* * Validate host state saved from before VMRUN (see * nested_svm_check_permissions). - * TODO: validate reserved bits for all saved state. */ - if (!(save->cr0 & X86_CR0_PG)) - goto out_free; - if (!(save->efer & EFER_SVME)) + if (!(save->cr0 & X86_CR0_PG) || + !(save->cr0 & X86_CR0_PE) || + (save->rflags & X86_EFLAGS_VM) || + !nested_vmcb_valid_sregs(vcpu, save)) goto out_free; /* - * All checks done, we can enter guest mode. L1 control fields - * come from the nested save state. Guest state is already - * in the registers, the save area of the nested state instead - * contains saved L1 state. + * All checks done, we can enter guest mode. Userspace provides + * vmcb12.control, which will be combined with L1 and stored into + * vmcb02, and the L1 save state which we store in vmcb01. + * L2 registers if needed are moved from the current VMCB to VMCB02. */ svm->nested.nested_run_pending = !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); - copy_vmcb_control_area(&hsave->control, &svm->vmcb->control); - hsave->save = *save; - svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa; - load_nested_vmcb_control(svm, ctl); - nested_prepare_vmcb_control(svm); + if (svm->current_vmcb == &svm->vmcb01) + svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save; + + svm->vmcb01.ptr->save.es = save->es; + svm->vmcb01.ptr->save.cs = save->cs; + svm->vmcb01.ptr->save.ss = save->ss; + svm->vmcb01.ptr->save.ds = save->ds; + svm->vmcb01.ptr->save.gdtr = save->gdtr; + svm->vmcb01.ptr->save.idtr = save->idtr; + svm->vmcb01.ptr->save.rflags = save->rflags | X86_EFLAGS_FIXED; + svm->vmcb01.ptr->save.efer = save->efer; + svm->vmcb01.ptr->save.cr0 = save->cr0; + svm->vmcb01.ptr->save.cr3 = save->cr3; + svm->vmcb01.ptr->save.cr4 = save->cr4; + svm->vmcb01.ptr->save.rax = save->rax; + svm->vmcb01.ptr->save.rsp = save->rsp; + svm->vmcb01.ptr->save.rip = save->rip; + svm->vmcb01.ptr->save.cpl = 0; + + nested_load_control_from_vmcb12(svm, ctl); + + svm_switch_vmcb(svm, &svm->nested.vmcb02); + + nested_vmcb02_prepare_control(svm); kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); ret = 0; @@ -1254,8 +1336,31 @@ out_free: return ret; } +static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (WARN_ON(!is_guest_mode(vcpu))) + return true; + + if (nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3, + nested_npt_enabled(svm))) + return false; + + if (!nested_svm_vmrun_msrpm(svm)) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = + KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return false; + } + + return true; +} + struct kvm_x86_nested_ops svm_nested_ops = { .check_events = svm_check_nested_events, + .triple_fault = nested_svm_triple_fault, .get_nested_state_pages = svm_get_nested_state_pages, .get_state = svm_get_nested_state, .set_state = svm_set_nested_state, |