diff options
Diffstat (limited to 'arch/x86/kvm/vmx/nested.c')
-rw-r--r-- | arch/x86/kvm/vmx/nested.c | 129 |
1 files changed, 79 insertions, 50 deletions
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d737a51a53ca..f24a2c225070 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -211,7 +211,6 @@ static void free_nested(struct kvm_vcpu *vcpu) if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) return; - hrtimer_cancel(&vmx->nested.preemption_timer); vmx->nested.vmxon = false; vmx->nested.smm.vmxon = false; free_vpid(vmx->nested.vpid02); @@ -274,6 +273,7 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) { vcpu_load(vcpu); + vmx_leave_nested(vcpu); vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01); free_nested(vcpu); vcpu_put(vcpu); @@ -1980,17 +1980,6 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) prepare_vmcs02_early_full(vmx, vmcs12); /* - * HOST_RSP is normally set correctly in vmx_vcpu_run() just before - * entry, but only if the current (host) sp changed from the value - * we wrote last (vmx->host_rsp). This cache is no longer relevant - * if we switch vmcs, and rather than hold a separate cache per vmcs, - * here we just force the write to happen on entry. host_rsp will - * also be written unconditionally by nested_vmx_check_vmentry_hw() - * if we are doing early consistency checks via hardware. - */ - vmx->host_rsp = 0; - - /* * PIN CONTROLS */ exec_control = vmcs12->pin_based_vm_exec_control; @@ -2289,10 +2278,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } vmx_set_rflags(vcpu, vmcs12->guest_rflags); - vmx->nested.preemption_timer_expired = false; - if (nested_cpu_has_preemption_timer(vmcs12)) - vmx_start_preemption_timer(vcpu); - /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the * bitwise-or of what L1 wants to trap for L2, and what we want to * trap. Note that CR0.TS also needs updating - we do this later. @@ -2722,6 +2707,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4; + bool vm_fail; if (!nested_early_check) return 0; @@ -2755,29 +2741,34 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->host_state.cr4 = cr4; } - vmx->__launched = vmx->loaded_vmcs->launched; - asm( - /* Set HOST_RSP */ "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ - __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" - "mov %%" _ASM_SP ", %c[host_rsp](%1)\n\t" + "cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t" + "je 1f \n\t" + __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t" + "mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t" + "1: \n\t" "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ /* Check if vmlaunch or vmresume is needed */ - "cmpl $0, %c[launched](%% " _ASM_CX")\n\t" + "cmpb $0, %c[launched](%[loaded_vmcs])\n\t" + /* + * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set + * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail + * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the + * results of VM-Enter is captured via CC_{SET,OUT} to vm_fail. + */ "call vmx_vmenter\n\t" - /* Set vmx->fail accordingly */ - "setbe %c[fail](%% " _ASM_CX")\n\t" - : ASM_CALL_CONSTRAINT - : "c"(vmx), "d"((unsigned long)HOST_RSP), - [launched]"i"(offsetof(struct vcpu_vmx, __launched)), - [fail]"i"(offsetof(struct vcpu_vmx, fail)), - [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), + CC_SET(be) + : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail) + : [HOST_RSP]"r"((unsigned long)HOST_RSP), + [loaded_vmcs]"r"(vmx->loaded_vmcs), + [launched]"i"(offsetof(struct loaded_vmcs, launched)), + [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)), [wordsize]"i"(sizeof(ulong)) - : "rax", "cc", "memory" + : "cc", "memory" ); preempt_enable(); @@ -2787,10 +2778,9 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) if (vmx->msr_autoload.guest.nr) vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); - if (vmx->fail) { + if (vm_fail) { WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != VMXERR_ENTRY_INVALID_CONTROL_FIELD); - vmx->fail = 0; return 1; } @@ -2813,8 +2803,6 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) return 0; } -STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw); - static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12); @@ -3031,6 +3019,15 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) kvm_make_request(KVM_REQ_EVENT, vcpu); /* + * Do not start the preemption timer hrtimer until after we know + * we are successful, so that only nested_vmx_vmexit needs to cancel + * the timer. + */ + vmx->nested.preemption_timer_expired = false; + if (nested_cpu_has_preemption_timer(vmcs12)) + vmx_start_preemption_timer(vcpu); + + /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet * returned as far as L1 is concerned. It will only return (and set @@ -3450,13 +3447,10 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) else vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; - if (nested_cpu_has_preemption_timer(vmcs12)) { - if (vmcs12->vm_exit_controls & - VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) + if (nested_cpu_has_preemption_timer(vmcs12) && + vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) vmcs12->vmx_preemption_timer_value = vmx_get_preemption_timer_value(vcpu); - hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); - } /* * In some cases (usually, nested EPT), L2 is allowed to change its @@ -3864,6 +3858,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, leave_guest_mode(vcpu); + if (nested_cpu_has_preemption_timer(vmcs12)) + hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) vcpu->arch.tsc_offset -= vmcs12->tsc_offset; @@ -3915,9 +3912,6 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmx_flush_tlb(vcpu, true); } - /* This is needed for same reason as it was needed in prepare_vmcs02 */ - vmx->host_rsp = 0; - /* Unpin physical memory we referred to in vmcs02 */ if (vmx->nested.apic_access_page) { kvm_release_page_dirty(vmx->nested.apic_access_page); @@ -4035,25 +4029,50 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, /* Addr = segment_base + offset */ /* offset = base + [index * scale] + displacement */ off = exit_qualification; /* holds the displacement */ + if (addr_size == 1) + off = (gva_t)sign_extend64(off, 31); + else if (addr_size == 0) + off = (gva_t)sign_extend64(off, 15); if (base_is_valid) off += kvm_register_read(vcpu, base_reg); if (index_is_valid) off += kvm_register_read(vcpu, index_reg)<<scaling; vmx_get_segment(vcpu, &s, seg_reg); - *ret = s.base + off; + /* + * The effective address, i.e. @off, of a memory operand is truncated + * based on the address size of the instruction. Note that this is + * the *effective address*, i.e. the address prior to accounting for + * the segment's base. + */ if (addr_size == 1) /* 32 bit */ - *ret &= 0xffffffff; + off &= 0xffffffff; + else if (addr_size == 0) /* 16 bit */ + off &= 0xffff; /* Checks for #GP/#SS exceptions. */ exn = false; if (is_long_mode(vcpu)) { + /* + * The virtual/linear address is never truncated in 64-bit + * mode, e.g. a 32-bit address size can yield a 64-bit virtual + * address when using FS/GS with a non-zero base. + */ + *ret = s.base + off; + /* Long mode: #GP(0)/#SS(0) if the memory address is in a * non-canonical form. This is the only check on the memory * destination for long mode! */ exn = is_noncanonical_address(*ret, vcpu); - } else if (is_protmode(vcpu)) { + } else { + /* + * When not in long mode, the virtual/linear address is + * unconditionally truncated to 32 bits regardless of the + * address size. + */ + *ret = (s.base + off) & 0xffffffff; + /* Protected mode: apply checks for segment validity in the * following order: * - segment type check (#GP(0) may be thrown) @@ -4077,10 +4096,16 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. */ exn = (s.unusable != 0); - /* Protected mode: #GP(0)/#SS(0) if the memory - * operand is outside the segment limit. + + /* + * Protected mode: #GP(0)/#SS(0) if the memory operand is + * outside the segment limit. All CPUs that support VMX ignore + * limit checks for flat segments, i.e. segments with base==0, + * limit==0xffffffff and of type expand-up data or code. */ - exn = exn || (off + sizeof(u64) > s.limit); + if (!(s.base == 0 && s.limit == 0xffffffff && + ((s.type & 8) || !(s.type & 4)))) + exn = exn || (off + sizeof(u64) > s.limit); } if (exn) { kvm_queue_exception_e(vcpu, @@ -4145,11 +4170,11 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (r < 0) goto out_vmcs02; - vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL); + vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); if (!vmx->nested.cached_vmcs12) goto out_cached_vmcs12; - vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL); + vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); if (!vmx->nested.cached_shadow_vmcs12) goto out_cached_shadow_vmcs12; @@ -5696,6 +5721,10 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) enable_shadow_vmcs = 0; if (enable_shadow_vmcs) { for (i = 0; i < VMX_BITMAP_NR; i++) { + /* + * The vmx_bitmap is not tied to a VM and so should + * not be charged to a memcg. + */ vmx_bitmap[i] = (unsigned long *) __get_free_page(GFP_KERNEL); if (!vmx_bitmap[i]) { |