diff options
Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r-- | arch/x86/kvm/x86.c | 576 |
1 files changed, 395 insertions, 181 deletions
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b0c47b41c264..4bd5f8a751de 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -173,8 +173,13 @@ bool __read_mostly enable_vmware_backdoor = false; module_param(enable_vmware_backdoor, bool, S_IRUGO); EXPORT_SYMBOL_GPL(enable_vmware_backdoor); -static bool __read_mostly force_emulation_prefix = false; -module_param(force_emulation_prefix, bool, S_IRUGO); +/* + * Flags to manipulate forced emulation behavior (any non-zero value will + * enable forced emulation). + */ +#define KVM_FEP_CLEAR_RFLAGS_RF BIT(1) +static int __read_mostly force_emulation_prefix; +module_param(force_emulation_prefix, int, 0644); int __read_mostly pi_inject_timer = -1; module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR); @@ -528,6 +533,7 @@ static int exception_class(int vector) #define EXCPT_TRAP 1 #define EXCPT_ABORT 2 #define EXCPT_INTERRUPT 3 +#define EXCPT_DB 4 static int exception_type(int vector) { @@ -538,8 +544,14 @@ static int exception_type(int vector) mask = 1 << vector; - /* #DB is trap, as instruction watchpoints are handled elsewhere */ - if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR))) + /* + * #DBs can be trap-like or fault-like, the caller must check other CPU + * state, e.g. DR6, to determine whether a #DB is a trap or fault. + */ + if (mask & (1 << DB_VECTOR)) + return EXCPT_DB; + + if (mask & ((1 << BP_VECTOR) | (1 << OF_VECTOR))) return EXCPT_TRAP; if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR))) @@ -549,16 +561,13 @@ static int exception_type(int vector) return EXCPT_FAULT; } -void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu) +void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu, + struct kvm_queued_exception *ex) { - unsigned nr = vcpu->arch.exception.nr; - bool has_payload = vcpu->arch.exception.has_payload; - unsigned long payload = vcpu->arch.exception.payload; - - if (!has_payload) + if (!ex->has_payload) return; - switch (nr) { + switch (ex->vector) { case DB_VECTOR: /* * "Certain debug exceptions may clear bit 0-3. The @@ -583,8 +592,8 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu) * So they need to be flipped for DR6. */ vcpu->arch.dr6 |= DR6_ACTIVE_LOW; - vcpu->arch.dr6 |= payload; - vcpu->arch.dr6 ^= payload & DR6_ACTIVE_LOW; + vcpu->arch.dr6 |= ex->payload; + vcpu->arch.dr6 ^= ex->payload & DR6_ACTIVE_LOW; /* * The #DB payload is defined as compatible with the 'pending @@ -595,15 +604,30 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu) vcpu->arch.dr6 &= ~BIT(12); break; case PF_VECTOR: - vcpu->arch.cr2 = payload; + vcpu->arch.cr2 = ex->payload; break; } - vcpu->arch.exception.has_payload = false; - vcpu->arch.exception.payload = 0; + ex->has_payload = false; + ex->payload = 0; } EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload); +static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vector, + bool has_error_code, u32 error_code, + bool has_payload, unsigned long payload) +{ + struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; + + ex->vector = vector; + ex->injected = false; + ex->pending = true; + ex->has_error_code = has_error_code; + ex->error_code = error_code; + ex->has_payload = has_payload; + ex->payload = payload; +} + static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned nr, bool has_error, u32 error_code, bool has_payload, unsigned long payload, bool reinject) @@ -613,18 +637,31 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, kvm_make_request(KVM_REQ_EVENT, vcpu); + /* + * If the exception is destined for L2 and isn't being reinjected, + * morph it to a VM-Exit if L1 wants to intercept the exception. A + * previously injected exception is not checked because it was checked + * when it was original queued, and re-checking is incorrect if _L1_ + * injected the exception, in which case it's exempt from interception. + */ + if (!reinject && is_guest_mode(vcpu) && + kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) { + kvm_queue_exception_vmexit(vcpu, nr, has_error, error_code, + has_payload, payload); + return; + } + if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) { queue: if (reinject) { /* - * On vmentry, vcpu->arch.exception.pending is only - * true if an event injection was blocked by - * nested_run_pending. In that case, however, - * vcpu_enter_guest requests an immediate exit, - * and the guest shouldn't proceed far enough to - * need reinjection. + * On VM-Entry, an exception can be pending if and only + * if event injection was blocked by nested_run_pending. + * In that case, however, vcpu_enter_guest() requests an + * immediate exit, and the guest shouldn't proceed far + * enough to need reinjection. */ - WARN_ON_ONCE(vcpu->arch.exception.pending); + WARN_ON_ONCE(kvm_is_exception_pending(vcpu)); vcpu->arch.exception.injected = true; if (WARN_ON_ONCE(has_payload)) { /* @@ -639,17 +676,18 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, vcpu->arch.exception.injected = false; } vcpu->arch.exception.has_error_code = has_error; - vcpu->arch.exception.nr = nr; + vcpu->arch.exception.vector = nr; vcpu->arch.exception.error_code = error_code; vcpu->arch.exception.has_payload = has_payload; vcpu->arch.exception.payload = payload; if (!is_guest_mode(vcpu)) - kvm_deliver_exception_payload(vcpu); + kvm_deliver_exception_payload(vcpu, + &vcpu->arch.exception); return; } /* to check exception */ - prev_nr = vcpu->arch.exception.nr; + prev_nr = vcpu->arch.exception.vector; if (prev_nr == DF_VECTOR) { /* triple fault -> shutdown */ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); @@ -657,25 +695,22 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, } class1 = exception_class(prev_nr); class2 = exception_class(nr); - if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) - || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { + if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) || + (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { /* - * Generate double fault per SDM Table 5-5. Set - * exception.pending = true so that the double fault - * can trigger a nested vmexit. + * Synthesize #DF. Clear the previously injected or pending + * exception so as not to incorrectly trigger shutdown. */ - vcpu->arch.exception.pending = true; vcpu->arch.exception.injected = false; - vcpu->arch.exception.has_error_code = true; - vcpu->arch.exception.nr = DF_VECTOR; - vcpu->arch.exception.error_code = 0; - vcpu->arch.exception.has_payload = false; - vcpu->arch.exception.payload = 0; - } else + vcpu->arch.exception.pending = false; + + kvm_queue_exception_e(vcpu, DF_VECTOR, 0); + } else { /* replace previous exception with a new one in a hope that instruction re-execution will regenerate lost exception */ goto queue; + } } void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) @@ -729,20 +764,22 @@ static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err) void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { ++vcpu->stat.pf_guest; - vcpu->arch.exception.nested_apf = - is_guest_mode(vcpu) && fault->async_page_fault; - if (vcpu->arch.exception.nested_apf) { - vcpu->arch.apf.nested_apf_token = fault->address; - kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); - } else { + + /* + * Async #PF in L2 is always forwarded to L1 as a VM-Exit regardless of + * whether or not L1 wants to intercept "regular" #PF. + */ + if (is_guest_mode(vcpu) && fault->async_page_fault) + kvm_queue_exception_vmexit(vcpu, PF_VECTOR, + true, fault->error_code, + true, fault->address); + else kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code, fault->address); - } } EXPORT_SYMBOL_GPL(kvm_inject_page_fault); -/* Returns true if the page fault was immediately morphed into a VM-Exit. */ -bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, +void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { struct kvm_mmu *fault_mmu; @@ -760,26 +797,7 @@ bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address, fault_mmu->root.hpa); - /* - * A workaround for KVM's bad exception handling. If KVM injected an - * exception into L2, and L2 encountered a #PF while vectoring the - * injected exception, manually check to see if L1 wants to intercept - * #PF, otherwise queuing the #PF will lead to #DF or a lost exception. - * In all other cases, defer the check to nested_ops->check_events(), - * which will correctly handle priority (this does not). Note, other - * exceptions, e.g. #GP, are theoretically affected, #PF is simply the - * most problematic, e.g. when L0 and L1 are both intercepting #PF for - * shadow paging. - * - * TODO: Rewrite exception handling to track injected and pending - * (VM-Exit) exceptions separately. - */ - if (unlikely(vcpu->arch.exception.injected && is_guest_mode(vcpu)) && - kvm_x86_ops.nested_ops->handle_page_fault_workaround(vcpu, fault)) - return true; - fault_mmu->inject_page_fault(vcpu, fault); - return false; } EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault); @@ -4841,7 +4859,7 @@ static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu) return (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_accept_dm_intr(vcpu) && !kvm_event_needs_reinjection(vcpu) && - !vcpu->arch.exception.pending); + !kvm_is_exception_pending(vcpu)); } static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, @@ -5016,25 +5034,38 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { + struct kvm_queued_exception *ex; + process_nmi(vcpu); if (kvm_check_request(KVM_REQ_SMI, vcpu)) process_smi(vcpu); /* - * In guest mode, payload delivery should be deferred, - * so that the L1 hypervisor can intercept #PF before - * CR2 is modified (or intercept #DB before DR6 is - * modified under nVMX). Unless the per-VM capability, - * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of - * an exception payload and handle after a KVM_GET_VCPU_EVENTS. Since we - * opportunistically defer the exception payload, deliver it if the - * capability hasn't been requested before processing a - * KVM_GET_VCPU_EVENTS. + * KVM's ABI only allows for one exception to be migrated. Luckily, + * the only time there can be two queued exceptions is if there's a + * non-exiting _injected_ exception, and a pending exiting exception. + * In that case, ignore the VM-Exiting exception as it's an extension + * of the injected exception. + */ + if (vcpu->arch.exception_vmexit.pending && + !vcpu->arch.exception.pending && + !vcpu->arch.exception.injected) + ex = &vcpu->arch.exception_vmexit; + else + ex = &vcpu->arch.exception; + + /* + * In guest mode, payload delivery should be deferred if the exception + * will be intercepted by L1, e.g. KVM should not modifying CR2 if L1 + * intercepts #PF, ditto for DR6 and #DBs. If the per-VM capability, + * KVM_CAP_EXCEPTION_PAYLOAD, is not set, userspace may or may not + * propagate the payload and so it cannot be safely deferred. Deliver + * the payload if the capability hasn't been requested. */ if (!vcpu->kvm->arch.exception_payload_enabled && - vcpu->arch.exception.pending && vcpu->arch.exception.has_payload) - kvm_deliver_exception_payload(vcpu); + ex->pending && ex->has_payload) + kvm_deliver_exception_payload(vcpu, ex); /* * The API doesn't provide the instruction length for software @@ -5042,26 +5073,25 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, * isn't advanced, we should expect to encounter the exception * again. */ - if (kvm_exception_is_soft(vcpu->arch.exception.nr)) { + if (kvm_exception_is_soft(ex->vector)) { events->exception.injected = 0; events->exception.pending = 0; } else { - events->exception.injected = vcpu->arch.exception.injected; - events->exception.pending = vcpu->arch.exception.pending; + events->exception.injected = ex->injected; + events->exception.pending = ex->pending; /* * For ABI compatibility, deliberately conflate * pending and injected exceptions when * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled. */ if (!vcpu->kvm->arch.exception_payload_enabled) - events->exception.injected |= - vcpu->arch.exception.pending; + events->exception.injected |= ex->pending; } - events->exception.nr = vcpu->arch.exception.nr; - events->exception.has_error_code = vcpu->arch.exception.has_error_code; - events->exception.error_code = vcpu->arch.exception.error_code; - events->exception_has_payload = vcpu->arch.exception.has_payload; - events->exception_payload = vcpu->arch.exception.payload; + events->exception.nr = ex->vector; + events->exception.has_error_code = ex->has_error_code; + events->exception.error_code = ex->error_code; + events->exception_has_payload = ex->has_payload; + events->exception_payload = ex->payload; events->interrupt.injected = vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft; @@ -5131,9 +5161,22 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, return -EINVAL; process_nmi(vcpu); + + /* + * Flag that userspace is stuffing an exception, the next KVM_RUN will + * morph the exception to a VM-Exit if appropriate. Do this only for + * pending exceptions, already-injected exceptions are not subject to + * intercpetion. Note, userspace that conflates pending and injected + * is hosed, and will incorrectly convert an injected exception into a + * pending exception, which in turn may cause a spurious VM-Exit. + */ + vcpu->arch.exception_from_userspace = events->exception.pending; + + vcpu->arch.exception_vmexit.pending = false; + vcpu->arch.exception.injected = events->exception.injected; vcpu->arch.exception.pending = events->exception.pending; - vcpu->arch.exception.nr = events->exception.nr; + vcpu->arch.exception.vector = events->exception.nr; vcpu->arch.exception.has_error_code = events->exception.has_error_code; vcpu->arch.exception.error_code = events->exception.error_code; vcpu->arch.exception.has_payload = events->exception_has_payload; @@ -7257,6 +7300,7 @@ static int kvm_can_emulate_insn(struct kvm_vcpu *vcpu, int emul_type, int handle_ud(struct kvm_vcpu *vcpu) { static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX }; + int fep_flags = READ_ONCE(force_emulation_prefix); int emul_type = EMULTYPE_TRAP_UD; char sig[5]; /* ud2; .ascii "kvm" */ struct x86_exception e; @@ -7264,10 +7308,12 @@ int handle_ud(struct kvm_vcpu *vcpu) if (unlikely(!kvm_can_emulate_insn(vcpu, emul_type, NULL, 0))) return 1; - if (force_emulation_prefix && + if (fep_flags && kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu), sig, sizeof(sig), &e) == 0 && memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) { + if (fep_flags & KVM_FEP_CLEAR_RFLAGS_RF) + kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) & ~X86_EFLAGS_RF); kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig)); emul_type = EMULTYPE_TRAP_UD_FORCED; } @@ -7933,14 +7979,20 @@ static int emulator_get_msr_with_filter(struct x86_emulate_ctxt *ctxt, int r; r = kvm_get_msr_with_filter(vcpu, msr_index, pdata); + if (r < 0) + return X86EMUL_UNHANDLEABLE; - if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0, - complete_emulated_rdmsr, r)) { - /* Bounce to user space */ - return X86EMUL_IO_NEEDED; + if (r) { + if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0, + complete_emulated_rdmsr, r)) + return X86EMUL_IO_NEEDED; + + trace_kvm_msr_read_ex(msr_index); + return X86EMUL_PROPAGATE_FAULT; } - return r; + trace_kvm_msr_read(msr_index, *pdata); + return X86EMUL_CONTINUE; } static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt, @@ -7950,14 +8002,20 @@ static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt, int r; r = kvm_set_msr_with_filter(vcpu, msr_index, data); + if (r < 0) + return X86EMUL_UNHANDLEABLE; - if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data, - complete_emulated_msr_access, r)) { - /* Bounce to user space */ - return X86EMUL_IO_NEEDED; + if (r) { + if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data, + complete_emulated_msr_access, r)) + return X86EMUL_IO_NEEDED; + + trace_kvm_msr_write_ex(msr_index, data); + return X86EMUL_PROPAGATE_FAULT; } - return r; + trace_kvm_msr_write(msr_index, data); + return X86EMUL_CONTINUE; } static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, @@ -8161,18 +8219,17 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) } } -static bool inject_emulated_exception(struct kvm_vcpu *vcpu) +static void inject_emulated_exception(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; - if (ctxt->exception.vector == PF_VECTOR) - return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception); - if (ctxt->exception.error_code_valid) + if (ctxt->exception.vector == PF_VECTOR) + kvm_inject_emulated_page_fault(vcpu, &ctxt->exception); + else if (ctxt->exception.error_code_valid) kvm_queue_exception_e(vcpu, ctxt->exception.vector, ctxt->exception.error_code); else kvm_queue_exception(vcpu, ctxt->exception.vector); - return false; } static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu) @@ -8548,8 +8605,46 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction); -static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, int *r) +static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu) { + u32 shadow; + + if (kvm_get_rflags(vcpu) & X86_EFLAGS_RF) + return true; + + /* + * Intel CPUs inhibit code #DBs when MOV/POP SS blocking is active, + * but AMD CPUs do not. MOV/POP SS blocking is rare, check that first + * to avoid the relatively expensive CPUID lookup. + */ + shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); + return (shadow & KVM_X86_SHADOW_INT_MOV_SS) && + guest_cpuid_is_intel(vcpu); +} + +static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, + int emulation_type, int *r) +{ + WARN_ON_ONCE(emulation_type & EMULTYPE_NO_DECODE); + + /* + * Do not check for code breakpoints if hardware has already done the + * checks, as inferred from the emulation type. On NO_DECODE and SKIP, + * the instruction has passed all exception checks, and all intercepted + * exceptions that trigger emulation have lower priority than code + * breakpoints, i.e. the fact that the intercepted exception occurred + * means any code breakpoints have already been serviced. + * + * Note, KVM needs to check for code #DBs on EMULTYPE_TRAP_UD_FORCED as + * hardware has checked the RIP of the magic prefix, but not the RIP of + * the instruction being emulated. The intent of forced emulation is + * to behave as if KVM intercepted the instruction without an exception + * and without a prefix. + */ + if (emulation_type & (EMULTYPE_NO_DECODE | EMULTYPE_SKIP | + EMULTYPE_TRAP_UD | EMULTYPE_VMWARE_GP | EMULTYPE_PF)) + return false; + if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) && (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) { struct kvm_run *kvm_run = vcpu->run; @@ -8569,7 +8664,7 @@ static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, int *r) } if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) && - !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) { + !kvm_is_code_breakpoint_inhibited(vcpu)) { unsigned long eip = kvm_get_linear_rip(vcpu); u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0, vcpu->arch.dr7, @@ -8671,8 +8766,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * are fault-like and are higher priority than any faults on * the code fetch itself. */ - if (!(emulation_type & EMULTYPE_SKIP) && - kvm_vcpu_check_code_breakpoint(vcpu, &r)) + if (kvm_vcpu_check_code_breakpoint(vcpu, emulation_type, &r)) return r; r = x86_decode_emulated_instruction(vcpu, emulation_type, @@ -8770,8 +8864,7 @@ restart: if (ctxt->have_exception) { r = 1; - if (inject_emulated_exception(vcpu)) - return r; + inject_emulated_exception(vcpu); } else if (vcpu->arch.pio.count) { if (!vcpu->arch.pio.in) { /* FIXME: return into emulator if single-stepping. */ @@ -8801,6 +8894,12 @@ writeback: unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; + + /* + * Note, EXCPT_DB is assumed to be fault-like as the emulator + * only supports code breakpoints and general detect #DB, both + * of which are fault-like. + */ if (!ctxt->have_exception || exception_type(ctxt->exception.vector) == EXCPT_TRAP) { kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS); @@ -9662,74 +9761,155 @@ int kvm_check_nested_events(struct kvm_vcpu *vcpu) static void kvm_inject_exception(struct kvm_vcpu *vcpu) { - trace_kvm_inj_exception(vcpu->arch.exception.nr, + trace_kvm_inj_exception(vcpu->arch.exception.vector, vcpu->arch.exception.has_error_code, vcpu->arch.exception.error_code, vcpu->arch.exception.injected); if (vcpu->arch.exception.error_code && !is_protmode(vcpu)) vcpu->arch.exception.error_code = false; - static_call(kvm_x86_queue_exception)(vcpu); + static_call(kvm_x86_inject_exception)(vcpu); } -static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) +/* + * Check for any event (interrupt or exception) that is ready to be injected, + * and if there is at least one event, inject the event with the highest + * priority. This handles both "pending" events, i.e. events that have never + * been injected into the guest, and "injected" events, i.e. events that were + * injected as part of a previous VM-Enter, but weren't successfully delivered + * and need to be re-injected. + * + * Note, this is not guaranteed to be invoked on a guest instruction boundary, + * i.e. doesn't guarantee that there's an event window in the guest. KVM must + * be able to inject exceptions in the "middle" of an instruction, and so must + * also be able to re-inject NMIs and IRQs in the middle of an instruction. + * I.e. for exceptions and re-injected events, NOT invoking this on instruction + * boundaries is necessary and correct. + * + * For simplicity, KVM uses a single path to inject all events (except events + * that are injected directly from L1 to L2) and doesn't explicitly track + * instruction boundaries for asynchronous events. However, because VM-Exits + * that can occur during instruction execution typically result in KVM skipping + * the instruction or injecting an exception, e.g. instruction and exception + * intercepts, and because pending exceptions have higher priority than pending + * interrupts, KVM still honors instruction boundaries in most scenarios. + * + * But, if a VM-Exit occurs during instruction execution, and KVM does NOT skip + * the instruction or inject an exception, then KVM can incorrecty inject a new + * asynchrounous event if the event became pending after the CPU fetched the + * instruction (in the guest). E.g. if a page fault (#PF, #NPF, EPT violation) + * occurs and is resolved by KVM, a coincident NMI, SMI, IRQ, etc... can be + * injected on the restarted instruction instead of being deferred until the + * instruction completes. + * + * In practice, this virtualization hole is unlikely to be observed by the + * guest, and even less likely to cause functional problems. To detect the + * hole, the guest would have to trigger an event on a side effect of an early + * phase of instruction execution, e.g. on the instruction fetch from memory. + * And for it to be a functional problem, the guest would need to depend on the + * ordering between that side effect, the instruction completing, _and_ the + * delivery of the asynchronous event. + */ +static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu, + bool *req_immediate_exit) { + bool can_inject; int r; - bool can_inject = true; - /* try to reinject previous events if any */ + /* + * Process nested events first, as nested VM-Exit supercedes event + * re-injection. If there's an event queued for re-injection, it will + * be saved into the appropriate vmc{b,s}12 fields on nested VM-Exit. + */ + if (is_guest_mode(vcpu)) + r = kvm_check_nested_events(vcpu); + else + r = 0; - if (vcpu->arch.exception.injected) { - kvm_inject_exception(vcpu); - can_inject = false; - } /* - * Do not inject an NMI or interrupt if there is a pending - * exception. Exceptions and interrupts are recognized at - * instruction boundaries, i.e. the start of an instruction. - * Trap-like exceptions, e.g. #DB, have higher priority than - * NMIs and interrupts, i.e. traps are recognized before an - * NMI/interrupt that's pending on the same instruction. - * Fault-like exceptions, e.g. #GP and #PF, are the lowest - * priority, but are only generated (pended) during instruction - * execution, i.e. a pending fault-like exception means the - * fault occurred on the *previous* instruction and must be - * serviced prior to recognizing any new events in order to - * fully complete the previous instruction. + * Re-inject exceptions and events *especially* if immediate entry+exit + * to/from L2 is needed, as any event that has already been injected + * into L2 needs to complete its lifecycle before injecting a new event. + * + * Don't re-inject an NMI or interrupt if there is a pending exception. + * This collision arises if an exception occurred while vectoring the + * injected event, KVM intercepted said exception, and KVM ultimately + * determined the fault belongs to the guest and queues the exception + * for injection back into the guest. + * + * "Injected" interrupts can also collide with pending exceptions if + * userspace ignores the "ready for injection" flag and blindly queues + * an interrupt. In that case, prioritizing the exception is correct, + * as the exception "occurred" before the exit to userspace. Trap-like + * exceptions, e.g. most #DBs, have higher priority than interrupts. + * And while fault-like exceptions, e.g. #GP and #PF, are the lowest + * priority, they're only generated (pended) during instruction + * execution, and interrupts are recognized at instruction boundaries. + * Thus a pending fault-like exception means the fault occurred on the + * *previous* instruction and must be serviced prior to recognizing any + * new events in order to fully complete the previous instruction. */ - else if (!vcpu->arch.exception.pending) { - if (vcpu->arch.nmi_injected) { - static_call(kvm_x86_inject_nmi)(vcpu); - can_inject = false; - } else if (vcpu->arch.interrupt.injected) { - static_call(kvm_x86_inject_irq)(vcpu, true); - can_inject = false; - } - } + if (vcpu->arch.exception.injected) + kvm_inject_exception(vcpu); + else if (kvm_is_exception_pending(vcpu)) + ; /* see above */ + else if (vcpu->arch.nmi_injected) + static_call(kvm_x86_inject_nmi)(vcpu); + else if (vcpu->arch.interrupt.injected) + static_call(kvm_x86_inject_irq)(vcpu, true); + /* + * Exceptions that morph to VM-Exits are handled above, and pending + * exceptions on top of injected exceptions that do not VM-Exit should + * either morph to #DF or, sadly, override the injected exception. + */ WARN_ON_ONCE(vcpu->arch.exception.injected && vcpu->arch.exception.pending); /* - * Call check_nested_events() even if we reinjected a previous event - * in order for caller to determine if it should require immediate-exit - * from L2 to L1 due to pending L1 events which require exit - * from L2 to L1. + * Bail if immediate entry+exit to/from the guest is needed to complete + * nested VM-Enter or event re-injection so that a different pending + * event can be serviced (or if KVM needs to exit to userspace). + * + * Otherwise, continue processing events even if VM-Exit occurred. The + * VM-Exit will have cleared exceptions that were meant for L2, but + * there may now be events that can be injected into L1. */ - if (is_guest_mode(vcpu)) { - r = kvm_check_nested_events(vcpu); - if (r < 0) - goto out; - } + if (r < 0) + goto out; + + /* + * A pending exception VM-Exit should either result in nested VM-Exit + * or force an immediate re-entry and exit to/from L2, and exception + * VM-Exits cannot be injected (flag should _never_ be set). + */ + WARN_ON_ONCE(vcpu->arch.exception_vmexit.injected || + vcpu->arch.exception_vmexit.pending); + + /* + * New events, other than exceptions, cannot be injected if KVM needs + * to re-inject a previous event. See above comments on re-injecting + * for why pending exceptions get priority. + */ + can_inject = !kvm_event_needs_reinjection(vcpu); - /* try to inject new event if pending */ if (vcpu->arch.exception.pending) { - if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT) + /* + * Fault-class exceptions, except #DBs, set RF=1 in the RFLAGS + * value pushed on the stack. Trap-like exception and all #DBs + * leave RF as-is (KVM follows Intel's behavior in this regard; + * AMD states that code breakpoint #DBs excplitly clear RF=0). + * + * Note, most versions of Intel's SDM and AMD's APM incorrectly + * describe the behavior of General Detect #DBs, which are + * fault-like. They do _not_ set RF, a la code breakpoints. + */ + if (exception_type(vcpu->arch.exception.vector) == EXCPT_FAULT) __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) | X86_EFLAGS_RF); - if (vcpu->arch.exception.nr == DB_VECTOR) { - kvm_deliver_exception_payload(vcpu); + if (vcpu->arch.exception.vector == DB_VECTOR) { + kvm_deliver_exception_payload(vcpu, &vcpu->arch.exception); if (vcpu->arch.dr7 & DR7_GD) { vcpu->arch.dr7 &= ~DR7_GD; kvm_update_dr7(vcpu); @@ -9801,11 +9981,11 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) } if (is_guest_mode(vcpu) && - kvm_x86_ops.nested_ops->hv_timer_pending && - kvm_x86_ops.nested_ops->hv_timer_pending(vcpu)) + kvm_x86_ops.nested_ops->has_events && + kvm_x86_ops.nested_ops->has_events(vcpu)) *req_immediate_exit = true; - WARN_ON(vcpu->arch.exception.pending); + WARN_ON(kvm_is_exception_pending(vcpu)); return 0; out: @@ -10110,7 +10290,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) * When APICv gets disabled, we may still have injected interrupts * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was * still active when the interrupt got accepted. Make sure - * inject_pending_event() is called to check for that. + * kvm_check_and_inject_events() is called to check for that. */ if (!apic->apicv_active) kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -10407,7 +10587,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto out; } - r = inject_pending_event(vcpu, &req_immediate_exit); + r = kvm_check_and_inject_events(vcpu, &req_immediate_exit); if (r < 0) { r = 0; goto out; @@ -10646,10 +10826,26 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu) if (hv_timer) kvm_lapic_switch_to_hv_timer(vcpu); - if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) + /* + * If the vCPU is not runnable, a signal or another host event + * of some kind is pending; service it without changing the + * vCPU's activity state. + */ + if (!kvm_arch_vcpu_runnable(vcpu)) return 1; } + /* + * Evaluate nested events before exiting the halted state. This allows + * the halt state to be recorded properly in the VMCS12's activity + * state field (AMD does not have a similar field and a VM-Exit always + * causes a spurious wakeup from HLT). + */ + if (is_guest_mode(vcpu)) { + if (kvm_check_nested_events(vcpu) < 0) + return 0; + } + if (kvm_apic_accept_events(vcpu) < 0) return 0; switch(vcpu->arch.mp_state) { @@ -10673,9 +10869,6 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu) static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu)) - kvm_check_nested_events(vcpu); - return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && !vcpu->arch.apf.halted); } @@ -10824,6 +11017,7 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { + struct kvm_queued_exception *ex = &vcpu->arch.exception; struct kvm_run *kvm_run = vcpu->run; int r; @@ -10852,7 +11046,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) r = 0; goto out; } - kvm_clear_request(KVM_REQ_UNHALT, vcpu); r = -EAGAIN; if (signal_pending(current)) { r = -EINTR; @@ -10882,6 +11075,21 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) } } + /* + * If userspace set a pending exception and L2 is active, convert it to + * a pending VM-Exit if L1 wants to intercept the exception. + */ + if (vcpu->arch.exception_from_userspace && is_guest_mode(vcpu) && + kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, ex->vector, + ex->error_code)) { + kvm_queue_exception_vmexit(vcpu, ex->vector, + ex->has_error_code, ex->error_code, + ex->has_payload, ex->payload); + ex->injected = false; + ex->pending = false; + } + vcpu->arch.exception_from_userspace = false; + if (unlikely(vcpu->arch.complete_userspace_io)) { int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io; vcpu->arch.complete_userspace_io = NULL; @@ -10988,6 +11196,7 @@ static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED); vcpu->arch.exception.pending = false; + vcpu->arch.exception_vmexit.pending = false; kvm_make_request(KVM_REQ_EVENT, vcpu); } @@ -11125,11 +11334,12 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, } /* - * KVM_MP_STATE_INIT_RECEIVED means the processor is in - * INIT state; latched init should be reported using - * KVM_SET_VCPU_EVENTS, so reject it here. + * Pending INITs are reported using KVM_SET_VCPU_EVENTS, disallow + * forcing the guest into INIT/SIPI if those events are supposed to be + * blocked. KVM prioritizes SMI over INIT, so reject INIT/SIPI state + * if an SMI is pending as well. */ - if ((kvm_vcpu_latch_init(vcpu) || vcpu->arch.smi_pending) && + if ((!kvm_apic_init_sipi_allowed(vcpu) || vcpu->arch.smi_pending) && (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED || mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED)) goto out; @@ -11368,7 +11578,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { r = -EBUSY; - if (vcpu->arch.exception.pending) + if (kvm_is_exception_pending(vcpu)) goto out; if (dbg->control & KVM_GUESTDBG_INJECT_DB) kvm_queue_exception(vcpu, DB_VECTOR); @@ -11750,8 +11960,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate; /* - * To avoid have the INIT path from kvm_apic_has_events() that be - * called with loaded FPU and does not let userspace fix the state. + * All paths that lead to INIT are required to load the guest's + * FPU state (because most paths are buried in KVM_RUN). */ if (init_event) kvm_put_guest_fpu(vcpu); @@ -12080,6 +12290,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (ret) goto out_page_track; + ret = static_call(kvm_x86_vm_init)(kvm); + if (ret) + goto out_uninit_mmu; + INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); atomic_set(&kvm->arch.noncoherent_dma_count, 0); @@ -12115,8 +12329,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_hv_init_vm(kvm); kvm_xen_init_vm(kvm); - return static_call(kvm_x86_vm_init)(kvm); + return 0; +out_uninit_mmu: + kvm_mmu_uninit_vm(kvm); out_page_track: kvm_page_track_cleanup(kvm); out: @@ -12589,13 +12805,14 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (!list_empty_careful(&vcpu->async_pf.done)) return true; - if (kvm_apic_has_events(vcpu)) + if (kvm_apic_has_pending_init_or_sipi(vcpu) && + kvm_apic_init_sipi_allowed(vcpu)) return true; if (vcpu->arch.pv.pv_unhalted) return true; - if (vcpu->arch.exception.pending) + if (kvm_is_exception_pending(vcpu)) return true; if (kvm_test_request(KVM_REQ_NMI, vcpu) || @@ -12617,16 +12834,13 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) return true; if (is_guest_mode(vcpu) && - kvm_x86_ops.nested_ops->hv_timer_pending && - kvm_x86_ops.nested_ops->hv_timer_pending(vcpu)) + kvm_x86_ops.nested_ops->has_events && + kvm_x86_ops.nested_ops->has_events(vcpu)) return true; if (kvm_xen_has_pending_events(vcpu)) return true; - if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu)) - return true; - return false; } @@ -12850,7 +13064,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) { if (unlikely(!lapic_in_kernel(vcpu) || kvm_event_needs_reinjection(vcpu) || - vcpu->arch.exception.pending)) + kvm_is_exception_pending(vcpu))) return false; if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu)) @@ -13401,7 +13615,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); |