diff options
38 files changed, 501 insertions, 275 deletions
diff --git a/Documentation/virt/kvm/mmu.txt b/Documentation/virt/kvm/mmu.txt index 1b9880dfba0a..dadb29e8738f 100644 --- a/Documentation/virt/kvm/mmu.txt +++ b/Documentation/virt/kvm/mmu.txt @@ -294,7 +294,7 @@ Handling a page fault is performed as follows: - walk shadow page table - check for valid generation number in the spte (see "Fast invalidation of MMIO sptes" below) - - cache the information to vcpu->arch.mmio_gva, vcpu->arch.access and + - cache the information to vcpu->arch.mmio_gva, vcpu->arch.mmio_access and vcpu->arch.mmio_gfn, and call the emulator - If both P bit and R/W bit of error code are set, this could possibly be handled as a "fast page fault" (fixed without taking the MMU lock). See @@ -304,7 +304,7 @@ Handling a page fault is performed as follows: - if permissions are insufficient, reflect the fault back to the guest - determine the host page - if this is an mmio request, there is no host page; cache the info to - vcpu->arch.mmio_gva, vcpu->arch.access and vcpu->arch.mmio_gfn + vcpu->arch.mmio_gva, vcpu->arch.mmio_access and vcpu->arch.mmio_gfn - walk the shadow page table to find the spte for the translation, instantiating missing intermediate page tables as necessary - If this is an mmio request, cache the mmio info to the spte and set some diff --git a/MAINTAINERS b/MAINTAINERS index e81e60bd7c26..0e8151151d2d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8818,14 +8818,6 @@ F: virt/kvm/* F: tools/kvm/ F: tools/testing/selftests/kvm/ -KERNEL VIRTUAL MACHINE FOR AMD-V (KVM/amd) -M: Joerg Roedel <joro@8bytes.org> -L: kvm@vger.kernel.org -W: http://www.linux-kvm.org/ -S: Maintained -F: arch/x86/include/asm/svm.h -F: arch/x86/kvm/svm.c - KERNEL VIRTUAL MACHINE FOR ARM/ARM64 (KVM/arm, KVM/arm64) M: Marc Zyngier <maz@kernel.org> R: James Morse <james.morse@arm.com> @@ -8868,7 +8860,7 @@ M: Christian Borntraeger <borntraeger@de.ibm.com> M: Janosch Frank <frankja@linux.ibm.com> R: David Hildenbrand <david@redhat.com> R: Cornelia Huck <cohuck@redhat.com> -L: linux-s390@vger.kernel.org +L: kvm@vger.kernel.org W: http://www.ibm.com/developerworks/linux/linux390/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git S: Supported @@ -8883,6 +8875,11 @@ F: tools/testing/selftests/kvm/*/s390x/ KERNEL VIRTUAL MACHINE FOR X86 (KVM/x86) M: Paolo Bonzini <pbonzini@redhat.com> M: Radim Krčmář <rkrcmar@redhat.com> +R: Sean Christopherson <sean.j.christopherson@intel.com> +R: Vitaly Kuznetsov <vkuznets@redhat.com> +R: Wanpeng Li <wanpengli@tencent.com> +R: Jim Mattson <jmattson@google.com> +R: Joerg Roedel <joro@8bytes.org> L: kvm@vger.kernel.org W: http://www.linux-kvm.org T: git git://git.kernel.org/pub/scm/virt/kvm/kvm.git @@ -8890,8 +8887,12 @@ S: Supported F: arch/x86/kvm/ F: arch/x86/kvm/*/ F: arch/x86/include/uapi/asm/kvm* +F: arch/x86/include/uapi/asm/vmx.h +F: arch/x86/include/uapi/asm/svm.h F: arch/x86/include/asm/kvm* F: arch/x86/include/asm/pvclock-abi.h +F: arch/x86/include/asm/svm.h +F: arch/x86/include/asm/vmx.h F: arch/x86/kernel/kvm.c F: arch/x86/kernel/kvmclock.c diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index e6e5f59aaa97..6fb5fb4779e0 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -232,11 +232,25 @@ struct revmap_entry { }; /* - * We use the top bit of each memslot->arch.rmap entry as a lock bit, - * and bit 32 as a present flag. The bottom 32 bits are the - * index in the guest HPT of a HPTE that points to the page. + * The rmap array of size number of guest pages is allocated for each memslot. + * This array is used to store usage specific information about the guest page. + * Below are the encodings of the various possible usage types. */ -#define KVMPPC_RMAP_LOCK_BIT 63 +/* Free bits which can be used to define a new usage */ +#define KVMPPC_RMAP_TYPE_MASK 0xff00000000000000 +#define KVMPPC_RMAP_NESTED 0xc000000000000000 /* Nested rmap array */ +#define KVMPPC_RMAP_HPT 0x0100000000000000 /* HPT guest */ + +/* + * rmap usage definition for a hash page table (hpt) guest: + * 0x0000080000000000 Lock bit + * 0x0000018000000000 RC bits + * 0x0000000100000000 Present bit + * 0x00000000ffffffff HPT index bits + * The bottom 32 bits are the index in the guest HPT of a HPTE that points to + * the page. + */ +#define KVMPPC_RMAP_LOCK_BIT 43 #define KVMPPC_RMAP_RC_SHIFT 32 #define KVMPPC_RMAP_REFERENCED (HPTE_R_R << KVMPPC_RMAP_RC_SHIFT) #define KVMPPC_RMAP_PRESENT 0x100000000ul diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2484e6a8f5ca..8e8514efb124 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -598,6 +598,7 @@ extern int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val); extern int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val); +extern bool kvmppc_xive_native_supported(void); #else static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server, diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index e4016985764e..818989e11678 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -46,7 +46,15 @@ struct xive_irq_data { /* Setup/used by frontend */ int target; + /* + * saved_p means that there is a queue entry for this interrupt + * in some CPU's queue (not including guest vcpu queues), even + * if P is not set in the source ESB. + * stale_p means that there is no queue entry for this interrupt + * in some CPU's queue, even if P is set in the source ESB. + */ bool saved_p; + bool stale_p; }; #define XIVE_IRQ_FLAG_STORE_EOI 0x01 #define XIVE_IRQ_FLAG_LSI 0x02 @@ -127,6 +135,7 @@ extern int xive_native_get_queue_state(u32 vp_id, uint32_t prio, u32 *qtoggle, extern int xive_native_set_queue_state(u32 vp_id, uint32_t prio, u32 qtoggle, u32 qindex); extern int xive_native_get_vp_state(u32 vp_id, u64 *out_state); +extern bool xive_native_has_queue_state_support(void); #else diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 9524d92bc45d..d7fcdfa7fee4 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -1083,9 +1083,11 @@ static int kvmppc_book3s_init(void) if (xics_on_xive()) { kvmppc_xive_init_module(); kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS); - kvmppc_xive_native_init_module(); - kvm_register_device_ops(&kvm_xive_native_ops, - KVM_DEV_TYPE_XIVE); + if (kvmppc_xive_native_supported()) { + kvmppc_xive_native_init_module(); + kvm_register_device_ops(&kvm_xive_native_ops, + KVM_DEV_TYPE_XIVE); + } } else #endif kvm_register_device_ops(&kvm_xics_ops, KVM_DEV_TYPE_XICS); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index cde3f5a4b3e4..f8975c620f41 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1678,7 +1678,14 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, *val = get_reg_val(id, vcpu->arch.pspb); break; case KVM_REG_PPC_DPDES: - *val = get_reg_val(id, vcpu->arch.vcore->dpdes); + /* + * On POWER9, where we are emulating msgsndp etc., + * we return 1 bit for each vcpu, which can come from + * either vcore->dpdes or doorbell_request. + * On POWER8, doorbell_request is 0. + */ + *val = get_reg_val(id, vcpu->arch.vcore->dpdes | + vcpu->arch.doorbell_request); break; case KVM_REG_PPC_VTB: *val = get_reg_val(id, vcpu->arch.vcore->vtb); @@ -2860,7 +2867,7 @@ static void collect_piggybacks(struct core_info *cip, int target_threads) if (!spin_trylock(&pvc->lock)) continue; prepare_threads(pvc); - if (!pvc->n_runnable) { + if (!pvc->n_runnable || !pvc->kvm->arch.mmu_ready) { list_del_init(&pvc->preempt_list); if (pvc->runner == NULL) { pvc->vcore_state = VCORE_INACTIVE; @@ -2881,15 +2888,20 @@ static void collect_piggybacks(struct core_info *cip, int target_threads) spin_unlock(&lp->lock); } -static bool recheck_signals(struct core_info *cip) +static bool recheck_signals_and_mmu(struct core_info *cip) { int sub, i; struct kvm_vcpu *vcpu; + struct kvmppc_vcore *vc; - for (sub = 0; sub < cip->n_subcores; ++sub) - for_each_runnable_thread(i, vcpu, cip->vc[sub]) + for (sub = 0; sub < cip->n_subcores; ++sub) { + vc = cip->vc[sub]; + if (!vc->kvm->arch.mmu_ready) + return true; + for_each_runnable_thread(i, vcpu, vc) if (signal_pending(vcpu->arch.run_task)) return true; + } return false; } @@ -3119,7 +3131,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) local_irq_disable(); hard_irq_disable(); if (lazy_irq_pending() || need_resched() || - recheck_signals(&core_info) || !vc->kvm->arch.mmu_ready) { + recheck_signals_and_mmu(&core_info)) { local_irq_enable(); vc->vcore_state = VCORE_INACTIVE; /* Unlock all except the primary vcore */ diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 63e0ce91e29d..7186c65c61c9 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -99,7 +99,7 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, } else { rev->forw = rev->back = pte_index; *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | - pte_index | KVMPPC_RMAP_PRESENT; + pte_index | KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_HPT; } unlock_rmap(rmap); } diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 337e64468d78..07181d0dfcb7 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -942,6 +942,8 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) ld r11, VCPU_XIVE_SAVED_STATE(r4) li r9, TM_QW1_OS lwz r8, VCPU_XIVE_CAM_WORD(r4) + cmpwi r8, 0 + beq no_xive li r7, TM_QW1_OS + TM_WORD2 mfmsr r0 andi. r0, r0, MSR_DR /* in real mode? */ @@ -2831,29 +2833,39 @@ kvm_cede_prodded: kvm_cede_exit: ld r9, HSTATE_KVM_VCPU(r13) #ifdef CONFIG_KVM_XICS - /* Abort if we still have a pending escalation */ + /* are we using XIVE with single escalation? */ + ld r10, VCPU_XIVE_ESC_VADDR(r9) + cmpdi r10, 0 + beq 3f + li r6, XIVE_ESB_SET_PQ_00 + /* + * If we still have a pending escalation, abort the cede, + * and we must set PQ to 10 rather than 00 so that we don't + * potentially end up with two entries for the escalation + * interrupt in the XIVE interrupt queue. In that case + * we also don't want to set xive_esc_on to 1 here in + * case we race with xive_esc_irq(). + */ lbz r5, VCPU_XIVE_ESC_ON(r9) cmpwi r5, 0 - beq 1f + beq 4f li r0, 0 stb r0, VCPU_CEDED(r9) -1: /* Enable XIVE escalation */ - li r5, XIVE_ESB_SET_PQ_00 + li r6, XIVE_ESB_SET_PQ_10 + b 5f +4: li r0, 1 + stb r0, VCPU_XIVE_ESC_ON(r9) + /* make sure store to xive_esc_on is seen before xive_esc_irq runs */ + sync +5: /* Enable XIVE escalation */ mfmsr r0 andi. r0, r0, MSR_DR /* in real mode? */ beq 1f - ld r10, VCPU_XIVE_ESC_VADDR(r9) - cmpdi r10, 0 - beq 3f - ldx r0, r10, r5 + ldx r0, r10, r6 b 2f 1: ld r10, VCPU_XIVE_ESC_RADDR(r9) - cmpdi r10, 0 - beq 3f - ldcix r0, r10, r5 + ldcix r0, r10, r6 2: sync - li r0, 1 - stb r0, VCPU_XIVE_ESC_ON(r9) #endif /* CONFIG_KVM_XICS */ 3: b guest_exit_cont diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index e3ba67095895..591bfb4bfd0f 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -67,8 +67,14 @@ void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt; u64 pq; - if (!tima) + /* + * Nothing to do if the platform doesn't have a XIVE + * or this vCPU doesn't have its own XIVE context + * (e.g. because it's not using an in-kernel interrupt controller). + */ + if (!tima || !vcpu->arch.xive_cam_word) return; + eieio(); __raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS); __raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2); @@ -160,6 +166,9 @@ static irqreturn_t xive_esc_irq(int irq, void *data) */ vcpu->arch.xive_esc_on = false; + /* This orders xive_esc_on = false vs. subsequent stale_p = true */ + smp_wmb(); /* goes with smp_mb() in cleanup_single_escalation */ + return IRQ_HANDLED; } @@ -1113,6 +1122,31 @@ void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu) vcpu->arch.xive_esc_raddr = 0; } +/* + * In single escalation mode, the escalation interrupt is marked so + * that EOI doesn't re-enable it, but just sets the stale_p flag to + * indicate that the P bit has already been dealt with. However, the + * assembly code that enters the guest sets PQ to 00 without clearing + * stale_p (because it has no easy way to address it). Hence we have + * to adjust stale_p before shutting down the interrupt. + */ +void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu, + struct kvmppc_xive_vcpu *xc, int irq) +{ + struct irq_data *d = irq_get_irq_data(irq); + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + + /* + * This slightly odd sequence gives the right result + * (i.e. stale_p set if xive_esc_on is false) even if + * we race with xive_esc_irq() and xive_irq_eoi(). + */ + xd->stale_p = false; + smp_mb(); /* paired with smb_wmb in xive_esc_irq */ + if (!vcpu->arch.xive_esc_on) + xd->stale_p = true; +} + void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) { struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; @@ -1134,20 +1168,28 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) /* Mask the VP IPI */ xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_01); - /* Disable the VP */ - xive_native_disable_vp(xc->vp_id); - - /* Free the queues & associated interrupts */ + /* Free escalations */ for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { - struct xive_q *q = &xc->queues[i]; - - /* Free the escalation irq */ if (xc->esc_virq[i]) { + if (xc->xive->single_escalation) + xive_cleanup_single_escalation(vcpu, xc, + xc->esc_virq[i]); free_irq(xc->esc_virq[i], vcpu); irq_dispose_mapping(xc->esc_virq[i]); kfree(xc->esc_virq_names[i]); } - /* Free the queue */ + } + + /* Disable the VP */ + xive_native_disable_vp(xc->vp_id); + + /* Clear the cam word so guest entry won't try to push context */ + vcpu->arch.xive_cam_word = 0; + + /* Free the queues */ + for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { + struct xive_q *q = &xc->queues[i]; + xive_native_disable_queue(xc->vp_id, q, i); if (q->qpage) { free_pages((unsigned long)q->qpage, diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h index 50494d0ee375..955b820ffd6d 100644 --- a/arch/powerpc/kvm/book3s_xive.h +++ b/arch/powerpc/kvm/book3s_xive.h @@ -282,6 +282,8 @@ int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio); int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio, bool single_escalation); struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type); +void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu, + struct kvmppc_xive_vcpu *xc, int irq); #endif /* CONFIG_KVM_XICS */ #endif /* _KVM_PPC_BOOK3S_XICS_H */ diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index a998823f68a3..248c1ea9e788 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -67,20 +67,28 @@ void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) xc->valid = false; kvmppc_xive_disable_vcpu_interrupts(vcpu); - /* Disable the VP */ - xive_native_disable_vp(xc->vp_id); - - /* Free the queues & associated interrupts */ + /* Free escalations */ for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { /* Free the escalation irq */ if (xc->esc_virq[i]) { + if (xc->xive->single_escalation) + xive_cleanup_single_escalation(vcpu, xc, + xc->esc_virq[i]); free_irq(xc->esc_virq[i], vcpu); irq_dispose_mapping(xc->esc_virq[i]); kfree(xc->esc_virq_names[i]); xc->esc_virq[i] = 0; } + } + + /* Disable the VP */ + xive_native_disable_vp(xc->vp_id); + + /* Clear the cam word so guest entry won't try to push context */ + vcpu->arch.xive_cam_word = 0; - /* Free the queue */ + /* Free the queues */ + for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { kvmppc_xive_native_cleanup_queue(vcpu, i); } @@ -1171,6 +1179,11 @@ int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val) return 0; } +bool kvmppc_xive_native_supported(void) +{ + return xive_native_has_queue_state_support(); +} + static int xive_native_debug_show(struct seq_file *m, void *private) { struct kvmppc_xive *xive = m->private; diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index b5a848a55504..00649ca5fa9a 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -440,6 +440,9 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_e500(struct kvm *kvm, struct kvm_vcpu *vcpu; int err; + BUILD_BUG_ON_MSG(offsetof(struct kvmppc_vcpu_e500, vcpu) != 0, + "struct kvm_vcpu must be at offset 0 for arch usercopy region"); + vcpu_e500 = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); if (!vcpu_e500) { err = -ENOMEM; diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index bb4d09c1ad56..6fca38ca791f 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -271,6 +271,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) */ if (inst == KVMPPC_INST_SW_BREAKPOINT) { run->exit_reason = KVM_EXIT_DEBUG; + run->debug.arch.status = 0; run->debug.arch.address = kvmppc_get_pc(vcpu); emulated = EMULATE_EXIT_USER; advance = 0; diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c index 9208c82ed08d..2e496eb86e94 100644 --- a/arch/powerpc/kvm/emulate_loadstore.c +++ b/arch/powerpc/kvm/emulate_loadstore.c @@ -89,12 +89,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) rs = get_rs(inst); rt = get_rt(inst); - /* - * if mmio_vsx_tx_sx_enabled == 0, copy data between - * VSR[0..31] and memory - * if mmio_vsx_tx_sx_enabled == 1, copy data between - * VSR[32..63] and memory - */ vcpu->arch.mmio_vsx_copy_nums = 0; vcpu->arch.mmio_vsx_offset = 0; vcpu->arch.mmio_copy_type = KVMPPC_VSX_COPY_NONE; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 3e566c2e6066..3a77bb643452 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -561,7 +561,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) * a POWER9 processor) and the PowerNV platform, as * nested is not yet supported. */ - r = xive_enabled() && !!cpu_has_feature(CPU_FTR_HVMODE); + r = xive_enabled() && !!cpu_has_feature(CPU_FTR_HVMODE) && + kvmppc_xive_native_supported(); break; #endif diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 1cdb39575eae..be86fce1a84e 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -135,7 +135,7 @@ static u32 xive_read_eq(struct xive_q *q, bool just_peek) static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek) { u32 irq = 0; - u8 prio; + u8 prio = 0; /* Find highest pending priority */ while (xc->pending_prio != 0) { @@ -148,8 +148,19 @@ static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek) irq = xive_read_eq(&xc->queue[prio], just_peek); /* Found something ? That's it */ - if (irq) - break; + if (irq) { + if (just_peek || irq_to_desc(irq)) + break; + /* + * We should never get here; if we do then we must + * have failed to synchronize the interrupt properly + * when shutting it down. + */ + pr_crit("xive: got interrupt %d without descriptor, dropping\n", + irq); + WARN_ON(1); + continue; + } /* Clear pending bits */ xc->pending_prio &= ~(1 << prio); @@ -307,6 +318,7 @@ static void xive_do_queue_eoi(struct xive_cpu *xc) */ static void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd) { + xd->stale_p = false; /* If the XIVE supports the new "store EOI facility, use it */ if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI) xive_esb_write(xd, XIVE_ESB_STORE_EOI, 0); @@ -350,7 +362,7 @@ static void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd) } } -/* irq_chip eoi callback */ +/* irq_chip eoi callback, called with irq descriptor lock held */ static void xive_irq_eoi(struct irq_data *d) { struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); @@ -366,6 +378,8 @@ static void xive_irq_eoi(struct irq_data *d) if (!irqd_irq_disabled(d) && !irqd_is_forwarded_to_vcpu(d) && !(xd->flags & XIVE_IRQ_NO_EOI)) xive_do_source_eoi(irqd_to_hwirq(d), xd); + else + xd->stale_p = true; /* * Clear saved_p to indicate that it's no longer occupying @@ -397,11 +411,16 @@ static void xive_do_source_set_mask(struct xive_irq_data *xd, */ if (mask) { val = xive_esb_read(xd, XIVE_ESB_SET_PQ_01); - xd->saved_p = !!(val & XIVE_ESB_VAL_P); - } else if (xd->saved_p) + if (!xd->stale_p && !!(val & XIVE_ESB_VAL_P)) + xd->saved_p = true; + xd->stale_p = false; + } else if (xd->saved_p) { xive_esb_read(xd, XIVE_ESB_SET_PQ_10); - else + xd->saved_p = false; + } else { xive_esb_read(xd, XIVE_ESB_SET_PQ_00); + xd->stale_p = false; + } } /* @@ -541,6 +560,8 @@ static unsigned int xive_irq_startup(struct irq_data *d) unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); int target, rc; + xd->saved_p = false; + xd->stale_p = false; pr_devel("xive_irq_startup: irq %d [0x%x] data @%p\n", d->irq, hw_irq, d); @@ -587,6 +608,7 @@ static unsigned int xive_irq_startup(struct irq_data *d) return 0; } +/* called with irq descriptor lock held */ static void xive_irq_shutdown(struct irq_data *d) { struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); @@ -602,16 +624,6 @@ static void xive_irq_shutdown(struct irq_data *d) xive_do_source_set_mask(xd, true); /* - * The above may have set saved_p. We clear it otherwise it - * will prevent re-enabling later on. It is ok to forget the - * fact that the interrupt might be in a queue because we are - * accounting that already in xive_dec_target_count() and will - * be re-routing it to a new queue with proper accounting when - * it's started up again - */ - xd->saved_p = false; - - /* * Mask the interrupt in HW in the IVT/EAS and set the number * to be the "bad" IRQ number */ @@ -797,6 +809,10 @@ static int xive_irq_retrigger(struct irq_data *d) return 1; } +/* + * Caller holds the irq descriptor lock, so this won't be called + * concurrently with xive_get_irqchip_state on the same interrupt. + */ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) { struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); @@ -820,6 +836,10 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) /* Set it to PQ=10 state to prevent further sends */ pq = xive_esb_read(xd, XIVE_ESB_SET_PQ_10); + if (!xd->stale_p) { + xd->saved_p = !!(pq & XIVE_ESB_VAL_P); + xd->stale_p = !xd->saved_p; + } /* No target ? nothing to do */ if (xd->target == XIVE_INVALID_TARGET) { @@ -827,7 +847,7 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) * An untargetted interrupt should have been * also masked at the source */ - WARN_ON(pq & 2); + WARN_ON(xd->saved_p); return 0; } @@ -847,9 +867,8 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) * This saved_p is cleared by the host EOI, when we know * for sure the queue slot is no longer in use. */ - if (pq & 2) { - pq = xive_esb_read(xd, XIVE_ESB_SET_PQ_11); - xd->saved_p = true; + if (xd->saved_p) { + xive_esb_read(xd, XIVE_ESB_SET_PQ_11); /* * Sync the XIVE source HW to ensure the interrupt @@ -862,8 +881,7 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) */ if (xive_ops->sync_source) xive_ops->sync_source(hw_irq); - } else - xd->saved_p = false; + } } else { irqd_clr_forwarded_to_vcpu(d); @@ -914,6 +932,23 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) return 0; } +/* Called with irq descriptor lock held. */ +static int xive_get_irqchip_state(struct irq_data *data, + enum irqchip_irq_state which, bool *state) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(data); + + switch (which) { + case IRQCHIP_STATE_ACTIVE: + *state = !xd->stale_p && + (xd->saved_p || + !!(xive_esb_read(xd, XIVE_ESB_GET) & XIVE_ESB_VAL_P)); + return 0; + default: + return -EINVAL; + } +} + static struct irq_chip xive_irq_chip = { .name = "XIVE-IRQ", .irq_startup = xive_irq_startup, @@ -925,6 +960,7 @@ static struct irq_chip xive_irq_chip = { .irq_set_type = xive_irq_set_type, .irq_retrigger = xive_irq_retrigger, .irq_set_vcpu_affinity = xive_irq_set_vcpu_affinity, + .irq_get_irqchip_state = xive_get_irqchip_state, }; bool is_xive_irq(struct irq_chip *chip) @@ -1338,6 +1374,11 @@ static void xive_flush_cpu_queue(unsigned int cpu, struct xive_cpu *xc) xd = irq_desc_get_handler_data(desc); /* + * Clear saved_p to indicate that it's no longer pending + */ + xd->saved_p = false; + + /* * For LSIs, we EOI, this will cause a resend if it's * still asserted. Otherwise do an MSI retrigger. */ diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index 2f26b74f6cfa..37987c815913 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -800,6 +800,13 @@ int xive_native_set_queue_state(u32 vp_id, u32 prio, u32 qtoggle, u32 qindex) } EXPORT_SYMBOL_GPL(xive_native_set_queue_state); +bool xive_native_has_queue_state_support(void) +{ + return opal_check_token(OPAL_XIVE_GET_QUEUE_STATE) && + opal_check_token(OPAL_XIVE_SET_QUEUE_STATE); +} +EXPORT_SYMBOL_GPL(xive_native_has_queue_state_support); + int xive_native_get_vp_state(u32 vp_id, u64 *out_state) { __be64 state; diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index feab24cac610..77cf6c11f66b 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -229,7 +229,7 @@ struct x86_emulate_ops { int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, const char *smstate); void (*post_leave_smm)(struct x86_emulate_ctxt *ctxt); - + int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr); }; typedef u32 __attribute__((vector_size(16))) sse128_t; @@ -429,6 +429,7 @@ enum x86_intercept { x86_intercept_ins, x86_intercept_out, x86_intercept_outs, + x86_intercept_xsetbv, nr_x86_intercepts }; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 74e88e5edd9c..5b14aa1fbeeb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -717,7 +717,7 @@ struct kvm_vcpu_arch { /* Cache MMIO info */ u64 mmio_gva; - unsigned access; + unsigned mmio_access; gfn_t mmio_gfn; u64 mmio_gen; @@ -1070,7 +1070,7 @@ struct kvm_x86_ops { void (*run)(struct kvm_vcpu *vcpu); int (*handle_exit)(struct kvm_vcpu *vcpu); - void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); + int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu); void (*patch_hypercall)(struct kvm_vcpu *vcpu, @@ -1581,6 +1581,13 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, struct kvm_lapic_irq *irq); +static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq) +{ + /* We can only post Fixed and LowPrio IRQs */ + return (irq->delivery_mode == dest_Fixed || + irq->delivery_mode == dest_LowestPrio); +} + static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) { if (kvm_x86_ops->vcpu_blocking) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 22c2720cd948..dd5985eb61b4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -392,6 +392,12 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) entry->edx &= kvm_cpuid_7_0_edx_x86_features; cpuid_mask(&entry->edx, CPUID_7_EDX); + if (boot_cpu_has(X86_FEATURE_IBPB) && boot_cpu_has(X86_FEATURE_IBRS)) + entry->edx |= F(SPEC_CTRL); + if (boot_cpu_has(X86_FEATURE_STIBP)) + entry->edx |= F(INTEL_STIBP); + if (boot_cpu_has(X86_FEATURE_SSBD)) + entry->edx |= F(SPEC_CTRL_SSBD); /* * We emulate ARCH_CAPABILITIES in software even * if the host doesn't support it. @@ -729,18 +735,23 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, g_phys_as = phys_as; entry->eax = g_phys_as | (virt_as << 8); entry->edx = 0; + entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; + cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); /* - * IBRS, IBPB and VIRT_SSBD aren't necessarily present in - * hardware cpuid + * AMD has separate bits for each SPEC_CTRL bit. + * arch/x86/kernel/cpu/bugs.c is kind enough to + * record that in cpufeatures so use them. */ - if (boot_cpu_has(X86_FEATURE_AMD_IBPB)) + if (boot_cpu_has(X86_FEATURE_IBPB)) entry->ebx |= F(AMD_IBPB); - if (boot_cpu_has(X86_FEATURE_AMD_IBRS)) + if (boot_cpu_has(X86_FEATURE_IBRS)) entry->ebx |= F(AMD_IBRS); - if (boot_cpu_has(X86_FEATURE_VIRT_SSBD)) - entry->ebx |= F(VIRT_SSBD); - entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; - cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); + if (boot_cpu_has(X86_FEATURE_STIBP)) + entry->ebx |= F(AMD_STIBP); + if (boot_cpu_has(X86_FEATURE_SSBD)) + entry->ebx |= F(AMD_SSBD); + if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + entry->ebx |= F(AMD_SSB_NO); /* * The preference is to use SPEC CTRL MSR instead of the * VIRT_SPEC MSR. diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 718f7d9afedc..bef3c3c695d7 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4156,6 +4156,20 @@ out: return rc; } +static int em_xsetbv(struct x86_emulate_ctxt *ctxt) +{ + u32 eax, ecx, edx; + + eax = reg_read(ctxt, VCPU_REGS_RAX); + edx = reg_read(ctxt, VCPU_REGS_RDX); + ecx = reg_read(ctxt, VCPU_REGS_RCX); + + if (ctxt->ops->set_xcr(ctxt, ecx, ((u64)edx << 32) | eax)) + return emulate_gp(ctxt, 0); + + return X86EMUL_CONTINUE; +} + static bool valid_cr(int nr) { switch (nr) { @@ -4409,6 +4423,12 @@ static const struct opcode group7_rm1[] = { N, N, N, N, N, N, }; +static const struct opcode group7_rm2[] = { + N, + II(ImplicitOps | Priv, em_xsetbv, xsetbv), + N, N, N, N, N, N, +}; + static const struct opcode group7_rm3[] = { DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa), II(SrcNone | Prot | EmulateOnUD, em_hypercall, vmmcall), @@ -4498,7 +4518,8 @@ static const struct group_dual group7 = { { }, { EXT(0, group7_rm0), EXT(0, group7_rm1), - N, EXT(0, group7_rm3), + EXT(0, group7_rm2), + EXT(0, group7_rm3), II(SrcNone | DstMem | Mov, em_smsw, smsw), N, II(SrcMem16 | Mov | Priv, em_lmsw, lmsw), EXT(0, group7_rm7), @@ -5144,7 +5165,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) else { rc = __do_insn_fetch_bytes(ctxt, 1); if (rc != X86EMUL_CONTINUE) - return rc; + goto done; } switch (mode) { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 685d17c11461..e904ff06a83d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -216,6 +216,9 @@ static void recalculate_apic_map(struct kvm *kvm) if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id]) new->phys_map[xapic_id] = apic; + if (!kvm_apic_sw_enabled(apic)) + continue; + ldr = kvm_lapic_get_reg(apic, APIC_LDR); if (apic_x2apic_mode(apic)) { @@ -258,6 +261,8 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) static_key_slow_dec_deferred(&apic_sw_disabled); else static_key_slow_inc(&apic_sw_disabled.key); + + recalculate_apic_map(apic->vcpu->kvm); } } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 24843cf49579..4c45ff0cfbd0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -214,6 +214,7 @@ static u64 __read_mostly shadow_accessed_mask; static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; static u64 __read_mostly shadow_mmio_value; +static u64 __read_mostly shadow_mmio_access_mask; static u64 __read_mostly shadow_present_mask; static u64 __read_mostly shadow_me_mask; @@ -299,14 +300,21 @@ static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, kvm_flush_remote_tlbs_with_range(kvm, &range); } -void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value) +void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask) { + BUG_ON((u64)(unsigned)access_mask != access_mask); BUG_ON((mmio_mask & mmio_value) != mmio_value); shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK; shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK; + shadow_mmio_access_mask = access_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); +static bool is_mmio_spte(u64 spte) +{ + return (spte & shadow_mmio_mask) == shadow_mmio_value; +} + static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) { return sp->role.ad_disabled; @@ -314,19 +322,19 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) static inline bool spte_ad_enabled(u64 spte) { - MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); + MMU_WARN_ON(is_mmio_spte(spte)); return !(spte & shadow_acc_track_value); } static inline u64 spte_shadow_accessed_mask(u64 spte) { - MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); + MMU_WARN_ON(is_mmio_spte(spte)); return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; } static inline u64 spte_shadow_dirty_mask(u64 spte) { - MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); + MMU_WARN_ON(is_mmio_spte(spte)); return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; } @@ -389,7 +397,7 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, u64 mask = generation_mmio_spte_mask(gen); u64 gpa = gfn << PAGE_SHIFT; - access &= ACC_WRITE_MASK | ACC_USER_MASK; + access &= shadow_mmio_access_mask; mask |= shadow_mmio_value | access; mask |= gpa | shadow_nonpresent_or_rsvd_mask; mask |= (gpa & shadow_nonpresent_or_rsvd_mask) @@ -401,11 +409,6 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, mmu_spte_set(sptep, mask); } -static bool is_mmio_spte(u64 spte) -{ - return (spte & shadow_mmio_mask) == shadow_mmio_value; -} - static gfn_t get_mmio_spte_gfn(u64 spte) { u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask; @@ -418,8 +421,7 @@ static gfn_t get_mmio_spte_gfn(u64 spte) static unsigned get_mmio_spte_access(u64 spte) { - u64 mask = generation_mmio_spte_mask(MMIO_SPTE_GEN_MASK) | shadow_mmio_mask; - return (spte & ~mask) & ~PAGE_MASK; + return spte & shadow_mmio_access_mask; } static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, @@ -3290,7 +3292,8 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, } if (unlikely(is_noslot_pfn(pfn))) - vcpu_cache_mmio_info(vcpu, gva, gfn, access); + vcpu_cache_mmio_info(vcpu, gva, gfn, + access & shadow_mmio_access_mask); return false; } @@ -5653,38 +5656,7 @@ static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, struct kvm_page_track_notifier_node *node) { - struct kvm_mmu_page *sp; - LIST_HEAD(invalid_list); - unsigned long i; - bool flush; - gfn_t gfn; - - spin_lock(&kvm->mmu_lock); - - if (list_empty(&kvm->arch.active_mmu_pages)) - goto out_unlock; - - flush = slot_handle_all_level(kvm, slot, kvm_zap_rmapp, false); - - for (i = 0; i < slot->npages; i++) { - gfn = slot->base_gfn + i; - - for_each_valid_sp(kvm, sp, gfn) { - if (sp->gfn != gfn) - continue; - - kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - } - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { - kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); - flush = false; - cond_resched_lock(&kvm->mmu_lock); - } - } - kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); - -out_unlock: - spin_unlock(&kvm->mmu_lock); + kvm_mmu_zap_all(kvm); } void kvm_mmu_init_vm(struct kvm *kvm) @@ -6028,7 +6000,7 @@ static void kvm_set_mmio_spte_mask(void) if (IS_ENABLED(CONFIG_X86_64) && shadow_phys_bits == 52) mask &= ~1ull; - kvm_mmu_set_mmio_spte_mask(mask, mask); + kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK); } int kvm_mmu_module_init(void) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 54c2a377795b..11f8ec89433b 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -51,7 +51,7 @@ static inline u64 rsvd_bits(int s, int e) return ((1ULL << (e - s + 1)) - 1) << s; } -void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value); +void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask); void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d685491fce4d..f5b03d0c9bc6 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -68,10 +68,8 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); #define SEG_TYPE_LDT 2 #define SEG_TYPE_BUSY_TSS16 3 -#define SVM_FEATURE_NPT (1 << 0) #define SVM_FEATURE_LBRV (1 << 1) #define SVM_FEATURE_SVML (1 << 2) -#define SVM_FEATURE_NRIP (1 << 3) #define SVM_FEATURE_TSC_RATE (1 << 4) #define SVM_FEATURE_VMCB_CLEAN (1 << 5) #define SVM_FEATURE_FLUSH_ASID (1 << 6) @@ -770,7 +768,7 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) } -static void skip_emulated_instruction(struct kvm_vcpu *vcpu) +static int skip_emulated_instruction(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -779,18 +777,17 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) svm->next_rip = svm->vmcb->control.next_rip; } - if (!svm->next_rip) { - if (kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) != - EMULATE_DONE) - printk(KERN_DEBUG "%s: NOP\n", __func__); - return; - } + if (!svm->next_rip) + return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP); + if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n", __func__, kvm_rip_read(vcpu), svm->next_rip); kvm_rip_write(vcpu, svm->next_rip); svm_set_interrupt_shadow(vcpu, 0); + + return EMULATE_DONE; } static void svm_queue_exception(struct kvm_vcpu *vcpu) @@ -821,7 +818,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu) * raises a fault that is not intercepted. Still better than * failing in all cases. */ - skip_emulated_instruction(&svm->vcpu); + (void)skip_emulated_instruction(&svm->vcpu); rip = kvm_rip_read(&svm->vcpu); svm->int3_rip = rip + svm->vmcb->save.cs.base; svm->int3_injected = rip - old_rip; @@ -1714,7 +1711,6 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) if (!entry) return -EINVAL; - new_entry = READ_ONCE(*entry); new_entry = __sme_set((page_to_phys(svm->avic_backing_page) & AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) | AVIC_PHYSICAL_ID_ENTRY_VALID_MASK); @@ -2137,6 +2133,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) struct page *nested_msrpm_pages; int err; + BUILD_BUG_ON_MSG(offsetof(struct vcpu_svm, vcpu) != 0, + "struct kvm_vcpu must be at offset 0 for arch usercopy region"); + svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); if (!svm) { err = -ENOMEM; @@ -2904,13 +2903,11 @@ static int nop_on_interception(struct vcpu_svm *svm) static int halt_interception(struct vcpu_svm *svm) { - svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; return kvm_emulate_halt(&svm->vcpu); } static int vmmcall_interception(struct vcpu_svm *svm) { - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; return kvm_emulate_hypercall(&svm->vcpu); } @@ -3589,9 +3586,9 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, mark_all_dirty(svm->vmcb); } -static bool nested_svm_vmrun(struct vcpu_svm *svm) +static int nested_svm_vmrun(struct vcpu_svm *svm) { - int rc; + int ret; struct vmcb *nested_vmcb; struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; @@ -3600,13 +3597,16 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) vmcb_gpa = svm->vmcb->save.rax; - rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map); - if (rc) { - if (rc == -EINVAL) - kvm_inject_gp(&svm->vcpu, 0); - return false; + ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map); + if (ret == EINVAL) { + kvm_inject_gp(&svm->vcpu, 0); + return 1; + } else if (ret) { + return kvm_skip_emulated_instruction(&svm->vcpu); } + ret = kvm_skip_emulated_instruction(&svm->vcpu); + nested_vmcb = map.hva; if (!nested_vmcb_checks(nested_vmcb)) { @@ -3617,7 +3617,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) kvm_vcpu_unmap(&svm->vcpu, &map, true); - return false; + return ret; } trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, @@ -3661,7 +3661,16 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, &map); - return true; + if (!nested_svm_vmrun_msrpm(svm)) { + svm->vmcb->control.exit_code = SVM_EXIT_ERR; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; + + nested_svm_vmexit(svm); + } + + return ret; } static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) @@ -3698,7 +3707,6 @@ static int vmload_interception(struct vcpu_svm *svm) nested_vmcb = map.hva; - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; ret = kvm_skip_emulated_instruction(&svm->vcpu); nested_svm_vmloadsave(nested_vmcb, svm->vmcb); @@ -3725,7 +3733,6 @@ static int vmsave_interception(struct vcpu_svm *svm) nested_vmcb = map.hva; - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; ret = kvm_skip_emulated_instruction(&svm->vcpu); nested_svm_vmloadsave(svm->vmcb, nested_vmcb); @@ -3739,27 +3746,7 @@ static int vmrun_interception(struct vcpu_svm *svm) if (nested_svm_check_permissions(svm)) return 1; - /* Save rip after vmrun instruction */ - kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3); - - if (!nested_svm_vmrun(svm)) - return 1; - - if (!nested_svm_vmrun_msrpm(svm)) - goto failed; - - return 1; - -failed: - - svm->vmcb->control.exit_code = SVM_EXIT_ERR; - svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = 0; - svm->vmcb->control.exit_info_2 = 0; - - nested_svm_vmexit(svm); - - return 1; + return nested_svm_vmrun(svm); } static int stgi_interception(struct vcpu_svm *svm) @@ -3776,7 +3763,6 @@ static int stgi_interception(struct vcpu_svm *svm) if (vgif_enabled(svm)) clr_intercept(svm, INTERCEPT_STGI); - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; ret = kvm_skip_emulated_instruction(&svm->vcpu); kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); @@ -3792,7 +3778,6 @@ static int clgi_interception(struct vcpu_svm *svm) if (nested_svm_check_permissions(svm)) return 1; - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; ret = kvm_skip_emulated_instruction(&svm->vcpu); disable_gif(svm); @@ -3817,7 +3802,6 @@ static int invlpga_interception(struct vcpu_svm *svm) /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ kvm_mmu_invlpg(vcpu, kvm_rax_read(&svm->vcpu)); - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; return kvm_skip_emulated_instruction(&svm->vcpu); } @@ -3840,7 +3824,6 @@ static int xsetbv_interception(struct vcpu_svm *svm) u32 index = kvm_rcx_read(&svm->vcpu); if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) { - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; return kvm_skip_emulated_instruction(&svm->vcpu); } @@ -3899,25 +3882,29 @@ static int task_switch_interception(struct vcpu_svm *svm) if (reason != TASK_SWITCH_GATE || int_type == SVM_EXITINTINFO_TYPE_SOFT || (int_type == SVM_EXITINTINFO_TYPE_EXEPT && - (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) - skip_emulated_instruction(&svm->vcpu); + (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) { + if (skip_emulated_instruction(&svm->vcpu) != EMULATE_DONE) + goto fail; + } if (int_type != SVM_EXITINTINFO_TYPE_SOFT) int_vec = -1; if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason, - has_error_code, error_code) == EMULATE_FAIL) { - svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - svm->vcpu.run->internal.ndata = 0; - return 0; - } + has_error_code, error_code) == EMULATE_FAIL) + goto fail; + return 1; + +fail: + svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + svm->vcpu.run->internal.ndata = 0; + return 0; } static int cpuid_interception(struct vcpu_svm *svm) { - svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; return kvm_emulate_cpuid(&svm->vcpu); } @@ -4247,7 +4234,6 @@ static int rdmsr_interception(struct vcpu_svm *svm) kvm_rax_write(&svm->vcpu, msr_info.data & 0xffffffff); kvm_rdx_write(&svm->vcpu, msr_info.data >> 32); - svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; return kvm_skip_emulated_instruction(&svm->vcpu); } } @@ -4447,7 +4433,6 @@ static int wrmsr_interception(struct vcpu_svm *svm) msr.index = ecx; msr.host_initiated = false; - svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; if (kvm_set_msr(&svm->vcpu, &msr)) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(&svm->vcpu, 0); @@ -5275,7 +5260,8 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, kvm_set_msi_irq(kvm, e, &irq); - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || + !kvm_irq_is_postable(&irq)) { pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n", __func__, irq.vector); return -1; @@ -5329,6 +5315,7 @@ static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, * 1. When cannot target interrupt to a specific vcpu. * 2. Unsetting posted interrupt. * 3. APIC virtialization is disabled for the vcpu. + * 4. IRQ has incompatible delivery mode (SMI, INIT, etc) */ if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set && kvm_vcpu_apicv_active(&svm->vcpu)) { @@ -5934,6 +5921,8 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC); } +#define F(x) bit(X86_FEATURE_##x) + static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) { switch (func) { @@ -5945,6 +5934,11 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) if (nested) entry->ecx |= (1 << 2); /* Set SVM bit */ break; + case 0x80000008: + if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || + boot_cpu_has(X86_FEATURE_AMD_SSBD)) + entry->ebx |= F(VIRT_SSBD); + break; case 0x8000000A: entry->eax = 1; /* SVM revision 1 */ entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper @@ -5955,11 +5949,11 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) /* Support next_rip if host supports it */ if (boot_cpu_has(X86_FEATURE_NRIPS)) - entry->edx |= SVM_FEATURE_NRIP; + entry->edx |= F(NRIPS); /* Support NPT for the guest if enabled */ if (npt_enabled) - entry->edx |= SVM_FEATURE_NPT; + entry->edx |= F(NPT); break; case 0x8000001F: @@ -6068,6 +6062,7 @@ static const struct __x86_intercept { [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO), [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO), [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO), + [x86_intercept_xsetbv] = PRE_EX(SVM_EXIT_XSETBV), }; #undef PRE_EX diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index b5c831e79094..6ce853a2e867 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1462,6 +1462,25 @@ TRACE_EVENT(kvm_hv_send_ipi_ex, __entry->vector, __entry->format, __entry->valid_bank_mask) ); + +TRACE_EVENT(kvm_pv_tlb_flush, + TP_PROTO(unsigned int vcpu_id, bool need_flush_tlb), + TP_ARGS(vcpu_id, need_flush_tlb), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( bool, need_flush_tlb ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->need_flush_tlb = need_flush_tlb; + ), + + TP_printk("vcpu %u need_flush_tlb %s", __entry->vcpu_id, + __entry->need_flush_tlb ? "true" : "false") +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 4010d519eb8c..751a384c2eb0 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -94,7 +94,7 @@ ENDPROC(vmx_vmexit) /** * __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode - * @vmx: struct vcpu_vmx * + * @vmx: struct vcpu_vmx * (forwarded to vmx_update_host_rsp) * @regs: unsigned long * (to guest registers) * @launched: %true if the VMCS has been launched * @@ -151,7 +151,7 @@ ENTRY(__vmx_vcpu_run) mov VCPU_R14(%_ASM_AX), %r14 mov VCPU_R15(%_ASM_AX), %r15 #endif - /* Load guest RAX. This kills the vmx_vcpu pointer! */ + /* Load guest RAX. This kills the @regs pointer! */ mov VCPU_RAX(%_ASM_AX), %_ASM_AX /* Enter guest mode */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 42ed3faa6af8..63f3d88b36cc 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1472,8 +1472,11 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) return 0; } - -static void skip_emulated_instruction(struct kvm_vcpu *vcpu) +/* + * Returns an int to be compatible with SVM implementation (which can fail). + * Do not use directly, use skip_emulated_instruction() instead. + */ +static int __skip_emulated_instruction(struct kvm_vcpu *vcpu) { unsigned long rip; @@ -1483,6 +1486,13 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) /* skipping an emulated instruction also counts */ vmx_set_interrupt_shadow(vcpu, 0); + + return EMULATE_DONE; +} + +static inline void skip_emulated_instruction(struct kvm_vcpu *vcpu) +{ + (void)__skip_emulated_instruction(vcpu); } static void vmx_clear_hlt(struct kvm_vcpu *vcpu) @@ -4026,7 +4036,7 @@ static void ept_set_mmio_spte_mask(void) * of an EPT paging-structure entry is 110b (write/execute). */ kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK, - VMX_EPT_MISCONFIG_WX_VALUE); + VMX_EPT_MISCONFIG_WX_VALUE, 0); } #define VMX_XSS_EXIT_BITMAP 0 @@ -6615,6 +6625,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) unsigned long *msr_bitmap; int cpu; + BUILD_BUG_ON_MSG(offsetof(struct vcpu_vmx, vcpu) != 0, + "struct kvm_vcpu must be at offset 0 for arch usercopy region"); + vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); if (!vmx) return ERR_PTR(-ENOMEM); @@ -7369,10 +7382,14 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, * irqbalance to make the interrupts single-CPU. * * We will support full lowest-priority interrupt later. + * + * In addition, we can only inject generic interrupts using + * the PI mechanism, refuse to route others through it. */ kvm_set_msi_irq(kvm, e, &irq); - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || + !kvm_irq_is_postable(&irq)) { /* * Make sure the IRTE is in remapped mode if * we don't handle it in posted mode. @@ -7705,7 +7722,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .run = vmx_vcpu_run, .handle_exit = vmx_handle_exit, - .skip_emulated_instruction = skip_emulated_instruction, + .skip_emulated_instruction = __skip_emulated_instruction, .set_interrupt_shadow = vmx_set_interrupt_shadow, .get_interrupt_shadow = vmx_get_interrupt_shadow, .patch_hypercall = vmx_patch_hypercall, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 93b0bd45ac73..605f9902f164 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -674,8 +674,14 @@ static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, data, offset, len, access); } +static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) +{ + return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) | + rsvd_bits(1, 2); +} + /* - * Load the pae pdptrs. Return true is they are all valid. + * Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise. */ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) { @@ -694,8 +700,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) } for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { if ((pdpte[i] & PT_PRESENT_MASK) && - (pdpte[i] & - vcpu->arch.mmu->guest_rsvd_check.rsvd_bits_mask[0][2])) { + (pdpte[i] & pdptr_rsvd_bits(vcpu))) { ret = 0; goto out; } @@ -1254,6 +1259,13 @@ static u64 kvm_get_arch_capabilities(void) if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER) data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH; + if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + data |= ARCH_CAP_RDCL_NO; + if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + data |= ARCH_CAP_SSB_NO; + if (!boot_cpu_has_bug(X86_BUG_MDS)) + data |= ARCH_CAP_MDS_NO; + return data; } @@ -2452,6 +2464,8 @@ static void record_steal_time(struct kvm_vcpu *vcpu) * Doing a TLB flush here, on the guest's behalf, can avoid * expensive IPIs. */ + trace_kvm_pv_tlb_flush(vcpu->vcpu_id, + vcpu->arch.st.steal.preempted & KVM_VCPU_FLUSH_TLB); if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB) kvm_vcpu_flush_tlb(vcpu, false); @@ -3506,8 +3520,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; - if (kvm_x86_ops->setup_mce) - kvm_x86_ops->setup_mce(vcpu); + kvm_x86_ops->setup_mce(vcpu); out: return r; } @@ -5370,7 +5383,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, */ if (vcpu_match_mmio_gva(vcpu, gva) && !permission_fault(vcpu, vcpu->arch.walk_mmu, - vcpu->arch.access, 0, access)) { + vcpu->arch.mmio_access, 0, access)) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); trace_vcpu_match_mmio(gva, *gpa, write, false); @@ -6068,6 +6081,11 @@ static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt) kvm_smm_changed(emul_to_vcpu(ctxt)); } +static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr) +{ + return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr); +} + static const struct x86_emulate_ops emulate_ops = { .read_gpr = emulator_read_gpr, .write_gpr = emulator_write_gpr, @@ -6109,6 +6127,7 @@ static const struct x86_emulate_ops emulate_ops = { .set_hflags = emulator_set_hflags, .pre_leave_smm = emulator_pre_leave_smm, .post_leave_smm = emulator_post_leave_smm, + .set_xcr = emulator_set_xcr, }; static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) @@ -6383,9 +6402,11 @@ static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r) int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) { unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); - int r = EMULATE_DONE; + int r; - kvm_x86_ops->skip_emulated_instruction(vcpu); + r = kvm_x86_ops->skip_emulated_instruction(vcpu); + if (unlikely(r != EMULATE_DONE)) + return 0; /* * rflags is the old, "raw" value of the flags. The new value has @@ -6537,6 +6558,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, kvm_rip_write(vcpu, ctxt->_eip); if (ctxt->eflags & X86_EFLAGS_RF) kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF); + kvm_x86_ops->set_interrupt_shadow(vcpu, 0); return EMULATE_DONE; } @@ -9314,10 +9336,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_page_track_init(kvm); kvm_mmu_init_vm(kvm); - if (kvm_x86_ops->vm_init) - return kvm_x86_ops->vm_init(kvm); - - return 0; + return kvm_x86_ops->vm_init(kvm); } static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) @@ -10009,7 +10028,7 @@ EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); bool kvm_arch_has_irq_bypass(void) { - return kvm_x86_ops->update_pi_irte != NULL; + return true; } int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, @@ -10049,9 +10068,6 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set) { - if (!kvm_x86_ops->update_pi_irte) - return -EINVAL; - return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set); } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 6594020c0691..b5274e2a53cf 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -196,7 +196,7 @@ static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, * actually a nGPA. */ vcpu->arch.mmio_gva = mmu_is_nested(vcpu) ? 0 : gva & PAGE_MASK; - vcpu->arch.access = access; + vcpu->arch.mmio_access = access; vcpu->arch.mmio_gfn = gfn; vcpu->arch.mmio_gen = gen; } diff --git a/tools/testing/selftests/kvm/include/evmcs.h b/tools/testing/selftests/kvm/include/evmcs.h index 4059014d93ea..4912d23844bc 100644 --- a/tools/testing/selftests/kvm/include/evmcs.h +++ b/tools/testing/selftests/kvm/include/evmcs.h @@ -220,6 +220,8 @@ struct hv_enlightened_vmcs { struct hv_enlightened_vmcs *current_evmcs; struct hv_vp_assist_page *current_vp_assist; +int vcpu_enable_evmcs(struct kvm_vm *vm, int vcpu_id); + static inline int enable_vp_assist(uint64_t vp_assist_pa, void *vp_assist) { u64 val = (vp_assist_pa & HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK) | diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 6cb34a0fa200..0a5e487dbc50 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1060,9 +1060,11 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i", r); - r = ioctl(vcpu->fd, KVM_GET_XCRS, &state->xcrs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XCRS, r: %i", - r); + if (kvm_check_cap(KVM_CAP_XCRS)) { + r = ioctl(vcpu->fd, KVM_GET_XCRS, &state->xcrs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XCRS, r: %i", + r); + } r = ioctl(vcpu->fd, KVM_GET_SREGS, &state->sregs); TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i", @@ -1103,9 +1105,11 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *s TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i", r); - r = ioctl(vcpu->fd, KVM_SET_XCRS, &state->xcrs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XCRS, r: %i", - r); + if (kvm_check_cap(KVM_CAP_XCRS)) { + r = ioctl(vcpu->fd, KVM_SET_XCRS, &state->xcrs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XCRS, r: %i", + r); + } r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs); TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i", diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c index 204f847bd065..9cef0455b819 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c @@ -12,6 +12,26 @@ bool enable_evmcs; +int vcpu_enable_evmcs(struct kvm_vm *vm, int vcpu_id) +{ + uint16_t evmcs_ver; + + struct kvm_enable_cap enable_evmcs_cap = { + .cap = KVM_CAP_HYPERV_ENLIGHTENED_VMCS, + .args[0] = (unsigned long)&evmcs_ver + }; + + vcpu_ioctl(vm, vcpu_id, KVM_ENABLE_CAP, &enable_evmcs_cap); + + /* KVM should return supported EVMCS version range */ + TEST_ASSERT(((evmcs_ver >> 8) >= (evmcs_ver & 0xff)) && + (evmcs_ver & 0xff) > 0, + "Incorrect EVMCS version range: %x:%x\n", + evmcs_ver & 0xff, evmcs_ver >> 8); + + return evmcs_ver; +} + /* Allocate memory regions for nested VMX tests. * * Input Args: diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index f95c08343b48..92915e6408e7 100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -79,11 +79,6 @@ int main(int argc, char *argv[]) struct kvm_x86_state *state; struct ucall uc; int stage; - uint16_t evmcs_ver; - struct kvm_enable_cap enable_evmcs_cap = { - .cap = KVM_CAP_HYPERV_ENLIGHTENED_VMCS, - .args[0] = (unsigned long)&evmcs_ver - }; /* Create VM */ vm = vm_create_default(VCPU_ID, 0, guest_code); @@ -96,13 +91,7 @@ int main(int argc, char *argv[]) exit(KSFT_SKIP); } - vcpu_ioctl(vm, VCPU_ID, KVM_ENABLE_CAP, &enable_evmcs_cap); - - /* KVM should return supported EVMCS version range */ - TEST_ASSERT(((evmcs_ver >> 8) >= (evmcs_ver & 0xff)) && - (evmcs_ver & 0xff) > 0, - "Incorrect EVMCS version range: %x:%x\n", - evmcs_ver & 0xff, evmcs_ver >> 8); + vcpu_enable_evmcs(vm, VCPU_ID); run = vcpu_state(vm, VCPU_ID); @@ -146,7 +135,7 @@ int main(int argc, char *argv[]) kvm_vm_restart(vm, O_RDWR); vm_vcpu_add(vm, VCPU_ID); vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); - vcpu_ioctl(vm, VCPU_ID, KVM_ENABLE_CAP, &enable_evmcs_cap); + vcpu_enable_evmcs(vm, VCPU_ID); vcpu_load_state(vm, VCPU_ID, state); run = vcpu_state(vm, VCPU_ID); free(state); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c index f72b3043db0e..ee59831fbc98 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c @@ -18,6 +18,7 @@ #include "test_util.h" #include "kvm_util.h" #include "processor.h" +#include "vmx.h" #define VCPU_ID 0 @@ -106,12 +107,7 @@ int main(int argc, char *argv[]) { struct kvm_vm *vm; int rv; - uint16_t evmcs_ver; struct kvm_cpuid2 *hv_cpuid_entries; - struct kvm_enable_cap enable_evmcs_cap = { - .cap = KVM_CAP_HYPERV_ENLIGHTENED_VMCS, - .args[0] = (unsigned long)&evmcs_ver - }; /* Tell stdout not to buffer its content */ setbuf(stdout, NULL); @@ -136,14 +132,14 @@ int main(int argc, char *argv[]) free(hv_cpuid_entries); - rv = _vcpu_ioctl(vm, VCPU_ID, KVM_ENABLE_CAP, &enable_evmcs_cap); - - if (rv) { + if (!kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) { fprintf(stderr, "Enlightened VMCS is unsupported, skip related test\n"); goto vm_free; } + vcpu_enable_evmcs(vm, VCPU_ID); + hv_cpuid_entries = kvm_get_supported_hv_cpuid(vm); if (!hv_cpuid_entries) return 1; diff --git a/tools/testing/selftests/kvm/x86_64/platform_info_test.c b/tools/testing/selftests/kvm/x86_64/platform_info_test.c index 40050e44ec0a..f9334bd3cce9 100644 --- a/tools/testing/selftests/kvm/x86_64/platform_info_test.c +++ b/tools/testing/selftests/kvm/x86_64/platform_info_test.c @@ -99,8 +99,8 @@ int main(int argc, char *argv[]) msr_platform_info = vcpu_get_msr(vm, VCPU_ID, MSR_PLATFORM_INFO); vcpu_set_msr(vm, VCPU_ID, MSR_PLATFORM_INFO, msr_platform_info | MSR_PLATFORM_INFO_MAX_TURBO_RATIO); - test_msr_platform_info_disabled(vm); test_msr_platform_info_enabled(vm); + test_msr_platform_info_disabled(vm); vcpu_set_msr(vm, VCPU_ID, MSR_PLATFORM_INFO, msr_platform_info); kvm_vm_free(vm); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c index ed7218d166da..853e370e8a39 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c @@ -25,24 +25,17 @@ #define VMCS12_REVISION 0x11e57ed0 #define VCPU_ID 5 +bool have_evmcs; + void test_nested_state(struct kvm_vm *vm, struct kvm_nested_state *state) { - volatile struct kvm_run *run; - vcpu_nested_state_set(vm, VCPU_ID, state, false); - run = vcpu_state(vm, VCPU_ID); - vcpu_run(vm, VCPU_ID); - TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN, - "Got exit_reason other than KVM_EXIT_SHUTDOWN: %u (%s),\n", - run->exit_reason, - exit_reason_str(run->exit_reason)); } void test_nested_state_expect_errno(struct kvm_vm *vm, struct kvm_nested_state *state, int expected_errno) { - volatile struct kvm_run *run; int rv; rv = vcpu_nested_state_set(vm, VCPU_ID, state, true); @@ -50,12 +43,6 @@ void test_nested_state_expect_errno(struct kvm_vm *vm, "Expected %s (%d) from vcpu_nested_state_set but got rv: %i errno: %s (%d)", strerror(expected_errno), expected_errno, rv, strerror(errno), errno); - run = vcpu_state(vm, VCPU_ID); - vcpu_run(vm, VCPU_ID); - TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN, - "Got exit_reason other than KVM_EXIT_SHUTDOWN: %u (%s),\n", - run->exit_reason, - exit_reason_str(run->exit_reason)); } void test_nested_state_expect_einval(struct kvm_vm *vm, @@ -90,8 +77,9 @@ void set_default_vmx_state(struct kvm_nested_state *state, int size) { memset(state, 0, size); state->flags = KVM_STATE_NESTED_GUEST_MODE | - KVM_STATE_NESTED_RUN_PENDING | - KVM_STATE_NESTED_EVMCS; + KVM_STATE_NESTED_RUN_PENDING; + if (have_evmcs) + state->flags |= KVM_STATE_NESTED_EVMCS; state->format = 0; state->size = size; state->hdr.vmx.vmxon_pa = 0x1000; @@ -141,13 +129,19 @@ void test_vmx_nested_state(struct kvm_vm *vm) /* * Setting vmxon_pa == -1ull and vmcs_pa == -1ull exits early without * setting the nested state but flags other than eVMCS must be clear. + * The eVMCS flag can be set if the enlightened VMCS capability has + * been enabled. */ set_default_vmx_state(state, state_sz); state->hdr.vmx.vmxon_pa = -1ull; state->hdr.vmx.vmcs12_pa = -1ull; test_nested_state_expect_einval(vm, state); - state->flags = KVM_STATE_NESTED_EVMCS; + state->flags &= KVM_STATE_NESTED_EVMCS; + if (have_evmcs) { + test_nested_state_expect_einval(vm, state); + vcpu_enable_evmcs(vm, VCPU_ID); + } test_nested_state(vm, state); /* It is invalid to have vmxon_pa == -1ull and SMM flags non-zero. */ @@ -232,6 +226,8 @@ int main(int argc, char *argv[]) struct kvm_nested_state state; struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1); + have_evmcs = kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS); + if (!kvm_check_cap(KVM_CAP_NESTED_STATE)) { printf("KVM_CAP_NESTED_STATE not available, skipping test\n"); exit(KSFT_SKIP); |