diff options
author | Ingo Molnar <mingo@kernel.org> | 2022-01-08 10:53:57 +0100 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2022-01-08 10:53:57 +0100 |
commit | 0422fe2666aea4c0986f4c89dc107731aa6a7a81 (patch) | |
tree | 1d42a146a738fb933f061424751755e630aeccb0 /arch/x86/kvm | |
parent | net/mlx4: Use irq_update_affinity_hint() (diff) | |
parent | Merge branch 'for-5.16-fixes' of git://git.kernel.org/pub/scm/linux/kernel/gi... (diff) | |
download | linux-0422fe2666aea4c0986f4c89dc107731aa6a7a81.tar.xz linux-0422fe2666aea4c0986f4c89dc107731aa6a7a81.zip |
Merge branch 'linus' into irq/core, to fix conflict
Conflicts:
drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/debugfs.c | 3 | ||||
-rw-r--r-- | arch/x86/kvm/hyperv.c | 7 | ||||
-rw-r--r-- | arch/x86/kvm/ioapic.h | 1 | ||||
-rw-r--r-- | arch/x86/kvm/irq.h | 1 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 134 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/paging_tmpl.h | 3 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_iter.c | 6 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_iter.h | 6 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.c | 67 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.h | 5 | ||||
-rw-r--r-- | arch/x86/kvm/svm/avic.c | 17 | ||||
-rw-r--r-- | arch/x86/kvm/svm/pmu.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/svm/sev.c | 263 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm.c | 22 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm.h | 1 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/nested.c | 53 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/posted_intr.c | 20 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 134 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 100 | ||||
-rw-r--r-- | arch/x86/kvm/x86.h | 7 |
21 files changed, 506 insertions, 348 deletions
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 54a83a744538..f33c804a922a 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -95,6 +95,9 @@ static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v) unsigned int *log[KVM_NR_PAGE_SIZES], *cur; int i, j, k, l, ret; + if (!kvm_memslots_have_rmaps(kvm)) + return 0; + ret = -ENOMEM; memset(log, 0, sizeof(log)); for (i = 0; i < KVM_NR_PAGE_SIZES; i++) { diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 5e19e6e4c2ce..8d8c1cc7cb53 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1922,11 +1922,13 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL; + if (all_cpus) + goto check_and_send_ipi; + if (!sparse_banks_len) goto ret_success; - if (!all_cpus && - kvm_read_guest(kvm, + if (kvm_read_guest(kvm, hc->ingpa + offsetof(struct hv_send_ipi_ex, vp_set.bank_contents), sparse_banks, @@ -1934,6 +1936,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool return HV_STATUS_INVALID_HYPERCALL_INPUT; } +check_and_send_ipi: if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) return HV_STATUS_INVALID_HYPERCALL_INPUT; diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index e66e620c3bed..539333ac4b38 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -81,7 +81,6 @@ struct kvm_ioapic { unsigned long irq_states[IOAPIC_NUM_PINS]; struct kvm_io_device dev; struct kvm *kvm; - void (*ack_notifier)(void *opaque, int irq); spinlock_t lock; struct rtc_status rtc_status; struct delayed_work eoi_inject; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 650642b18d15..c2d7cfe82d00 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -56,7 +56,6 @@ struct kvm_pic { struct kvm_io_device dev_master; struct kvm_io_device dev_slave; struct kvm_io_device dev_elcr; - void (*ack_notifier)(void *opaque, int irq); unsigned long irq_states[PIC_NUM_PINS]; }; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 759952dd1222..f206fc35deff 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -707,7 +707,7 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr) { int highest_irr; - if (apic->vcpu->arch.apicv_active) + if (kvm_x86_ops.sync_pir_to_irr) highest_irr = static_call(kvm_x86_sync_pir_to_irr)(apic->vcpu); else highest_irr = apic_find_highest_irr(apic); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 3be9beea838d..fcdf3f8bb59a 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1582,7 +1582,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp); if (is_tdp_mmu_enabled(kvm)) - flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); + flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); return flush; } @@ -1936,7 +1936,11 @@ static void mmu_audit_disable(void) { } static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) { - return sp->role.invalid || + if (sp->role.invalid) + return true; + + /* TDP MMU pages due not use the MMU generation. */ + return !sp->tdp_mmu_page && unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); } @@ -2173,10 +2177,10 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato iterator->shadow_addr = root; iterator->level = vcpu->arch.mmu->shadow_root_level; - if (iterator->level == PT64_ROOT_4LEVEL && + if (iterator->level >= PT64_ROOT_4LEVEL && vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL && !vcpu->arch.mmu->direct_map) - --iterator->level; + iterator->level = PT32E_ROOT_LEVEL; if (iterator->level == PT32E_ROOT_LEVEL) { /* @@ -3976,6 +3980,34 @@ out_retry: return true; } +/* + * Returns true if the page fault is stale and needs to be retried, i.e. if the + * root was invalidated by a memslot update or a relevant mmu_notifier fired. + */ +static bool is_page_fault_stale(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault, int mmu_seq) +{ + struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root_hpa); + + /* Special roots, e.g. pae_root, are not backed by shadow pages. */ + if (sp && is_obsolete_sp(vcpu->kvm, sp)) + return true; + + /* + * Roots without an associated shadow page are considered invalid if + * there is a pending request to free obsolete roots. The request is + * only a hint that the current root _may_ be obsolete and needs to be + * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a + * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs + * to reload even if no vCPU is actively using the root. + */ + if (!sp && kvm_test_request(KVM_REQ_MMU_RELOAD, vcpu)) + return true; + + return fault->slot && + mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva); +} + static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu); @@ -4013,8 +4045,9 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault else write_lock(&vcpu->kvm->mmu_lock); - if (fault->slot && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva)) + if (is_page_fault_stale(vcpu, fault, mmu_seq)) goto out_unlock; + r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; @@ -4855,7 +4888,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, struct kvm_mmu *context = &vcpu->arch.guest_mmu; struct kvm_mmu_role_regs regs = { .cr0 = cr0, - .cr4 = cr4, + .cr4 = cr4 & ~X86_CR4_PKE, .efer = efer, }; union kvm_mmu_role new_role; @@ -4919,7 +4952,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->direct_map = false; update_permission_bitmask(context, true); - update_pkru_bitmask(context); + context->pkru_mask = 0; reset_rsvds_bits_mask_ept(vcpu, context, execonly); reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); } @@ -5025,6 +5058,14 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) /* * Invalidate all MMU roles to force them to reinitialize as CPUID * information is factored into reserved bit calculations. + * + * Correctly handling multiple vCPU models with respect to paging and + * physical address properties) in a single VM would require tracking + * all relevant CPUID information in kvm_mmu_page_role. That is very + * undesirable as it would increase the memory requirements for + * gfn_track (see struct kvm_mmu_page_role comments). For now that + * problem is swept under the rug; KVM's CPUID API is horrific and + * it's all but impossible to solve it without introducing a new API. */ vcpu->arch.root_mmu.mmu_role.ext.valid = 0; vcpu->arch.guest_mmu.mmu_role.ext.valid = 0; @@ -5032,24 +5073,10 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_mmu_reset_context(vcpu); /* - * KVM does not correctly handle changing guest CPUID after KVM_RUN, as - * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't - * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page - * faults due to reusing SPs/SPTEs. Alert userspace, but otherwise - * sweep the problem under the rug. - * - * KVM's horrific CPUID ABI makes the problem all but impossible to - * solve, as correctly handling multiple vCPU models (with respect to - * paging and physical address properties) in a single VM would require - * tracking all relevant CPUID information in kvm_mmu_page_role. That - * is very undesirable as it would double the memory requirements for - * gfn_track (see struct kvm_mmu_page_role comments), and in practice - * no sane VMM mucks with the core vCPU model on the fly. + * Changing guest CPUID after KVM_RUN is forbidden, see the comment in + * kvm_arch_vcpu_ioctl(). */ - if (vcpu->arch.last_vmentry_cpu != -1) { - pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} after KVM_RUN may cause guest instability\n"); - pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} will fail after KVM_RUN starting with Linux 5.16\n"); - } + KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm); } void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) @@ -5369,7 +5396,7 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { - kvm_mmu_invalidate_gva(vcpu, vcpu->arch.mmu, gva, INVALID_PAGE); + kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE); ++vcpu->stat.invlpg; } EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); @@ -5854,8 +5881,6 @@ restart: void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) { - bool flush = false; - if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); /* @@ -5863,17 +5888,14 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, * logging at a 4k granularity and never creates collapsible * 2m SPTEs during dirty logging. */ - flush = slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true); - if (flush) + if (slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true)) kvm_arch_flush_remote_tlbs_memslot(kvm, slot); write_unlock(&kvm->mmu_lock); } if (is_tdp_mmu_enabled(kvm)) { read_lock(&kvm->mmu_lock); - flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush); - if (flush) - kvm_arch_flush_remote_tlbs_memslot(kvm, slot); + kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot); read_unlock(&kvm->mmu_lock); } } @@ -6182,23 +6204,46 @@ void kvm_mmu_module_exit(void) mmu_audit_disable(); } +/* + * Calculate the effective recovery period, accounting for '0' meaning "let KVM + * select a halving time of 1 hour". Returns true if recovery is enabled. + */ +static bool calc_nx_huge_pages_recovery_period(uint *period) +{ + /* + * Use READ_ONCE to get the params, this may be called outside of the + * param setters, e.g. by the kthread to compute its next timeout. + */ + bool enabled = READ_ONCE(nx_huge_pages); + uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio); + + if (!enabled || !ratio) + return false; + + *period = READ_ONCE(nx_huge_pages_recovery_period_ms); + if (!*period) { + /* Make sure the period is not less than one second. */ + ratio = min(ratio, 3600u); + *period = 60 * 60 * 1000 / ratio; + } + return true; +} + static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp) { bool was_recovery_enabled, is_recovery_enabled; uint old_period, new_period; int err; - was_recovery_enabled = nx_huge_pages_recovery_ratio; - old_period = nx_huge_pages_recovery_period_ms; + was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period); err = param_set_uint(val, kp); if (err) return err; - is_recovery_enabled = nx_huge_pages_recovery_ratio; - new_period = nx_huge_pages_recovery_period_ms; + is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period); - if (READ_ONCE(nx_huge_pages) && is_recovery_enabled && + if (is_recovery_enabled && (!was_recovery_enabled || old_period > new_period)) { struct kvm *kvm; @@ -6262,18 +6307,13 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) static long get_nx_lpage_recovery_timeout(u64 start_time) { - uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio); - uint period = READ_ONCE(nx_huge_pages_recovery_period_ms); + bool enabled; + uint period; - if (!period && ratio) { - /* Make sure the period is not less than one second. */ - ratio = min(ratio, 3600u); - period = 60 * 60 * 1000 / ratio; - } + enabled = calc_nx_huge_pages_recovery_period(&period); - return READ_ONCE(nx_huge_pages) && ratio - ? start_time + msecs_to_jiffies(period) - get_jiffies_64() - : MAX_SCHEDULE_TIMEOUT; + return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64() + : MAX_SCHEDULE_TIMEOUT; } static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index f87d36898c44..708a5d297fe1 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -911,7 +911,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault r = RET_PF_RETRY; write_lock(&vcpu->kvm->mmu_lock); - if (fault->slot && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva)) + + if (is_page_fault_stale(vcpu, fault, mmu_seq)) goto out_unlock; kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index b3ed302c1a35..caa96c270b95 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -26,6 +26,7 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level) */ void tdp_iter_restart(struct tdp_iter *iter) { + iter->yielded = false; iter->yielded_gfn = iter->next_last_level_gfn; iter->level = iter->root_level; @@ -160,6 +161,11 @@ static bool try_step_up(struct tdp_iter *iter) */ void tdp_iter_next(struct tdp_iter *iter) { + if (iter->yielded) { + tdp_iter_restart(iter); + return; + } + if (try_step_down(iter)) return; diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index b1748b988d3a..e19cabbcb65c 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -45,6 +45,12 @@ struct tdp_iter { * iterator walks off the end of the paging structure. */ bool valid; + /* + * True if KVM dropped mmu_lock and yielded in the middle of a walk, in + * which case tdp_iter_next() needs to restart the walk at the root + * level instead of advancing to the next entry. + */ + bool yielded; }; /* diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index a54c3491af42..1beb4ca90560 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -317,9 +317,6 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt)); int level = sp->role.level; gfn_t base_gfn = sp->gfn; - u64 old_child_spte; - u64 *sptep; - gfn_t gfn; int i; trace_kvm_mmu_prepare_zap_page(sp); @@ -327,8 +324,9 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, tdp_mmu_unlink_page(kvm, sp, shared); for (i = 0; i < PT64_ENT_PER_PAGE; i++) { - sptep = rcu_dereference(pt) + i; - gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); + u64 *sptep = rcu_dereference(pt) + i; + gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); + u64 old_child_spte; if (shared) { /* @@ -374,7 +372,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, shared); } - kvm_flush_remote_tlbs_with_address(kvm, gfn, + kvm_flush_remote_tlbs_with_address(kvm, base_gfn, KVM_PAGES_PER_HPAGE(level + 1)); call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); @@ -504,6 +502,8 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte) { + WARN_ON_ONCE(iter->yielded); + lockdep_assert_held_read(&kvm->mmu_lock); /* @@ -577,6 +577,8 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte, bool record_acc_track, bool record_dirty_log) { + WARN_ON_ONCE(iter->yielded); + lockdep_assert_held_write(&kvm->mmu_lock); /* @@ -642,18 +644,19 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, * If this function should yield and flush is set, it will perform a remote * TLB flush before yielding. * - * If this function yields, it will also reset the tdp_iter's walk over the - * paging structure and the calling function should skip to the next - * iteration to allow the iterator to continue its traversal from the - * paging structure root. + * If this function yields, iter->yielded is set and the caller must skip to + * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk + * over the paging structures to allow the iterator to continue its traversal + * from the paging structure root. * - * Return true if this function yielded and the iterator's traversal was reset. - * Return false if a yield was not needed. + * Returns true if this function yielded. */ -static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, - struct tdp_iter *iter, bool flush, - bool shared) +static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, + struct tdp_iter *iter, + bool flush, bool shared) { + WARN_ON(iter->yielded); + /* Ensure forward progress has been made before yielding. */ if (iter->next_last_level_gfn == iter->yielded_gfn) return false; @@ -673,12 +676,10 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, WARN_ON(iter->gfn > iter->next_last_level_gfn); - tdp_iter_restart(iter); - - return true; + iter->yielded = true; } - return false; + return iter->yielded; } /* @@ -1033,9 +1034,9 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, { struct kvm_mmu_page *root; - for_each_tdp_mmu_root(kvm, root, range->slot->as_id) - flush |= zap_gfn_range(kvm, root, range->start, range->end, - range->may_block, flush, false); + for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false) + flush = zap_gfn_range(kvm, root, range->start, range->end, + range->may_block, flush, false); return flush; } @@ -1364,10 +1365,9 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, * Clear leaf entries which could be replaced by large mappings, for * GFNs within the slot. */ -static bool zap_collapsible_spte_range(struct kvm *kvm, +static void zap_collapsible_spte_range(struct kvm *kvm, struct kvm_mmu_page *root, - const struct kvm_memory_slot *slot, - bool flush) + const struct kvm_memory_slot *slot) { gfn_t start = slot->base_gfn; gfn_t end = start + slot->npages; @@ -1378,10 +1378,8 @@ static bool zap_collapsible_spte_range(struct kvm *kvm, tdp_root_for_each_pte(iter, root, start, end) { retry: - if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) { - flush = false; + if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; - } if (!is_shadow_present_pte(iter.old_spte) || !is_last_spte(iter.old_spte, iter.level)) @@ -1393,6 +1391,7 @@ retry: pfn, PG_LEVEL_NUM)) continue; + /* Note, a successful atomic zap also does a remote TLB flush. */ if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) { /* * The iter must explicitly re-read the SPTE because @@ -1401,30 +1400,24 @@ retry: iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); goto retry; } - flush = true; } rcu_read_unlock(); - - return flush; } /* * Clear non-leaf entries (and free associated page tables) which could * be replaced by large mappings, for GFNs within the slot. */ -bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot, - bool flush) +void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, + const struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; lockdep_assert_held_read(&kvm->mmu_lock); for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) - flush = zap_collapsible_spte_range(kvm, root, slot, flush); - - return flush; + zap_collapsible_spte_range(kvm, root, slot); } /* diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 476b133544dd..3899004a5d91 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -64,9 +64,8 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, unsigned long mask, bool wrprot); -bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot, - bool flush); +void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, + const struct kvm_memory_slot *slot); bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index affc0ea98d30..8f9af7b7dbbe 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -900,6 +900,7 @@ out: bool svm_check_apicv_inhibit_reasons(ulong bit) { ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | + BIT(APICV_INHIBIT_REASON_ABSENT) | BIT(APICV_INHIBIT_REASON_HYPERV) | BIT(APICV_INHIBIT_REASON_NESTED) | BIT(APICV_INHIBIT_REASON_IRQWIN) | @@ -989,16 +990,18 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) { struct vcpu_svm *svm = to_svm(vcpu); + int cpu = get_cpu(); + WARN_ON(cpu != vcpu->cpu); svm->avic_is_running = is_run; - if (!kvm_vcpu_apicv_active(vcpu)) - return; - - if (is_run) - avic_vcpu_load(vcpu, vcpu->cpu); - else - avic_vcpu_put(vcpu); + if (kvm_vcpu_apicv_active(vcpu)) { + if (is_run) + avic_vcpu_load(vcpu, cpu); + else + avic_vcpu_put(vcpu); + } + put_cpu(); } void svm_vcpu_blocking(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 871c426ec389..b4095dfeeee6 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -281,7 +281,7 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS; pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1; - pmu->reserved_bits = 0xffffffff00200000ull; + pmu->reserved_bits = 0xfffffff000280000ull; pmu->version = 1; /* not applicable to AMD; but clean them to prevent any fall out */ pmu->counter_bitmask[KVM_PMC_FIXED] = 0; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 21ac0a5de4e0..be2883141220 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1543,28 +1543,50 @@ static bool is_cmd_allowed_from_mirror(u32 cmd_id) return false; } -static int sev_lock_for_migration(struct kvm *kvm) +static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info; + struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info; + int r = -EBUSY; + + if (dst_kvm == src_kvm) + return -EINVAL; /* - * Bail if this VM is already involved in a migration to avoid deadlock - * between two VMs trying to migrate to/from each other. + * Bail if these VMs are already involved in a migration to avoid + * deadlock between two VMs trying to migrate to/from each other. */ - if (atomic_cmpxchg_acquire(&sev->migration_in_progress, 0, 1)) + if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1)) return -EBUSY; - mutex_lock(&kvm->lock); + if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1)) + goto release_dst; + r = -EINTR; + if (mutex_lock_killable(&dst_kvm->lock)) + goto release_src; + if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING)) + goto unlock_dst; return 0; + +unlock_dst: + mutex_unlock(&dst_kvm->lock); +release_src: + atomic_set_release(&src_sev->migration_in_progress, 0); +release_dst: + atomic_set_release(&dst_sev->migration_in_progress, 0); + return r; } -static void sev_unlock_after_migration(struct kvm *kvm) +static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info; + struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info; - mutex_unlock(&kvm->lock); - atomic_set_release(&sev->migration_in_progress, 0); + mutex_unlock(&dst_kvm->lock); + mutex_unlock(&src_kvm->lock); + atomic_set_release(&dst_sev->migration_in_progress, 0); + atomic_set_release(&src_sev->migration_in_progress, 0); } @@ -1607,14 +1629,15 @@ static void sev_migrate_from(struct kvm_sev_info *dst, dst->asid = src->asid; dst->handle = src->handle; dst->pages_locked = src->pages_locked; + dst->enc_context_owner = src->enc_context_owner; src->asid = 0; src->active = false; src->handle = 0; src->pages_locked = 0; + src->enc_context_owner = NULL; - INIT_LIST_HEAD(&dst->regions_list); - list_replace_init(&src->regions_list, &dst->regions_list); + list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list); } static int sev_es_migrate_from(struct kvm *dst, struct kvm *src) @@ -1666,15 +1689,6 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd) bool charged = false; int ret; - ret = sev_lock_for_migration(kvm); - if (ret) - return ret; - - if (sev_guest(kvm)) { - ret = -EINVAL; - goto out_unlock; - } - source_kvm_file = fget(source_fd); if (!file_is_kvm(source_kvm_file)) { ret = -EBADF; @@ -1682,16 +1696,26 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd) } source_kvm = source_kvm_file->private_data; - ret = sev_lock_for_migration(source_kvm); + ret = sev_lock_two_vms(kvm, source_kvm); if (ret) goto out_fput; - if (!sev_guest(source_kvm)) { + if (sev_guest(kvm) || !sev_guest(source_kvm)) { ret = -EINVAL; - goto out_source; + goto out_unlock; } src_sev = &to_kvm_svm(source_kvm)->sev_info; + + /* + * VMs mirroring src's encryption context rely on it to keep the + * ASID allocated, but below we are clearing src_sev->asid. + */ + if (src_sev->num_mirrored_vms) { + ret = -EBUSY; + goto out_unlock; + } + dst_sev->misc_cg = get_current_misc_cg(); cg_cleanup_sev = dst_sev; if (dst_sev->misc_cg != src_sev->misc_cg) { @@ -1728,13 +1752,11 @@ out_dst_cgroup: sev_misc_cg_uncharge(cg_cleanup_sev); put_misc_cg(cg_cleanup_sev->misc_cg); cg_cleanup_sev->misc_cg = NULL; -out_source: - sev_unlock_after_migration(source_kvm); +out_unlock: + sev_unlock_two_vms(kvm, source_kvm); out_fput: if (source_kvm_file) fput(source_kvm_file); -out_unlock: - sev_unlock_after_migration(kvm); return ret; } @@ -1953,76 +1975,60 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) { struct file *source_kvm_file; struct kvm *source_kvm; - struct kvm_sev_info source_sev, *mirror_sev; + struct kvm_sev_info *source_sev, *mirror_sev; int ret; source_kvm_file = fget(source_fd); if (!file_is_kvm(source_kvm_file)) { ret = -EBADF; - goto e_source_put; + goto e_source_fput; } source_kvm = source_kvm_file->private_data; - mutex_lock(&source_kvm->lock); - - if (!sev_guest(source_kvm)) { - ret = -EINVAL; - goto e_source_unlock; - } + ret = sev_lock_two_vms(kvm, source_kvm); + if (ret) + goto e_source_fput; - /* Mirrors of mirrors should work, but let's not get silly */ - if (is_mirroring_enc_context(source_kvm) || source_kvm == kvm) { + /* + * Mirrors of mirrors should work, but let's not get silly. Also + * disallow out-of-band SEV/SEV-ES init if the target is already an + * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being + * created after SEV/SEV-ES initialization, e.g. to init intercepts. + */ + if (sev_guest(kvm) || !sev_guest(source_kvm) || + is_mirroring_enc_context(source_kvm) || kvm->created_vcpus) { ret = -EINVAL; - goto e_source_unlock; + goto e_unlock; } - memcpy(&source_sev, &to_kvm_svm(source_kvm)->sev_info, - sizeof(source_sev)); - /* * The mirror kvm holds an enc_context_owner ref so its asid can't * disappear until we're done with it */ + source_sev = &to_kvm_svm(source_kvm)->sev_info; kvm_get_kvm(source_kvm); - - fput(source_kvm_file); - mutex_unlock(&source_kvm->lock); - mutex_lock(&kvm->lock); - - /* - * Disallow out-of-band SEV/SEV-ES init if the target is already an - * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being - * created after SEV/SEV-ES initialization, e.g. to init intercepts. - */ - if (sev_guest(kvm) || kvm->created_vcpus) { - ret = -EINVAL; - goto e_mirror_unlock; - } + source_sev->num_mirrored_vms++; /* Set enc_context_owner and copy its encryption context over */ mirror_sev = &to_kvm_svm(kvm)->sev_info; mirror_sev->enc_context_owner = source_kvm; mirror_sev->active = true; - mirror_sev->asid = source_sev.asid; - mirror_sev->fd = source_sev.fd; - mirror_sev->es_active = source_sev.es_active; - mirror_sev->handle = source_sev.handle; + mirror_sev->asid = source_sev->asid; + mirror_sev->fd = source_sev->fd; + mirror_sev->es_active = source_sev->es_active; + mirror_sev->handle = source_sev->handle; + INIT_LIST_HEAD(&mirror_sev->regions_list); + ret = 0; + /* * Do not copy ap_jump_table. Since the mirror does not share the same * KVM contexts as the original, and they may have different * memory-views. */ - mutex_unlock(&kvm->lock); - return 0; - -e_mirror_unlock: - mutex_unlock(&kvm->lock); - kvm_put_kvm(source_kvm); - return ret; -e_source_unlock: - mutex_unlock(&source_kvm->lock); -e_source_put: +e_unlock: + sev_unlock_two_vms(kvm, source_kvm); +e_source_fput: if (source_kvm_file) fput(source_kvm_file); return ret; @@ -2034,17 +2040,24 @@ void sev_vm_destroy(struct kvm *kvm) struct list_head *head = &sev->regions_list; struct list_head *pos, *q; + WARN_ON(sev->num_mirrored_vms); + if (!sev_guest(kvm)) return; /* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */ if (is_mirroring_enc_context(kvm)) { - kvm_put_kvm(sev->enc_context_owner); + struct kvm *owner_kvm = sev->enc_context_owner; + struct kvm_sev_info *owner_sev = &to_kvm_svm(owner_kvm)->sev_info; + + mutex_lock(&owner_kvm->lock); + if (!WARN_ON(!owner_sev->num_mirrored_vms)) + owner_sev->num_mirrored_vms--; + mutex_unlock(&owner_kvm->lock); + kvm_put_kvm(owner_kvm); return; } - mutex_lock(&kvm->lock); - /* * Ensure that all guest tagged cache entries are flushed before * releasing the pages back to the system for use. CLFLUSH will @@ -2064,8 +2077,6 @@ void sev_vm_destroy(struct kvm *kvm) } } - mutex_unlock(&kvm->lock); - sev_unbind_asid(kvm, sev->handle); sev_asid_free(sev); } @@ -2249,7 +2260,7 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu) __free_page(virt_to_page(svm->sev_es.vmsa)); if (svm->sev_es.ghcb_sa_free) - kfree(svm->sev_es.ghcb_sa); + kvfree(svm->sev_es.ghcb_sa); } static void dump_ghcb(struct vcpu_svm *svm) @@ -2341,24 +2352,29 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); } -static int sev_es_validate_vmgexit(struct vcpu_svm *svm) +static bool sev_es_validate_vmgexit(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu; struct ghcb *ghcb; - u64 exit_code = 0; + u64 exit_code; + u64 reason; ghcb = svm->sev_es.ghcb; - /* Only GHCB Usage code 0 is supported */ - if (ghcb->ghcb_usage) - goto vmgexit_err; - /* - * Retrieve the exit code now even though is may not be marked valid + * Retrieve the exit code now even though it may not be marked valid * as it could help with debugging. */ exit_code = ghcb_get_sw_exit_code(ghcb); + /* Only GHCB Usage code 0 is supported */ + if (ghcb->ghcb_usage) { + reason = GHCB_ERR_INVALID_USAGE; + goto vmgexit_err; + } + + reason = GHCB_ERR_MISSING_INPUT; + if (!ghcb_sw_exit_code_is_valid(ghcb) || !ghcb_sw_exit_info_1_is_valid(ghcb) || !ghcb_sw_exit_info_2_is_valid(ghcb)) @@ -2437,30 +2453,34 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) case SVM_VMGEXIT_UNSUPPORTED_EVENT: break; default: + reason = GHCB_ERR_INVALID_EVENT; goto vmgexit_err; } - return 0; + return true; vmgexit_err: vcpu = &svm->vcpu; - if (ghcb->ghcb_usage) { + if (reason == GHCB_ERR_INVALID_USAGE) { vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n", ghcb->ghcb_usage); + } else if (reason == GHCB_ERR_INVALID_EVENT) { + vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n", + exit_code); } else { - vcpu_unimpl(vcpu, "vmgexit: exit reason %#llx is not valid\n", + vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n", exit_code); dump_ghcb(svm); } - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; - vcpu->run->internal.ndata = 2; - vcpu->run->internal.data[0] = exit_code; - vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; + /* Clear the valid entries fields */ + memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); + + ghcb_set_sw_exit_info_1(ghcb, 2); + ghcb_set_sw_exit_info_2(ghcb, reason); - return -EINVAL; + return false; } void sev_es_unmap_ghcb(struct vcpu_svm *svm) @@ -2482,7 +2502,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm) svm->sev_es.ghcb_sa_sync = false; } - kfree(svm->sev_es.ghcb_sa); + kvfree(svm->sev_es.ghcb_sa); svm->sev_es.ghcb_sa = NULL; svm->sev_es.ghcb_sa_free = false; } @@ -2530,14 +2550,14 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) scratch_gpa_beg = ghcb_get_sw_scratch(ghcb); if (!scratch_gpa_beg) { pr_err("vmgexit: scratch gpa not provided\n"); - return false; + goto e_scratch; } scratch_gpa_end = scratch_gpa_beg + len; if (scratch_gpa_end < scratch_gpa_beg) { pr_err("vmgexit: scratch length (%#llx) not valid for scratch address (%#llx)\n", len, scratch_gpa_beg); - return false; + goto e_scratch; } if ((scratch_gpa_beg & PAGE_MASK) == control->ghcb_gpa) { @@ -2555,7 +2575,7 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) scratch_gpa_end > ghcb_scratch_end) { pr_err("vmgexit: scratch area is outside of GHCB shared buffer area (%#llx - %#llx)\n", scratch_gpa_beg, scratch_gpa_end); - return false; + goto e_scratch; } scratch_va = (void *)svm->sev_es.ghcb; @@ -2568,18 +2588,18 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) if (len > GHCB_SCRATCH_AREA_LIMIT) { pr_err("vmgexit: scratch area exceeds KVM limits (%#llx requested, %#llx limit)\n", len, GHCB_SCRATCH_AREA_LIMIT); - return false; + goto e_scratch; } - scratch_va = kzalloc(len, GFP_KERNEL_ACCOUNT); + scratch_va = kvzalloc(len, GFP_KERNEL_ACCOUNT); if (!scratch_va) - return false; + goto e_scratch; if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) { /* Unable to copy scratch area from guest */ pr_err("vmgexit: kvm_read_guest for scratch area failed\n"); - kfree(scratch_va); - return false; + kvfree(scratch_va); + goto e_scratch; } /* @@ -2596,6 +2616,12 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) svm->sev_es.ghcb_sa_len = len; return true; + +e_scratch: + ghcb_set_sw_exit_info_1(ghcb, 2); + ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_SCRATCH_AREA); + + return false; } static void set_ghcb_msr_bits(struct vcpu_svm *svm, u64 value, u64 mask, @@ -2646,7 +2672,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_CPUID); if (!ret) { - ret = -EINVAL; + /* Error, keep GHCB MSR value as-is */ break; } @@ -2682,10 +2708,13 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) GHCB_MSR_TERM_REASON_POS); pr_info("SEV-ES guest requested termination: %#llx:%#llx\n", reason_set, reason_code); - fallthrough; + + ret = -EINVAL; + break; } default: - ret = -EINVAL; + /* Error, keep GHCB MSR value as-is */ + break; } trace_kvm_vmgexit_msr_protocol_exit(svm->vcpu.vcpu_id, @@ -2709,14 +2738,18 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) if (!ghcb_gpa) { vcpu_unimpl(vcpu, "vmgexit: GHCB gpa is not set\n"); - return -EINVAL; + + /* Without a GHCB, just return right back to the guest */ + return 1; } if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->sev_es.ghcb_map)) { /* Unable to map GHCB from guest */ vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n", ghcb_gpa); - return -EINVAL; + + /* Without a GHCB, just return right back to the guest */ + return 1; } svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva; @@ -2726,15 +2759,14 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) exit_code = ghcb_get_sw_exit_code(ghcb); - ret = sev_es_validate_vmgexit(svm); - if (ret) - return ret; + if (!sev_es_validate_vmgexit(svm)) + return 1; sev_es_sync_from_ghcb(svm); ghcb_set_sw_exit_info_1(ghcb, 0); ghcb_set_sw_exit_info_2(ghcb, 0); - ret = -EINVAL; + ret = 1; switch (exit_code) { case SVM_VMGEXIT_MMIO_READ: if (!setup_vmgexit_scratch(svm, true, control->exit_info_2)) @@ -2775,20 +2807,17 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) default: pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n", control->exit_info_1); - ghcb_set_sw_exit_info_1(ghcb, 1); - ghcb_set_sw_exit_info_2(ghcb, - X86_TRAP_UD | - SVM_EVTINJ_TYPE_EXEPT | - SVM_EVTINJ_VALID); + ghcb_set_sw_exit_info_1(ghcb, 2); + ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_INPUT); } - ret = 1; break; } case SVM_VMGEXIT_UNSUPPORTED_EVENT: vcpu_unimpl(vcpu, "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", control->exit_info_1, control->exit_info_2); + ret = -EINVAL; break; default: ret = svm_invoke_exit_handler(vcpu, exit_code); @@ -2810,7 +2839,7 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in) return -EINVAL; if (!setup_vmgexit_scratch(svm, in, bytes)) - return -EINVAL; + return 1; return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->sev_es.ghcb_sa, count, in); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 5630c241d5f6..5151efa424ac 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1585,6 +1585,15 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) to_svm(vcpu)->vmcb->save.rflags = rflags; } +static bool svm_get_if_flag(struct kvm_vcpu *vcpu) +{ + struct vmcb *vmcb = to_svm(vcpu)->vmcb; + + return sev_es_guest(vcpu->kvm) + ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK + : kvm_get_rflags(vcpu) & X86_EFLAGS_IF; +} + static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { switch (reg) { @@ -3568,14 +3577,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) if (!gif_set(svm)) return true; - if (sev_es_guest(vcpu->kvm)) { - /* - * SEV-ES guests to not expose RFLAGS. Use the VMCB interrupt mask - * bit to determine the state of the IF flag. - */ - if (!(vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK)) - return true; - } else if (is_guest_mode(vcpu)) { + if (is_guest_mode(vcpu)) { /* As long as interrupts are being delivered... */ if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF) @@ -3586,7 +3588,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) if (nested_exit_on_intr(svm)) return false; } else { - if (!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF)) + if (!svm_get_if_flag(vcpu)) return true; } @@ -4621,6 +4623,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, + .get_if_flag = svm_get_if_flag, .tlb_flush_all = svm_flush_tlb, .tlb_flush_current = svm_flush_tlb, @@ -4651,7 +4654,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .load_eoi_exitmap = svm_load_eoi_exitmap, .hwapic_irr_update = svm_hwapic_irr_update, .hwapic_isr_update = svm_hwapic_isr_update, - .sync_pir_to_irr = kvm_lapic_find_highest_irr, .apicv_post_state_restore = avic_post_state_restore, .set_tss_addr = svm_set_tss_addr, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 5faad3dc10e2..1c7306c370fa 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -79,6 +79,7 @@ struct kvm_sev_info { struct list_head regions_list; /* List of registered regions */ u64 ap_jump_table; /* SEV-ES AP Jump Table address */ struct kvm *enc_context_owner; /* Owner of copied encryption context */ + unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */ struct misc_cg *misc_cg; /* For misc cgroup accounting */ atomic_t migration_in_progress; }; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 1e2f66951566..9c941535f78c 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1162,29 +1162,26 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, WARN_ON(!enable_vpid); /* - * If VPID is enabled and used by vmc12, but L2 does not have a unique - * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate - * a VPID for L2, flush the current context as the effective ASID is - * common to both L1 and L2. - * - * Defer the flush so that it runs after vmcs02.EPTP has been set by - * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid - * redundant flushes further down the nested pipeline. - * - * If a TLB flush isn't required due to any of the above, and vpid12 is - * changing then the new "virtual" VPID (vpid12) will reuse the same - * "real" VPID (vpid02), and so needs to be flushed. There's no direct - * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for - * all nested vCPUs. Remember, a flush on VM-Enter does not invalidate - * guest-physical mappings, so there is no need to sync the nEPT MMU. + * VPID is enabled and in use by vmcs12. If vpid12 is changing, then + * emulate a guest TLB flush as KVM does not track vpid12 history nor + * is the VPID incorporated into the MMU context. I.e. KVM must assume + * that the new vpid12 has never been used and thus represents a new + * guest ASID that cannot have entries in the TLB. */ - if (!nested_has_guest_tlb_tag(vcpu)) { - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); - } else if (is_vmenter && - vmcs12->virtual_processor_id != vmx->nested.last_vpid) { + if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) { vmx->nested.last_vpid = vmcs12->virtual_processor_id; - vpid_sync_context(nested_get_vpid02(vcpu)); + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + return; } + + /* + * If VPID is enabled, used by vmc12, and vpid12 is not changing but + * does not have a unique TLB tag (ASID), i.e. EPT is disabled and + * KVM was unable to allocate a VPID for L2, flush the current context + * as the effective ASID is common to both L1 and L2. + */ + if (!nested_has_guest_tlb_tag(vcpu)) + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) @@ -2594,8 +2591,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, - vmcs12->guest_ia32_perf_global_ctrl))) + vmcs12->guest_ia32_perf_global_ctrl))) { + *entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL; + } kvm_rsp_write(vcpu, vmcs12->guest_rsp); kvm_rip_write(vcpu, vmcs12->guest_rip); @@ -3344,8 +3343,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, }; u32 failed_index; - if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) - kvm_vcpu_flush_tlb_current(vcpu); + kvm_service_local_tlb_flush_requests(vcpu); evaluate_pending_interrupts = exec_controls_get(vmx) & (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); @@ -4502,9 +4500,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, (void)nested_get_evmcs_page(vcpu); } - /* Service the TLB flush request for L2 before switching to L1. */ - if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) - kvm_vcpu_flush_tlb_current(vcpu); + /* Service pending TLB flush requests for L2 before switching to L1. */ + kvm_service_local_tlb_flush_requests(vcpu); /* * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between @@ -4857,6 +4854,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (!vmx->nested.cached_vmcs12) goto out_cached_vmcs12; + vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA; vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); if (!vmx->nested.cached_shadow_vmcs12) goto out_cached_shadow_vmcs12; @@ -5289,8 +5287,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; struct vmcs_hdr hdr; - if (ghc->gpa != vmptr && - kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { /* * Reads from an unbacked page return all 1s, * which means that the 32 bits located at the diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 5f81ef092bd4..1c94783b5a54 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -5,6 +5,7 @@ #include <asm/cpu.h> #include "lapic.h" +#include "irq.h" #include "posted_intr.h" #include "trace.h" #include "vmx.h" @@ -77,13 +78,18 @@ after_clear_sn: pi_set_on(pi_desc); } +static bool vmx_can_use_vtd_pi(struct kvm *kvm) +{ + return irqchip_in_kernel(kvm) && enable_apicv && + kvm_arch_has_assigned_device(kvm) && + irq_remapping_cap(IRQ_POSTING_CAP); +} + void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) + if (!vmx_can_use_vtd_pi(vcpu->kvm)) return; /* Set SN when the vCPU is preempted */ @@ -141,9 +147,7 @@ int pi_pre_block(struct kvm_vcpu *vcpu) struct pi_desc old, new; struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) + if (!vmx_can_use_vtd_pi(vcpu->kvm)) return 0; WARN_ON(irqs_disabled()); @@ -270,9 +274,7 @@ int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, struct vcpu_data vcpu_info; int idx, ret = 0; - if (!kvm_arch_has_assigned_device(kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(kvm->vcpus[0])) + if (!vmx_can_use_vtd_pi(kvm)) return 0; idx = srcu_read_lock(&kvm->irq_srcu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ba66c171d951..0dbf94eb954f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1363,6 +1363,11 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) vmx->emulation_required = vmx_emulation_required(vcpu); } +static bool vmx_get_if_flag(struct kvm_vcpu *vcpu) +{ + return vmx_get_rflags(vcpu) & X86_EFLAGS_IF; +} + u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) { u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); @@ -2646,15 +2651,6 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) if (!loaded_vmcs->msr_bitmap) goto out_vmcs; memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); - - if (IS_ENABLED(CONFIG_HYPERV) && - static_branch_unlikely(&enable_evmcs) && - (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { - struct hv_enlightened_vmcs *evmcs = - (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs; - - evmcs->hv_enlightenments_control.msr_bitmap = 1; - } } memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); @@ -2918,6 +2914,13 @@ static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) } } +static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) +{ + if (is_guest_mode(vcpu)) + return nested_get_vpid02(vcpu); + return to_vmx(vcpu)->vpid; +} + static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; @@ -2930,31 +2933,29 @@ static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) if (enable_ept) ept_sync_context(construct_eptp(vcpu, root_hpa, mmu->shadow_root_level)); - else if (!is_guest_mode(vcpu)) - vpid_sync_context(to_vmx(vcpu)->vpid); else - vpid_sync_context(nested_get_vpid02(vcpu)); + vpid_sync_context(vmx_get_current_vpid(vcpu)); } static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) { /* - * vpid_sync_vcpu_addr() is a nop if vmx->vpid==0, see the comment in + * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in * vmx_flush_tlb_guest() for an explanation of why this is ok. */ - vpid_sync_vcpu_addr(to_vmx(vcpu)->vpid, addr); + vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr); } static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) { /* - * vpid_sync_context() is a nop if vmx->vpid==0, e.g. if enable_vpid==0 - * or a vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit - * are required to flush GVA->{G,H}PA mappings from the TLB if vpid is + * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a + * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are + * required to flush GVA->{G,H}PA mappings from the TLB if vpid is * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed), * i.e. no explicit INVVPID is necessary. */ - vpid_sync_context(to_vmx(vcpu)->vpid); + vpid_sync_context(vmx_get_current_vpid(vcpu)); } void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu) @@ -3963,8 +3964,7 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) if (pi_test_and_set_on(&vmx->pi_desc)) return 0; - if (vcpu != kvm_get_running_vcpu() && - !kvm_vcpu_trigger_posted_interrupt(vcpu, false)) + if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) kvm_vcpu_kick(vcpu); return 0; @@ -5881,18 +5881,14 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vmx_flush_pml_buffer(vcpu); /* - * We should never reach this point with a pending nested VM-Enter, and - * more specifically emulation of L2 due to invalid guest state (see - * below) should never happen as that means we incorrectly allowed a - * nested VM-Enter with an invalid vmcs12. + * KVM should never reach this point with a pending nested VM-Enter. + * More specifically, short-circuiting VM-Entry to emulate L2 due to + * invalid guest state should never happen as that means KVM knowingly + * allowed a nested VM-Enter with an invalid vmcs12. More below. */ if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm)) return -EIO; - /* If guest state is invalid, start emulating */ - if (vmx->emulation_required) - return handle_invalid_guest_state(vcpu); - if (is_guest_mode(vcpu)) { /* * PML is never enabled when running L2, bail immediately if a @@ -5914,10 +5910,30 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) */ nested_mark_vmcs12_pages_dirty(vcpu); + /* + * Synthesize a triple fault if L2 state is invalid. In normal + * operation, nested VM-Enter rejects any attempt to enter L2 + * with invalid state. However, those checks are skipped if + * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If + * L2 state is invalid, it means either L1 modified SMRAM state + * or userspace provided bad state. Synthesize TRIPLE_FAULT as + * doing so is architecturally allowed in the RSM case, and is + * the least awful solution for the userspace case without + * risking false positives. + */ + if (vmx->emulation_required) { + nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); + return 1; + } + if (nested_vmx_reflect_vmexit(vcpu)) return 1; } + /* If guest state is invalid, start emulating. L2 is handled above. */ + if (vmx->emulation_required) + return handle_invalid_guest_state(vcpu); + if (exit_reason.failed_vmentry) { dump_vmcs(vcpu); vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; @@ -6262,9 +6278,9 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); int max_irr; - bool max_irr_updated; + bool got_posted_interrupt; - if (KVM_BUG_ON(!vcpu->arch.apicv_active, vcpu->kvm)) + if (KVM_BUG_ON(!enable_apicv, vcpu->kvm)) return -EIO; if (pi_test_on(&vmx->pi_desc)) { @@ -6274,22 +6290,33 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) * But on x86 this is just a compiler barrier anyway. */ smp_mb__after_atomic(); - max_irr_updated = + got_posted_interrupt = kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); - - /* - * If we are running L2 and L1 has a new pending interrupt - * which can be injected, this may cause a vmexit or it may - * be injected into L2. Either way, this interrupt will be - * processed via KVM_REQ_EVENT, not RVI, because we do not use - * virtual interrupt delivery to inject L1 interrupts into L2. - */ - if (is_guest_mode(vcpu) && max_irr_updated) - kvm_make_request(KVM_REQ_EVENT, vcpu); } else { max_irr = kvm_lapic_find_highest_irr(vcpu); + got_posted_interrupt = false; } - vmx_hwapic_irr_update(vcpu, max_irr); + + /* + * Newly recognized interrupts are injected via either virtual interrupt + * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is + * disabled in two cases: + * + * 1) If L2 is running and the vCPU has a new pending interrupt. If L1 + * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a + * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected + * into L2, but KVM doesn't use virtual interrupt delivery to inject + * interrupts into L2, and so KVM_REQ_EVENT is again needed. + * + * 2) If APICv is disabled for this vCPU, assigned devices may still + * attempt to post interrupts. The posted interrupt vector will cause + * a VM-Exit and the subsequent entry will call sync_pir_to_irr. + */ + if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu)) + vmx_set_rvi(max_irr); + else if (got_posted_interrupt) + kvm_make_request(KVM_REQ_EVENT, vcpu); + return max_irr; } @@ -6601,9 +6628,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) * consistency check VM-Exit due to invalid guest state and bail. */ if (unlikely(vmx->emulation_required)) { - - /* We don't emulate invalid state of a nested guest */ - vmx->fail = is_guest_mode(vcpu); + vmx->fail = 0; vmx->exit_reason.full = EXIT_REASON_INVALID_STATE; vmx->exit_reason.failed_vmentry = 1; @@ -6826,6 +6851,19 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) if (err < 0) goto free_pml; + /* + * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a + * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the + * feature only for vmcs01, KVM currently isn't equipped to realize any + * performance benefits from enabling it for vmcs02. + */ + if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) && + (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { + struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; + + evmcs->hv_enlightenments_control.msr_bitmap = 1; + } + /* The MSR bitmap starts with all ones */ bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS); bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS); @@ -7509,6 +7547,7 @@ static void hardware_unsetup(void) static bool vmx_check_apicv_inhibit_reasons(ulong bit) { ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | + BIT(APICV_INHIBIT_REASON_ABSENT) | BIT(APICV_INHIBIT_REASON_HYPERV) | BIT(APICV_INHIBIT_REASON_BLOCKIRQ); @@ -7558,6 +7597,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, + .get_if_flag = vmx_get_if_flag, .tlb_flush_all = vmx_flush_tlb_all, .tlb_flush_current = vmx_flush_tlb_current, @@ -7761,10 +7801,10 @@ static __init int hardware_setup(void) ple_window_shrink = 0; } - if (!cpu_has_vmx_apicv()) { + if (!cpu_has_vmx_apicv()) enable_apicv = 0; + if (!enable_apicv) vmx_x86_ops.sync_pir_to_irr = NULL; - } if (cpu_has_vmx_tsc_scaling()) { kvm_has_tsc_control = true; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5a403d92833f..e50e97ac4408 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -890,7 +890,8 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu))) return 1; - if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) + if (!(cr0 & X86_CR0_PG) && + (is_64_bit_mode(vcpu) || kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))) return 1; static_call(kvm_x86_set_cr0)(vcpu, cr0); @@ -1330,7 +1331,7 @@ static const u32 msrs_to_save_all[] = { MSR_IA32_UMWAIT_CONTROL, MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, - MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3, + MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, @@ -3258,6 +3259,29 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) static_call(kvm_x86_tlb_flush_guest)(vcpu); } + +static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.tlb_flush; + static_call(kvm_x86_tlb_flush_current)(vcpu); +} + +/* + * Service "local" TLB flush requests, which are specific to the current MMU + * context. In addition to the generic event handling in vcpu_enter_guest(), + * TLB flushes that are targeted at an MMU context also need to be serviced + * prior before nested VM-Enter/VM-Exit. + */ +void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu) +{ + if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) + kvm_vcpu_flush_tlb_current(vcpu); + + if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu)) + kvm_vcpu_flush_tlb_guest(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_service_local_tlb_flush_requests); + static void record_steal_time(struct kvm_vcpu *vcpu) { struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache; @@ -3389,7 +3413,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated) return 1; - if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) && kvm_get_msr_feature(&msr_ent)) + if (kvm_get_msr_feature(&msr_ent)) return 1; if (data & ~msr_ent.data) return 1; @@ -4133,6 +4157,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SGX_ATTRIBUTE: #endif case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM: + case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM: case KVM_CAP_SREGS2: case KVM_CAP_EXIT_ON_EMULATION_FAILURE: case KVM_CAP_VCPU_ATTRIBUTES: @@ -4448,8 +4473,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - if (vcpu->arch.apicv_active) - static_call(kvm_x86_sync_pir_to_irr)(vcpu); + static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); return kvm_apic_get_state(vcpu, s); } @@ -5124,6 +5148,17 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_cpuid __user *cpuid_arg = argp; struct kvm_cpuid cpuid; + /* + * KVM does not correctly handle changing guest CPUID after KVM_RUN, as + * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't + * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page + * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with + * the core vCPU model on the fly, so fail. + */ + r = -EINVAL; + if (vcpu->arch.last_vmentry_cpu != -1) + goto out; + r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) goto out; @@ -5134,6 +5169,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_cpuid2 __user *cpuid_arg = argp; struct kvm_cpuid2 cpuid; + /* + * KVM_SET_CPUID{,2} after KVM_RUN is forbidded, see the comment in + * KVM_SET_CPUID case above. + */ + r = -EINVAL; + if (vcpu->arch.last_vmentry_cpu != -1) + goto out; + r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) goto out; @@ -5698,6 +5741,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, smp_wmb(); kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT; kvm->arch.nr_reserved_ioapic_pins = cap->args[0]; + kvm_request_apicv_update(kvm, true, APICV_INHIBIT_REASON_ABSENT); r = 0; split_irqchip_unlock: mutex_unlock(&kvm->lock); @@ -6078,6 +6122,7 @@ set_identity_unlock: /* Write kvm->irq_routing before enabling irqchip_in_kernel. */ smp_wmb(); kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL; + kvm_request_apicv_update(kvm, true, APICV_INHIBIT_REASON_ABSENT); create_irqchip_unlock: mutex_unlock(&kvm->lock); break; @@ -7077,7 +7122,13 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port, void *val, unsigned int count) { if (vcpu->arch.pio.count) { - /* Complete previous iteration. */ + /* + * Complete a previous iteration that required userspace I/O. + * Note, @count isn't guaranteed to match pio.count as userspace + * can modify ECX before rerunning the vCPU. Ignore any such + * shenanigans as KVM doesn't support modifying the rep count, + * and the emulator ensures @count doesn't overflow the buffer. + */ } else { int r = __emulator_pio_in(vcpu, size, port, count); if (!r) @@ -7086,7 +7137,6 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, /* Results already available, fall through. */ } - WARN_ON(count != vcpu->arch.pio.count); complete_emulator_pio_in(vcpu, val); return 1; } @@ -8776,10 +8826,9 @@ static void kvm_apicv_init(struct kvm *kvm) { init_rwsem(&kvm->arch.apicv_update_lock); - if (enable_apicv) - clear_bit(APICV_INHIBIT_REASON_DISABLE, - &kvm->arch.apicv_inhibit_reasons); - else + set_bit(APICV_INHIBIT_REASON_ABSENT, + &kvm->arch.apicv_inhibit_reasons); + if (!enable_apicv) set_bit(APICV_INHIBIT_REASON_DISABLE, &kvm->arch.apicv_inhibit_reasons); } @@ -8952,14 +9001,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) { struct kvm_run *kvm_run = vcpu->run; - /* - * if_flag is obsolete and useless, so do not bother - * setting it for SEV-ES guests. Userspace can just - * use kvm_run->ready_for_interrupt_injection. - */ - kvm_run->if_flag = !vcpu->arch.guest_state_protected - && (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; - + kvm_run->if_flag = static_call(kvm_x86_get_if_flag)(vcpu); kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); @@ -9528,8 +9570,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) if (irqchip_split(vcpu->kvm)) kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); else { - if (vcpu->arch.apicv_active) - static_call(kvm_x86_sync_pir_to_irr)(vcpu); + static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); if (ioapic_in_kernel(vcpu->kvm)) kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); } @@ -9648,10 +9689,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) /* Flushing all ASIDs flushes the current ASID... */ kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } - if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) - kvm_vcpu_flush_tlb_current(vcpu); - if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu)) - kvm_vcpu_flush_tlb_guest(vcpu); + kvm_service_local_tlb_flush_requests(vcpu); if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; @@ -9802,10 +9840,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) /* * This handles the case where a posted interrupt was - * notified with kvm_vcpu_kick. + * notified with kvm_vcpu_kick. Assigned devices can + * use the POSTED_INTR_VECTOR even if APICv is disabled, + * so do it even if APICv is disabled on this vCPU. */ - if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active) - static_call(kvm_x86_sync_pir_to_irr)(vcpu); + if (kvm_lapic_enabled(vcpu)) + static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); if (kvm_vcpu_exit_request(vcpu)) { vcpu->mode = OUTSIDE_GUEST_MODE; @@ -9849,8 +9889,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) break; - if (vcpu->arch.apicv_active) - static_call(kvm_x86_sync_pir_to_irr)(vcpu); + if (kvm_lapic_enabled(vcpu)) + static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); if (unlikely(kvm_vcpu_exit_request(vcpu))) { exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 997669ae9caa..4abcd8d9836d 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -103,6 +103,7 @@ static inline unsigned int __shrink_ple_window(unsigned int val, #define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL +void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu); int kvm_check_nested_events(struct kvm_vcpu *vcpu); static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) @@ -185,12 +186,6 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu; } -static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu) -{ - ++vcpu->stat.tlb_flush; - static_call(kvm_x86_tlb_flush_current)(vcpu); -} - static inline int is_pae(struct kvm_vcpu *vcpu) { return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); |