diff options
Diffstat (limited to 'arch/x86/kvm/mmu')
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 456 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmu_internal.h | 10 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/paging_tmpl.h | 31 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/spte.c | 102 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/spte.h | 95 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.c | 281 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.h | 6 |
7 files changed, 426 insertions, 555 deletions
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8e853a5fc867..2401606db260 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -179,7 +179,6 @@ struct kvm_shadow_walk_iterator { static struct kmem_cache *pte_list_desc_cache; struct kmem_cache *mmu_page_header_cache; -static struct percpu_counter kvm_total_used_mmu_pages; static void mmu_spte_set(u64 *sptep, u64 spte); @@ -485,11 +484,12 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte) __set_spte(sptep, new_spte); } -/* - * Update the SPTE (excluding the PFN), but do not track changes in its - * accessed/dirty status. +/* Rules for using mmu_spte_update: + * Update the state bits, it means the mapped pfn is not changed. + * + * Returns true if the TLB needs to be flushed */ -static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) +static bool mmu_spte_update(u64 *sptep, u64 new_spte) { u64 old_spte = *sptep; @@ -498,7 +498,7 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) if (!is_shadow_present_pte(old_spte)) { mmu_spte_set(sptep, new_spte); - return old_spte; + return false; } if (!spte_has_volatile_bits(old_spte)) @@ -506,53 +506,10 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) else old_spte = __update_clear_spte_slow(sptep, new_spte); - WARN_ON_ONCE(spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); - - return old_spte; -} - -/* Rules for using mmu_spte_update: - * Update the state bits, it means the mapped pfn is not changed. - * - * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote - * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only - * spte, even though the writable spte might be cached on a CPU's TLB. - * - * Returns true if the TLB needs to be flushed - */ -static bool mmu_spte_update(u64 *sptep, u64 new_spte) -{ - bool flush = false; - u64 old_spte = mmu_spte_update_no_track(sptep, new_spte); - - if (!is_shadow_present_pte(old_spte)) - return false; - - /* - * For the spte updated out of mmu-lock is safe, since - * we always atomically update it, see the comments in - * spte_has_volatile_bits(). - */ - if (is_mmu_writable_spte(old_spte) && - !is_writable_pte(new_spte)) - flush = true; + WARN_ON_ONCE(!is_shadow_present_pte(old_spte) || + spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); - /* - * Flush TLB when accessed/dirty states are changed in the page tables, - * to guarantee consistency between TLB and page tables. - */ - - if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) { - flush = true; - kvm_set_pfn_accessed(spte_to_pfn(old_spte)); - } - - if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) { - flush = true; - kvm_set_pfn_dirty(spte_to_pfn(old_spte)); - } - - return flush; + return leaf_spte_change_needs_tlb_flush(old_spte, new_spte); } /* @@ -563,10 +520,8 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) */ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) { - kvm_pfn_t pfn; u64 old_spte = *sptep; int level = sptep_to_sp(sptep)->role.level; - struct page *page; if (!is_shadow_present_pte(old_spte) || !spte_has_volatile_bits(old_spte)) @@ -578,24 +533,6 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) return old_spte; kvm_update_page_stats(kvm, level, -1); - - pfn = spte_to_pfn(old_spte); - - /* - * KVM doesn't hold a reference to any pages mapped into the guest, and - * instead uses the mmu_notifier to ensure that KVM unmaps any pages - * before they are reclaimed. Sanity check that, if the pfn is backed - * by a refcounted page, the refcount is elevated. - */ - page = kvm_pfn_to_refcounted_page(pfn); - WARN_ON_ONCE(page && !page_count(page)); - - if (is_accessed_spte(old_spte)) - kvm_set_pfn_accessed(pfn); - - if (is_dirty_spte(old_spte)) - kvm_set_pfn_dirty(pfn); - return old_spte; } @@ -1250,16 +1187,6 @@ static bool spte_clear_dirty(u64 *sptep) return mmu_spte_update(sptep, spte); } -static bool spte_wrprot_for_clear_dirty(u64 *sptep) -{ - bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT, - (unsigned long *)sptep); - if (was_writable && !spte_ad_enabled(*sptep)) - kvm_set_pfn_dirty(spte_to_pfn(*sptep)); - - return was_writable; -} - /* * Gets the GFN ready for another round of dirty logging by clearing the * - D bit on ad-enabled SPTEs, and @@ -1275,7 +1202,8 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, for_each_rmap_spte(rmap_head, &iter, sptep) if (spte_ad_need_write_protect(*sptep)) - flush |= spte_wrprot_for_clear_dirty(sptep); + flush |= test_and_clear_bit(PT_WRITABLE_SHIFT, + (unsigned long *)sptep); else flush |= spte_clear_dirty(sptep); @@ -1640,15 +1568,12 @@ static bool kvm_rmap_age_gfn_range(struct kvm *kvm, (unsigned long *)sptep); } else { /* - * Capture the dirty status of the page, so that - * it doesn't get lost when the SPTE is marked - * for access tracking. + * WARN if mmu_spte_update() signals the need + * for a TLB flush, as Access tracking a SPTE + * should never trigger an _immediate_ flush. */ - if (is_writable_pte(spte)) - kvm_set_pfn_dirty(spte_to_pfn(spte)); - spte = mark_spte_for_access_track(spte); - mmu_spte_update_no_track(sptep, spte); + WARN_ON_ONCE(mmu_spte_update(sptep, spte)); } young = true; } @@ -1696,27 +1621,15 @@ static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp) #endif } -/* - * This value is the sum of all of the kvm instances's - * kvm->arch.n_used_mmu_pages values. We need a global, - * aggregate version in order to make the slab shrinker - * faster - */ -static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr) -{ - kvm->arch.n_used_mmu_pages += nr; - percpu_counter_add(&kvm_total_used_mmu_pages, nr); -} - static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - kvm_mod_used_mmu_pages(kvm, +1); + kvm->arch.n_used_mmu_pages++; kvm_account_pgtable_pages((void *)sp->spt, +1); } static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - kvm_mod_used_mmu_pages(kvm, -1); + kvm->arch.n_used_mmu_pages--; kvm_account_pgtable_pages((void *)sp->spt, -1); } @@ -2802,7 +2715,7 @@ static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) * be write-protected. */ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, - gfn_t gfn, bool can_unsync, bool prefetch) + gfn_t gfn, bool synchronizing, bool prefetch) { struct kvm_mmu_page *sp; bool locked = false; @@ -2817,12 +2730,12 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, /* * The page is not write-tracked, mark existing shadow pages unsync - * unless KVM is synchronizing an unsync SP (can_unsync = false). In - * that case, KVM must complete emulation of the guest TLB flush before - * allowing shadow pages to become unsync (writable by the guest). + * unless KVM is synchronizing an unsync SP. In that case, KVM must + * complete emulation of the guest TLB flush before allowing shadow + * pages to become unsync (writable by the guest). */ for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) { - if (!can_unsync) + if (synchronizing) return -EPERM; if (sp->unsync) @@ -2926,6 +2839,9 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, } if (is_shadow_present_pte(*sptep)) { + if (prefetch) + return RET_PF_SPURIOUS; + /* * If we overwrite a PTE page pointer with a 2MB PMD, unlink * the parent of the now unreachable PTE. @@ -2945,7 +2861,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, } wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch, - true, host_writable, &spte); + false, host_writable, &spte); if (*sptep == spte) { ret = RET_PF_SPURIOUS; @@ -2971,32 +2887,51 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, return ret; } -static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, - u64 *start, u64 *end) +static bool kvm_mmu_prefetch_sptes(struct kvm_vcpu *vcpu, gfn_t gfn, u64 *sptep, + int nr_pages, unsigned int access) { struct page *pages[PTE_PREFETCH_NUM]; struct kvm_memory_slot *slot; - unsigned int access = sp->role.access; - int i, ret; - gfn_t gfn; + int i; + + if (WARN_ON_ONCE(nr_pages > PTE_PREFETCH_NUM)) + return false; - gfn = kvm_mmu_page_get_gfn(sp, spte_index(start)); slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK); if (!slot) - return -1; + return false; - ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start); - if (ret <= 0) - return -1; + nr_pages = kvm_prefetch_pages(slot, gfn, pages, nr_pages); + if (nr_pages <= 0) + return false; - for (i = 0; i < ret; i++, gfn++, start++) { - mmu_set_spte(vcpu, slot, start, access, gfn, + for (i = 0; i < nr_pages; i++, gfn++, sptep++) { + mmu_set_spte(vcpu, slot, sptep, access, gfn, page_to_pfn(pages[i]), NULL); - put_page(pages[i]); + + /* + * KVM always prefetches writable pages from the primary MMU, + * and KVM can make its SPTE writable in the fast page handler, + * without notifying the primary MMU. Mark pages/folios dirty + * now to ensure file data is written back if it ends up being + * written by the guest. Because KVM's prefetching GUPs + * writable PTEs, the probability of unnecessary writeback is + * extremely low. + */ + kvm_release_page_dirty(pages[i]); } - return 0; + return true; +} + +static bool direct_pte_prefetch_many(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, + u64 *start, u64 *end) +{ + gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(start)); + unsigned int access = sp->role.access; + + return kvm_mmu_prefetch_sptes(vcpu, gfn, start, end - start, access); } static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, @@ -3014,8 +2949,9 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, if (is_shadow_present_pte(*spte) || spte == sptep) { if (!start) continue; - if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) + if (!direct_pte_prefetch_many(vcpu, sp, start, spte)) return; + start = NULL; } else if (!start) start = spte; @@ -3165,13 +3101,12 @@ static int __kvm_mmu_max_mapping_level(struct kvm *kvm, } int kvm_mmu_max_mapping_level(struct kvm *kvm, - const struct kvm_memory_slot *slot, gfn_t gfn, - int max_level) + const struct kvm_memory_slot *slot, gfn_t gfn) { bool is_private = kvm_slot_can_be_private(slot) && kvm_mem_is_private(kvm, gfn); - return __kvm_mmu_max_mapping_level(kvm, slot, gfn, max_level, is_private); + return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private); } void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) @@ -3322,7 +3257,6 @@ static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu, fault->slot = NULL; fault->pfn = KVM_PFN_NOSLOT; fault->map_writable = false; - fault->hva = KVM_HVA_ERR_BAD; /* * If MMIO caching is disabled, emulate immediately without @@ -3392,7 +3326,7 @@ static bool page_fault_can_be_fast(struct kvm *kvm, struct kvm_page_fault *fault * by setting the Writable bit, which can be done out of mmu_lock. */ if (!fault->present) - return !kvm_ad_enabled(); + return !kvm_ad_enabled; /* * Note, instruction fetches and writes are mutually exclusive, ignore @@ -3419,7 +3353,7 @@ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, * harm. This also avoids the TLB flush needed after setting dirty bit * so non-PML cases won't be impacted. * - * Compare with set_spte where instead shadow_dirty_mask is set. + * Compare with make_spte() where instead shadow_dirty_mask is set. */ if (!try_cmpxchg64(sptep, &old_spte, new_spte)) return false; @@ -3430,18 +3364,6 @@ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, return true; } -static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte) -{ - if (fault->exec) - return is_executable_pte(spte); - - if (fault->write) - return is_writable_pte(spte); - - /* Fault was on Read access */ - return spte & PT_PRESENT_MASK; -} - /* * Returns the last level spte pointer of the shadow page walk for the given * gpa, and sets *spte to the spte value. This spte may be non-preset. If no @@ -3527,8 +3449,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) * uses A/D bits for non-nested MMUs. Thus, if A/D bits are * enabled, the SPTE can't be an access-tracked SPTE. */ - if (unlikely(!kvm_ad_enabled()) && is_access_track_spte(spte)) - new_spte = restore_acc_track_spte(new_spte); + if (unlikely(!kvm_ad_enabled) && is_access_track_spte(spte)) + new_spte = restore_acc_track_spte(new_spte) | + shadow_accessed_mask; /* * To keep things simple, only SPTEs that are MMU-writable can @@ -4376,8 +4299,15 @@ static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, return max_level; } -static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, - struct kvm_page_fault *fault) +static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault, int r) +{ + kvm_release_faultin_page(vcpu->kvm, fault->refcounted_page, + r == RET_PF_RETRY, fault->map_writable); +} + +static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) { int max_order, r; @@ -4387,7 +4317,7 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, } r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn, - &max_order); + &fault->refcounted_page, &max_order); if (r) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return r; @@ -4400,19 +4330,26 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, return RET_PF_CONTINUE; } -static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) +static int __kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) { - bool async; + unsigned int foll = fault->write ? FOLL_WRITE : 0; if (fault->is_private) - return kvm_faultin_pfn_private(vcpu, fault); + return kvm_mmu_faultin_pfn_private(vcpu, fault); + + foll |= FOLL_NOWAIT; + fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll, + &fault->map_writable, &fault->refcounted_page); - async = false; - fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, false, - &async, fault->write, - &fault->map_writable, &fault->hva); - if (!async) - return RET_PF_CONTINUE; /* *pfn has correct page already */ + /* + * If resolving the page failed because I/O is needed to fault-in the + * page, then either set up an asynchronous #PF to do the I/O, or if + * doing an async #PF isn't possible, retry with I/O allowed. All + * other failures are terminal, i.e. retrying won't help. + */ + if (fault->pfn != KVM_PFN_ERR_NEEDS_IO) + return RET_PF_CONTINUE; if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) { trace_kvm_try_async_get_page(fault->addr, fault->gfn); @@ -4430,14 +4367,16 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * to wait for IO. Note, gup always bails if it is unable to quickly * get a page and a fatal signal, i.e. SIGKILL, is pending. */ - fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, true, - NULL, fault->write, - &fault->map_writable, &fault->hva); + foll |= FOLL_INTERRUPTIBLE; + foll &= ~FOLL_NOWAIT; + fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll, + &fault->map_writable, &fault->refcounted_page); + return RET_PF_CONTINUE; } -static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, - unsigned int access) +static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault, unsigned int access) { struct kvm_memory_slot *slot = fault->slot; int ret; @@ -4520,7 +4459,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) return RET_PF_RETRY; - ret = __kvm_faultin_pfn(vcpu, fault); + ret = __kvm_mmu_faultin_pfn(vcpu, fault); if (ret != RET_PF_CONTINUE) return ret; @@ -4538,7 +4477,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, * mmu_lock is acquired. */ if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) { - kvm_release_pfn_clean(fault->pfn); + kvm_mmu_finish_page_fault(vcpu, fault, RET_PF_RETRY); return RET_PF_RETRY; } @@ -4597,7 +4536,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (r) return r; - r = kvm_faultin_pfn(vcpu, fault, ACC_ALL); + r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL); if (r != RET_PF_CONTINUE) return r; @@ -4614,8 +4553,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault r = direct_map(vcpu, fault); out_unlock: + kvm_mmu_finish_page_fault(vcpu, fault, r); write_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(fault->pfn); return r; } @@ -4688,7 +4627,7 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, if (r) return r; - r = kvm_faultin_pfn(vcpu, fault, ACC_ALL); + r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL); if (r != RET_PF_CONTINUE) return r; @@ -4701,8 +4640,8 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, r = kvm_tdp_mmu_map(vcpu, fault); out_unlock: + kvm_mmu_finish_page_fault(vcpu, fault, r); read_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(fault->pfn); return r; } #endif @@ -5488,7 +5427,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, role.efer_nx = true; role.smm = cpu_role.base.smm; role.guest_mode = cpu_role.base.guest_mode; - role.ad_disabled = !kvm_ad_enabled(); + role.ad_disabled = !kvm_ad_enabled; role.level = kvm_mmu_get_tdp_level(vcpu); role.direct = true; role.has_4_byte_gpte = false; @@ -6228,7 +6167,7 @@ void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, /* It's actually a GPA for vcpu->arch.guest_mmu. */ if (mmu != &vcpu->arch.guest_mmu) { /* INVLPG on a non-canonical address is a NOP according to the SDM. */ - if (is_noncanonical_address(addr, vcpu)) + if (is_noncanonical_invlpg_address(addr, vcpu)) return; kvm_x86_call(flush_tlb_gva)(vcpu, addr); @@ -6416,8 +6355,11 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; int nr_zapped, batch = 0; + LIST_HEAD(invalid_list); bool unstable; + lockdep_assert_held(&kvm->slots_lock); + restart: list_for_each_entry_safe_reverse(sp, node, &kvm->arch.active_mmu_pages, link) { @@ -6449,7 +6391,7 @@ restart: } unstable = __kvm_mmu_prepare_zap_page(kvm, sp, - &kvm->arch.zapped_obsolete_pages, &nr_zapped); + &invalid_list, &nr_zapped); batch += nr_zapped; if (unstable) @@ -6465,7 +6407,7 @@ restart: * kvm_mmu_load()), and the reload in the caller ensure no vCPUs are * running with an obsolete MMU. */ - kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages); + kvm_mmu_commit_zap_page(kvm, &invalid_list); } /* @@ -6528,16 +6470,10 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) kvm_tdp_mmu_zap_invalidated_roots(kvm); } -static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) -{ - return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); -} - void kvm_mmu_init_vm(struct kvm *kvm) { kvm->arch.shadow_mmio_value = shadow_mmio_value; INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); - INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages); spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); @@ -6771,7 +6707,7 @@ static void shadow_mmu_split_huge_page(struct kvm *kvm, continue; } - spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index); + spte = make_small_spte(kvm, huge_spte, sp->role, index); mmu_spte_set(sptep, spte); __rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access); } @@ -6954,8 +6890,7 @@ restart: * mapping if the indirect sp has level = 1. */ if (sp->role.direct && - sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, - PG_LEVEL_NUM)) { + sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn)) { kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); if (kvm_available_flush_remote_tlbs_range()) @@ -6983,8 +6918,8 @@ static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, kvm_flush_remote_tlbs_memslot(kvm, slot); } -void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot) +void kvm_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot) { if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); @@ -6994,7 +6929,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); - kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot); + kvm_tdp_mmu_recover_huge_pages(kvm, slot); read_unlock(&kvm->mmu_lock); } } @@ -7149,72 +7084,6 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) } } -static unsigned long mmu_shrink_scan(struct shrinker *shrink, - struct shrink_control *sc) -{ - struct kvm *kvm; - int nr_to_scan = sc->nr_to_scan; - unsigned long freed = 0; - - mutex_lock(&kvm_lock); - - list_for_each_entry(kvm, &vm_list, vm_list) { - int idx; - - /* - * Never scan more than sc->nr_to_scan VM instances. - * Will not hit this condition practically since we do not try - * to shrink more than one VM and it is very unlikely to see - * !n_used_mmu_pages so many times. - */ - if (!nr_to_scan--) - break; - /* - * n_used_mmu_pages is accessed without holding kvm->mmu_lock - * here. We may skip a VM instance errorneosly, but we do not - * want to shrink a VM that only started to populate its MMU - * anyway. - */ - if (!kvm->arch.n_used_mmu_pages && - !kvm_has_zapped_obsolete_pages(kvm)) - continue; - - idx = srcu_read_lock(&kvm->srcu); - write_lock(&kvm->mmu_lock); - - if (kvm_has_zapped_obsolete_pages(kvm)) { - kvm_mmu_commit_zap_page(kvm, - &kvm->arch.zapped_obsolete_pages); - goto unlock; - } - - freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan); - -unlock: - write_unlock(&kvm->mmu_lock); - srcu_read_unlock(&kvm->srcu, idx); - - /* - * unfair on small ones - * per-vm shrinkers cry out - * sadness comes quickly - */ - list_move_tail(&kvm->vm_list, &vm_list); - break; - } - - mutex_unlock(&kvm_lock); - return freed; -} - -static unsigned long mmu_shrink_count(struct shrinker *shrink, - struct shrink_control *sc) -{ - return percpu_counter_read_positive(&kvm_total_used_mmu_pages); -} - -static struct shrinker *mmu_shrinker; - static void mmu_destroy_caches(void) { kmem_cache_destroy(pte_list_desc_cache); @@ -7281,7 +7150,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) kvm_mmu_zap_all_fast(kvm); mutex_unlock(&kvm->slots_lock); - wake_up_process(kvm->arch.nx_huge_page_recovery_thread); + vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread); } mutex_unlock(&kvm_lock); } @@ -7341,23 +7210,8 @@ int kvm_mmu_vendor_module_init(void) if (!mmu_page_header_cache) goto out; - if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) - goto out; - - mmu_shrinker = shrinker_alloc(0, "x86-mmu"); - if (!mmu_shrinker) - goto out_shrinker; - - mmu_shrinker->count_objects = mmu_shrink_count; - mmu_shrinker->scan_objects = mmu_shrink_scan; - mmu_shrinker->seeks = DEFAULT_SEEKS * 10; - - shrinker_register(mmu_shrinker); - return 0; -out_shrinker: - percpu_counter_destroy(&kvm_total_used_mmu_pages); out: mmu_destroy_caches(); return ret; @@ -7374,8 +7228,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu) void kvm_mmu_vendor_module_exit(void) { mmu_destroy_caches(); - percpu_counter_destroy(&kvm_total_used_mmu_pages); - shrinker_free(mmu_shrinker); } /* @@ -7427,7 +7279,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) - wake_up_process(kvm->arch.nx_huge_page_recovery_thread); + vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread); mutex_unlock(&kvm_lock); } @@ -7530,62 +7382,56 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm) srcu_read_unlock(&kvm->srcu, rcu_idx); } -static long get_nx_huge_page_recovery_timeout(u64 start_time) +static void kvm_nx_huge_page_recovery_worker_kill(void *data) { - bool enabled; - uint period; - - enabled = calc_nx_huge_pages_recovery_period(&period); - - return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64() - : MAX_SCHEDULE_TIMEOUT; } -static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data) +static bool kvm_nx_huge_page_recovery_worker(void *data) { - u64 start_time; + struct kvm *kvm = data; + bool enabled; + uint period; long remaining_time; - while (true) { - start_time = get_jiffies_64(); - remaining_time = get_nx_huge_page_recovery_timeout(start_time); - - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop() && remaining_time > 0) { - schedule_timeout(remaining_time); - remaining_time = get_nx_huge_page_recovery_timeout(start_time); - set_current_state(TASK_INTERRUPTIBLE); - } - - set_current_state(TASK_RUNNING); - - if (kthread_should_stop()) - return 0; + enabled = calc_nx_huge_pages_recovery_period(&period); + if (!enabled) + return false; - kvm_recover_nx_huge_pages(kvm); + remaining_time = kvm->arch.nx_huge_page_last + msecs_to_jiffies(period) + - get_jiffies_64(); + if (remaining_time > 0) { + schedule_timeout(remaining_time); + /* check for signals and come back */ + return true; } + + __set_current_state(TASK_RUNNING); + kvm_recover_nx_huge_pages(kvm); + kvm->arch.nx_huge_page_last = get_jiffies_64(); + return true; } int kvm_mmu_post_init_vm(struct kvm *kvm) { - int err; - if (nx_hugepage_mitigation_hard_disabled) return 0; - err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0, - "kvm-nx-lpage-recovery", - &kvm->arch.nx_huge_page_recovery_thread); - if (!err) - kthread_unpark(kvm->arch.nx_huge_page_recovery_thread); + kvm->arch.nx_huge_page_last = get_jiffies_64(); + kvm->arch.nx_huge_page_recovery_thread = vhost_task_create( + kvm_nx_huge_page_recovery_worker, kvm_nx_huge_page_recovery_worker_kill, + kvm, "kvm-nx-lpage-recovery"); - return err; + if (!kvm->arch.nx_huge_page_recovery_thread) + return -ENOMEM; + + vhost_task_start(kvm->arch.nx_huge_page_recovery_thread); + return 0; } void kvm_mmu_pre_destroy_vm(struct kvm *kvm) { if (kvm->arch.nx_huge_page_recovery_thread) - kthread_stop(kvm->arch.nx_huge_page_recovery_thread); + vhost_task_stop(kvm->arch.nx_huge_page_recovery_thread); } #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index c98827840e07..b00abbe3f6cf 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -164,7 +164,7 @@ static inline gfn_t gfn_round_for_level(gfn_t gfn, int level) } int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, - gfn_t gfn, bool can_unsync, bool prefetch); + gfn_t gfn, bool synchronizing, bool prefetch); void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); @@ -235,10 +235,10 @@ struct kvm_page_fault { /* The memslot containing gfn. May be NULL. */ struct kvm_memory_slot *slot; - /* Outputs of kvm_faultin_pfn. */ + /* Outputs of kvm_mmu_faultin_pfn(). */ unsigned long mmu_seq; kvm_pfn_t pfn; - hva_t hva; + struct page *refcounted_page; bool map_writable; /* @@ -313,7 +313,6 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, .is_private = err & PFERR_PRIVATE_ACCESS, .pfn = KVM_PFN_ERR_FAULT, - .hva = KVM_HVA_ERR_BAD, }; int r; @@ -347,8 +346,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } int kvm_mmu_max_mapping_level(struct kvm *kvm, - const struct kvm_memory_slot *slot, gfn_t gfn, - int max_level); + const struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index ae7d39ff2d07..f4711674c47b 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -533,10 +533,8 @@ static bool FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, pt_element_t gpte) { - struct kvm_memory_slot *slot; unsigned pte_access; gfn_t gfn; - kvm_pfn_t pfn; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) return false; @@ -545,17 +543,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, pte_access = sp->role.access & FNAME(gpte_access)(gpte); FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); - slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, pte_access & ACC_WRITE_MASK); - if (!slot) - return false; - - pfn = gfn_to_pfn_memslot_atomic(slot, gfn); - if (is_error_pfn(pfn)) - return false; - - mmu_set_spte(vcpu, slot, spte, pte_access, gfn, pfn, NULL); - kvm_release_pfn_clean(pfn); - return true; + return kvm_mmu_prefetch_sptes(vcpu, gfn, spte, 1, pte_access); } static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, @@ -813,7 +801,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (r) return r; - r = kvm_faultin_pfn(vcpu, fault, walker.pte_access); + r = kvm_mmu_faultin_pfn(vcpu, fault, walker.pte_access); if (r != RET_PF_CONTINUE) return r; @@ -848,8 +836,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault r = FNAME(fetch)(vcpu, fault, &walker); out_unlock: + kvm_mmu_finish_page_fault(vcpu, fault, r); write_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(fault->pfn); return r; } @@ -892,9 +880,9 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, /* * Using the information in sp->shadowed_translation (kvm_mmu_page_get_gfn()) is - * safe because: - * - The spte has a reference to the struct page, so the pfn for a given gfn - * can't change unless all sptes pointing to it are nuked first. + * safe because SPTEs are protected by mmu_notifiers and memslot generations, so + * the pfn for a given gfn can't change unless all SPTEs pointing to the gfn are + * nuked first. * * Returns * < 0: failed to sync spte @@ -963,9 +951,14 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int host_writable = spte & shadow_host_writable_mask; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); make_spte(vcpu, sp, slot, pte_access, gfn, - spte_to_pfn(spte), spte, true, false, + spte_to_pfn(spte), spte, true, true, host_writable, &spte); + /* + * There is no need to mark the pfn dirty, as the new protections must + * be a subset of the old protections, i.e. synchronizing a SPTE cannot + * change the SPTE from read-only to writable. + */ return mmu_spte_update(sptep, spte); } diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 8f7eb3ad88fc..22551e2f1d00 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -24,6 +24,8 @@ static bool __ro_after_init allow_mmio_caching; module_param_named(mmio_caching, enable_mmio_caching, bool, 0444); EXPORT_SYMBOL_GPL(enable_mmio_caching); +bool __read_mostly kvm_ad_enabled; + u64 __read_mostly shadow_host_writable_mask; u64 __read_mostly shadow_mmu_writable_mask; u64 __read_mostly shadow_nx_mask; @@ -133,12 +135,6 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) */ bool spte_has_volatile_bits(u64 spte) { - /* - * Always atomically update spte if it can be updated - * out of mmu-lock, it can ensure dirty bit is not lost, - * also, it can help us to get a stable is_writable_pte() - * to ensure tlb flush is not missed. - */ if (!is_writable_pte(spte) && is_mmu_writable_spte(spte)) return true; @@ -157,7 +153,7 @@ bool spte_has_volatile_bits(u64 spte) bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, const struct kvm_memory_slot *slot, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, - u64 old_spte, bool prefetch, bool can_unsync, + u64 old_spte, bool prefetch, bool synchronizing, bool host_writable, u64 *new_spte) { int level = sp->role.level; @@ -178,8 +174,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, spte |= SPTE_TDP_AD_WRPROT_ONLY; spte |= shadow_present_mask; - if (!prefetch) - spte |= spte_shadow_accessed_mask(spte); + if (!prefetch || synchronizing) + spte |= shadow_accessed_mask; /* * For simplicity, enforce the NX huge page mitigation even if not @@ -223,41 +219,39 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, spte |= (u64)pfn << PAGE_SHIFT; if (pte_access & ACC_WRITE_MASK) { - spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask; - - /* - * Optimization: for pte sync, if spte was writable the hash - * lookup is unnecessary (and expensive). Write protection - * is responsibility of kvm_mmu_get_page / kvm_mmu_sync_roots. - * Same reasoning can be applied to dirty page accounting. - */ - if (is_writable_pte(old_spte)) - goto out; - /* * Unsync shadow pages that are reachable by the new, writable * SPTE. Write-protect the SPTE if the page can't be unsync'd, * e.g. it's write-tracked (upper-level SPs) or has one or more * shadow pages and unsync'ing pages is not allowed. + * + * When overwriting an existing leaf SPTE, and the old SPTE was + * writable, skip trying to unsync shadow pages as any relevant + * shadow pages must already be unsync, i.e. the hash lookup is + * unnecessary (and expensive). Note, this relies on KVM not + * changing PFNs without first zapping the old SPTE, which is + * guaranteed by both the shadow MMU and the TDP MMU. */ - if (mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, can_unsync, prefetch)) { + if ((!is_last_spte(old_spte, level) || !is_writable_pte(old_spte)) && + mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, synchronizing, prefetch)) wrprot = true; - pte_access &= ~ACC_WRITE_MASK; - spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); - } + else + spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask | + shadow_dirty_mask; } - if (pte_access & ACC_WRITE_MASK) - spte |= spte_shadow_dirty_mask(spte); - -out: - if (prefetch) + if (prefetch && !synchronizing) spte = mark_spte_for_access_track(spte); WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->shadow_zero_check, spte, level), "spte = 0x%llx, level = %d, rsvd bits = 0x%llx", spte, level, get_rsvd_bits(&vcpu->arch.mmu->shadow_zero_check, spte, level)); + /* + * Mark the memslot dirty *after* modifying it for access tracking. + * Unlike folios, memslots can be safely marked dirty out of mmu_lock, + * i.e. in the fast page fault handler. + */ if ((spte & PT_WRITABLE_MASK) && kvm_slot_dirty_track_enabled(slot)) { /* Enforced by kvm_mmu_hugepage_adjust. */ WARN_ON_ONCE(level > PG_LEVEL_4K); @@ -268,15 +262,15 @@ out: return wrprot; } -static u64 make_spte_executable(u64 spte) +static u64 modify_spte_protections(u64 spte, u64 set, u64 clear) { bool is_access_track = is_access_track_spte(spte); if (is_access_track) spte = restore_acc_track_spte(spte); - spte &= ~shadow_nx_mask; - spte |= shadow_x_mask; + KVM_MMU_WARN_ON(set & clear); + spte = (spte | set) & ~clear; if (is_access_track) spte = mark_spte_for_access_track(spte); @@ -284,6 +278,16 @@ static u64 make_spte_executable(u64 spte) return spte; } +static u64 make_spte_executable(u64 spte) +{ + return modify_spte_protections(spte, shadow_x_mask, shadow_nx_mask); +} + +static u64 make_spte_nonexecutable(u64 spte) +{ + return modify_spte_protections(spte, shadow_nx_mask, shadow_x_mask); +} + /* * Construct an SPTE that maps a sub-page of the given huge page SPTE where * `index` identifies which sub-page. @@ -291,8 +295,8 @@ static u64 make_spte_executable(u64 spte) * This is used during huge page splitting to build the SPTEs that make up the * new page table. */ -u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, - union kvm_mmu_page_role role, int index) +u64 make_small_spte(struct kvm *kvm, u64 huge_spte, + union kvm_mmu_page_role role, int index) { u64 child_spte = huge_spte; @@ -320,6 +324,26 @@ u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, return child_spte; } +u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level) +{ + u64 huge_spte; + + KVM_BUG_ON(!is_shadow_present_pte(small_spte) || level == PG_LEVEL_4K, kvm); + + huge_spte = small_spte | PT_PAGE_SIZE_MASK; + + /* + * huge_spte already has the address of the sub-page being collapsed + * from small_spte, so just clear the lower address bits to create the + * huge page address. + */ + huge_spte &= KVM_HPAGE_MASK(level) | ~PAGE_MASK; + + if (is_nx_huge_page_enabled(kvm)) + huge_spte = make_spte_nonexecutable(huge_spte); + + return huge_spte; +} u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) { @@ -352,7 +376,7 @@ u64 mark_spte_for_access_track(u64 spte) spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) << SHADOW_ACC_TRACK_SAVED_BITS_SHIFT; - spte &= ~shadow_acc_track_mask; + spte &= ~(shadow_acc_track_mask | shadow_accessed_mask); return spte; } @@ -422,9 +446,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_me_spte_mask); void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) { + kvm_ad_enabled = has_ad_bits; + shadow_user_mask = VMX_EPT_READABLE_MASK; - shadow_accessed_mask = has_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull; - shadow_dirty_mask = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull; + shadow_accessed_mask = VMX_EPT_ACCESS_BIT; + shadow_dirty_mask = VMX_EPT_DIRTY_BIT; shadow_nx_mask = 0ull; shadow_x_mask = VMX_EPT_EXECUTABLE_MASK; /* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */ @@ -455,6 +481,8 @@ void kvm_mmu_reset_all_pte_masks(void) u8 low_phys_bits; u64 mask; + kvm_ad_enabled = true; + /* * If the CPU has 46 or less physical address bits, then set an * appropriate mask to guard against L1TF attacks. Otherwise, it is diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 2cb816ea2430..af10bc0380a3 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -167,6 +167,15 @@ static_assert(!(SHADOW_NONPRESENT_VALUE & SPTE_MMU_PRESENT_MASK)); #define SHADOW_NONPRESENT_VALUE 0ULL #endif + +/* + * True if A/D bits are supported in hardware and are enabled by KVM. When + * enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can disable + * A/D bits in EPTP12, SP and SPTE variants are needed to handle the scenario + * where KVM is using A/D bits for L1, but not L2. + */ +extern bool __read_mostly kvm_ad_enabled; + extern u64 __read_mostly shadow_host_writable_mask; extern u64 __read_mostly shadow_mmu_writable_mask; extern u64 __read_mostly shadow_nx_mask; @@ -285,17 +294,6 @@ static inline bool is_ept_ve_possible(u64 spte) (spte & VMX_EPT_RWX_MASK) != VMX_EPT_MISCONFIG_WX_VALUE; } -/* - * Returns true if A/D bits are supported in hardware and are enabled by KVM. - * When enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can - * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the - * scenario where KVM is using A/D bits for L1, but not L2. - */ -static inline bool kvm_ad_enabled(void) -{ - return !!shadow_accessed_mask; -} - static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) { return sp->role.ad_disabled; @@ -318,18 +316,6 @@ static inline bool spte_ad_need_write_protect(u64 spte) return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED; } -static inline u64 spte_shadow_accessed_mask(u64 spte) -{ - KVM_MMU_WARN_ON(!is_shadow_present_pte(spte)); - return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; -} - -static inline u64 spte_shadow_dirty_mask(u64 spte) -{ - KVM_MMU_WARN_ON(!is_shadow_present_pte(spte)); - return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; -} - static inline bool is_access_track_spte(u64 spte) { return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; @@ -357,17 +343,7 @@ static inline kvm_pfn_t spte_to_pfn(u64 pte) static inline bool is_accessed_spte(u64 spte) { - u64 accessed_mask = spte_shadow_accessed_mask(spte); - - return accessed_mask ? spte & accessed_mask - : !is_access_track_spte(spte); -} - -static inline bool is_dirty_spte(u64 spte) -{ - u64 dirty_mask = spte_shadow_dirty_mask(spte); - - return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK; + return spte & shadow_accessed_mask; } static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte, @@ -485,6 +461,50 @@ static inline bool is_mmu_writable_spte(u64 spte) return spte & shadow_mmu_writable_mask; } +/* + * Returns true if the access indicated by @fault is allowed by the existing + * SPTE protections. Note, the caller is responsible for checking that the + * SPTE is a shadow-present, leaf SPTE (either before or after). + */ +static inline bool is_access_allowed(struct kvm_page_fault *fault, u64 spte) +{ + if (fault->exec) + return is_executable_pte(spte); + + if (fault->write) + return is_writable_pte(spte); + + /* Fault was on Read access */ + return spte & PT_PRESENT_MASK; +} + +/* + * If the MMU-writable flag is cleared, i.e. the SPTE is write-protected for + * write-tracking, remote TLBs must be flushed, even if the SPTE was read-only, + * as KVM allows stale Writable TLB entries to exist. When dirty logging, KVM + * flushes TLBs based on whether or not dirty bitmap/ring entries were reaped, + * not whether or not SPTEs were modified, i.e. only the write-tracking case + * needs to flush at the time the SPTEs is modified, before dropping mmu_lock. + * + * Don't flush if the Accessed bit is cleared, as access tracking tolerates + * false negatives, e.g. KVM x86 omits TLB flushes even when aging SPTEs for a + * mmu_notifier.clear_flush_young() event. + * + * Lastly, don't flush if the Dirty bit is cleared, as KVM unconditionally + * flushes when enabling dirty logging (see kvm_mmu_slot_apply_flags()), and + * when clearing dirty logs, KVM flushes based on whether or not dirty entries + * were reaped from the bitmap/ring, not whether or not dirty SPTEs were found. + * + * Note, this logic only applies to shadow-present leaf SPTEs. The caller is + * responsible for checking that the old SPTE is shadow-present, and is also + * responsible for determining whether or not a TLB flush is required when + * modifying a shadow-present non-leaf SPTE. + */ +static inline bool leaf_spte_change_needs_tlb_flush(u64 old_spte, u64 new_spte) +{ + return is_mmu_writable_spte(old_spte) && !is_mmu_writable_spte(new_spte); +} + static inline u64 get_mmio_spte_generation(u64 spte) { u64 gen; @@ -499,10 +519,11 @@ bool spte_has_volatile_bits(u64 spte); bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, const struct kvm_memory_slot *slot, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, - u64 old_spte, bool prefetch, bool can_unsync, + u64 old_spte, bool prefetch, bool synchronizing, bool host_writable, u64 *new_spte); -u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, - union kvm_mmu_page_role role, int index); +u64 make_small_spte(struct kvm *kvm, u64 huge_spte, + union kvm_mmu_page_role role, int index); +u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level); u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); u64 mark_spte_for_access_track(u64 spte); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 3b996c1fdaab..2f15e0e33903 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -511,10 +511,6 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, if (is_leaf != was_leaf) kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1); - if (was_leaf && is_dirty_spte(old_spte) && - (!is_present || !is_dirty_spte(new_spte) || pfn_changed)) - kvm_set_pfn_dirty(spte_to_pfn(old_spte)); - /* * Recursively handle child PTs if the change removed a subtree from * the paging structure. Note the WARN on the PFN changing without the @@ -524,10 +520,6 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, if (was_present && !was_leaf && (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed))) handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared); - - if (was_leaf && is_accessed_spte(old_spte) && - (!is_present || !is_accessed_spte(new_spte) || pfn_changed)) - kvm_set_pfn_accessed(spte_to_pfn(old_spte)); } static inline int __must_check __tdp_mmu_set_spte_atomic(struct tdp_iter *iter, @@ -591,48 +583,6 @@ static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm, return 0; } -static inline int __must_check tdp_mmu_zap_spte_atomic(struct kvm *kvm, - struct tdp_iter *iter) -{ - int ret; - - lockdep_assert_held_read(&kvm->mmu_lock); - - /* - * Freeze the SPTE by setting it to a special, non-present value. This - * will stop other threads from immediately installing a present entry - * in its place before the TLBs are flushed. - * - * Delay processing of the zapped SPTE until after TLBs are flushed and - * the FROZEN_SPTE is replaced (see below). - */ - ret = __tdp_mmu_set_spte_atomic(iter, FROZEN_SPTE); - if (ret) - return ret; - - kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level); - - /* - * No other thread can overwrite the frozen SPTE as they must either - * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not - * overwrite the special frozen SPTE value. Use the raw write helper to - * avoid an unnecessary check on volatile bits. - */ - __kvm_tdp_mmu_write_spte(iter->sptep, SHADOW_NONPRESENT_VALUE); - - /* - * Process the zapped SPTE after flushing TLBs, and after replacing - * FROZEN_SPTE with 0. This minimizes the amount of time vCPUs are - * blocked by the FROZEN_SPTE and reduces contention on the child - * SPTEs. - */ - handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, - SHADOW_NONPRESENT_VALUE, iter->level, true); - - return 0; -} - - /* * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping * @kvm: KVM instance @@ -688,6 +638,16 @@ static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter, #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \ for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end) +static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm, + struct tdp_iter *iter) +{ + if (!need_resched() && !rwlock_needbreak(&kvm->mmu_lock)) + return false; + + /* Ensure forward progress has been made before yielding. */ + return iter->next_last_level_gfn != iter->yielded_gfn; +} + /* * Yield if the MMU lock is contended or this thread needs to return control * to the scheduler. @@ -706,31 +666,27 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter, bool flush, bool shared) { - WARN_ON_ONCE(iter->yielded); + KVM_MMU_WARN_ON(iter->yielded); - /* Ensure forward progress has been made before yielding. */ - if (iter->next_last_level_gfn == iter->yielded_gfn) + if (!tdp_mmu_iter_need_resched(kvm, iter)) return false; - if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { - if (flush) - kvm_flush_remote_tlbs(kvm); - - rcu_read_unlock(); + if (flush) + kvm_flush_remote_tlbs(kvm); - if (shared) - cond_resched_rwlock_read(&kvm->mmu_lock); - else - cond_resched_rwlock_write(&kvm->mmu_lock); + rcu_read_unlock(); - rcu_read_lock(); + if (shared) + cond_resched_rwlock_read(&kvm->mmu_lock); + else + cond_resched_rwlock_write(&kvm->mmu_lock); - WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn); + rcu_read_lock(); - iter->yielded = true; - } + WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn); - return iter->yielded; + iter->yielded = true; + return true; } static inline gfn_t tdp_mmu_max_gfn_exclusive(void) @@ -1026,19 +982,28 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, if (WARN_ON_ONCE(sp->role.level != fault->goal_level)) return RET_PF_RETRY; + if (fault->prefetch && is_shadow_present_pte(iter->old_spte)) + return RET_PF_SPURIOUS; + + if (is_shadow_present_pte(iter->old_spte) && + is_access_allowed(fault, iter->old_spte) && + is_last_spte(iter->old_spte, iter->level)) + return RET_PF_SPURIOUS; + if (unlikely(!fault->slot)) new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); else wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn, - fault->pfn, iter->old_spte, fault->prefetch, true, - fault->map_writable, &new_spte); + fault->pfn, iter->old_spte, fault->prefetch, + false, fault->map_writable, &new_spte); if (new_spte == iter->old_spte) ret = RET_PF_SPURIOUS; else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) return RET_PF_RETRY; else if (is_shadow_present_pte(iter->old_spte) && - !is_last_spte(iter->old_spte, iter->level)) + (!is_last_spte(iter->old_spte, iter->level) || + WARN_ON_ONCE(leaf_spte_change_needs_tlb_flush(iter->old_spte, new_spte)))) kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level); /* @@ -1078,7 +1043,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, struct kvm_mmu_page *sp, bool shared) { - u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled()); + u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled); int ret = 0; if (shared) { @@ -1195,33 +1160,6 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, return flush; } -typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter, - struct kvm_gfn_range *range); - -static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm, - struct kvm_gfn_range *range, - tdp_handler_t handler) -{ - struct kvm_mmu_page *root; - struct tdp_iter iter; - bool ret = false; - - /* - * Don't support rescheduling, none of the MMU notifiers that funnel - * into this helper allow blocking; it'd be dead, wasteful code. - */ - for_each_tdp_mmu_root(kvm, root, range->slot->as_id) { - rcu_read_lock(); - - tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) - ret |= handler(kvm, &iter, range); - - rcu_read_unlock(); - } - - return ret; -} - /* * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero * if any of the GFNs in the range have been accessed. @@ -1230,15 +1168,10 @@ static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm, * from the clear_young() or clear_flush_young() notifier, which uses the * return value to determine if the page has been accessed. */ -static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter, - struct kvm_gfn_range *range) +static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter) { u64 new_spte; - /* If we have a non-accessed entry we don't need to change the pte. */ - if (!is_accessed_spte(iter->old_spte)) - return false; - if (spte_ad_enabled(iter->old_spte)) { iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep, iter->old_spte, @@ -1246,13 +1179,6 @@ static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter, iter->level); new_spte = iter->old_spte & ~shadow_accessed_mask; } else { - /* - * Capture the dirty status of the page, so that it doesn't get - * lost when the SPTE is marked for access tracking. - */ - if (is_writable_pte(iter->old_spte)) - kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte)); - new_spte = mark_spte_for_access_track(iter->old_spte); iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep, iter->old_spte, new_spte, @@ -1261,23 +1187,48 @@ static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter, trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level, iter->old_spte, new_spte); - return true; } -bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) +static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, + struct kvm_gfn_range *range, + bool test_only) { - return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range); + struct kvm_mmu_page *root; + struct tdp_iter iter; + bool ret = false; + + /* + * Don't support rescheduling, none of the MMU notifiers that funnel + * into this helper allow blocking; it'd be dead, wasteful code. Note, + * this helper must NOT be used to unmap GFNs, as it processes only + * valid roots! + */ + for_each_valid_tdp_mmu_root(kvm, root, range->slot->as_id) { + guard(rcu)(); + + tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) { + if (!is_accessed_spte(iter.old_spte)) + continue; + + if (test_only) + return true; + + ret = true; + kvm_tdp_mmu_age_spte(&iter); + } + } + + return ret; } -static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter, - struct kvm_gfn_range *range) +bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - return is_accessed_spte(iter->old_spte); + return __kvm_tdp_mmu_age_gfn_range(kvm, range, false); } bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn); + return __kvm_tdp_mmu_age_gfn_range(kvm, range, true); } /* @@ -1368,7 +1319,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, * not been linked in yet and thus is not reachable from any other CPU. */ for (i = 0; i < SPTE_ENT_PER_PAGE; i++) - sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i); + sp->spt[i] = make_small_spte(kvm, huge_spte, sp->role, i); /* * Replace the huge spte with a pointer to the populated lower level @@ -1501,16 +1452,15 @@ static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp) * from level, so it is valid to key off any shadow page to determine if * write protection is needed for an entire tree. */ - return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled(); + return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled; } -static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end) +static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end) { const u64 dbit = tdp_mmu_need_write_protect(root) ? PT_WRITABLE_MASK : shadow_dirty_mask; struct tdp_iter iter; - bool spte_set = false; rcu_read_lock(); @@ -1531,31 +1481,24 @@ retry: if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit)) goto retry; - - spte_set = true; } rcu_read_unlock(); - return spte_set; } /* * Clear the dirty status (D-bit or W-bit) of all the SPTEs mapping GFNs in the - * memslot. Returns true if an SPTE has been changed and the TLBs need to be - * flushed. + * memslot. */ -bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, +void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, const struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; - bool spte_set = false; lockdep_assert_held_read(&kvm->mmu_lock); for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) - spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, - slot->base_gfn + slot->npages); - - return spte_set; + clear_dirty_gfn_range(kvm, root, slot->base_gfn, + slot->base_gfn + slot->npages); } static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, @@ -1593,7 +1536,6 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level, iter.old_spte, iter.old_spte & ~dbit); - kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte)); } rcu_read_unlock(); @@ -1615,21 +1557,55 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); } -static void zap_collapsible_spte_range(struct kvm *kvm, - struct kvm_mmu_page *root, - const struct kvm_memory_slot *slot) +static int tdp_mmu_make_huge_spte(struct kvm *kvm, + struct tdp_iter *parent, + u64 *huge_spte) +{ + struct kvm_mmu_page *root = spte_to_child_sp(parent->old_spte); + gfn_t start = parent->gfn; + gfn_t end = start + KVM_PAGES_PER_HPAGE(parent->level); + struct tdp_iter iter; + + tdp_root_for_each_leaf_pte(iter, root, start, end) { + /* + * Use the parent iterator when checking for forward progress so + * that KVM doesn't get stuck continuously trying to yield (i.e. + * returning -EAGAIN here and then failing the forward progress + * check in the caller ad nauseam). + */ + if (tdp_mmu_iter_need_resched(kvm, parent)) + return -EAGAIN; + + *huge_spte = make_huge_spte(kvm, iter.old_spte, parent->level); + return 0; + } + + return -ENOENT; +} + +static void recover_huge_pages_range(struct kvm *kvm, + struct kvm_mmu_page *root, + const struct kvm_memory_slot *slot) { gfn_t start = slot->base_gfn; gfn_t end = start + slot->npages; struct tdp_iter iter; int max_mapping_level; + bool flush = false; + u64 huge_spte; + int r; + + if (WARN_ON_ONCE(kvm_slot_dirty_track_enabled(slot))) + return; rcu_read_lock(); for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) { retry: - if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) + if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) { + flush = false; continue; + } if (iter.level > KVM_MAX_HUGEPAGE_LEVEL || !is_shadow_present_pte(iter.old_spte)) @@ -1653,31 +1629,40 @@ retry: if (iter.gfn < start || iter.gfn >= end) continue; - max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, - iter.gfn, PG_LEVEL_NUM); + max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn); if (max_mapping_level < iter.level) continue; - /* Note, a successful atomic zap also does a remote TLB flush. */ - if (tdp_mmu_zap_spte_atomic(kvm, &iter)) + r = tdp_mmu_make_huge_spte(kvm, &iter, &huge_spte); + if (r == -EAGAIN) + goto retry; + else if (r) + continue; + + if (tdp_mmu_set_spte_atomic(kvm, &iter, huge_spte)) goto retry; + + flush = true; } + if (flush) + kvm_flush_remote_tlbs_memslot(kvm, slot); + rcu_read_unlock(); } /* - * Zap non-leaf SPTEs (and free their associated page tables) which could - * be replaced by huge pages, for GFNs within the slot. + * Recover huge page mappings within the slot by replacing non-leaf SPTEs with + * huge SPTEs where possible. */ -void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot) +void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; lockdep_assert_held_read(&kvm->mmu_lock); for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) - zap_collapsible_spte_range(kvm, root, slot); + recover_huge_pages_range(kvm, root, slot); } /* diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 1b74e058a81c..f03ca0dd13d9 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -34,14 +34,14 @@ bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, const struct kvm_memory_slot *slot, int min_level); -bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, +void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, const struct kvm_memory_slot *slot); void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, unsigned long mask, bool wrprot); -void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot); +void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot); bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, |