diff options
Diffstat (limited to 'virt/kvm/kvm_main.c')
-rw-r--r-- | virt/kvm/kvm_main.c | 197 |
1 files changed, 147 insertions, 50 deletions
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b50dbe269f4b..439d3b4cd1a9 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -189,16 +189,6 @@ bool kvm_is_reserved_pfn(kvm_pfn_t pfn) return true; } -bool kvm_is_transparent_hugepage(kvm_pfn_t pfn) -{ - struct page *page = pfn_to_page(pfn); - - if (!PageTransCompoundMap(page)) - return false; - - return is_transparent_hugepage(compound_head(page)); -} - /* * Switches to specified vcpu, until a matching vcpu_put() */ @@ -318,6 +308,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) */ long dirty_count = smp_load_acquire(&kvm->tlbs_dirty); + ++kvm->stat.generic.remote_tlb_flush_requests; /* * We want to publish modifications to the page tables before reading * mode. Pairs with a memory barrier in arch-specific code. @@ -415,6 +406,7 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) vcpu->preempted = false; vcpu->ready = false; preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); + vcpu->last_used_slot = 0; } void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) @@ -496,17 +488,6 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, idx = srcu_read_lock(&kvm->srcu); - /* The on_lock() path does not yet support lock elision. */ - if (!IS_KVM_NULL_FN(range->on_lock)) { - locked = true; - KVM_MMU_LOCK(kvm); - - range->on_lock(kvm, range->start, range->end); - - if (IS_KVM_NULL_FN(range->handler)) - goto out_unlock; - } - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot(slot, slots) { @@ -538,6 +519,10 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, if (!locked) { locked = true; KVM_MMU_LOCK(kvm); + if (!IS_KVM_NULL_FN(range->on_lock)) + range->on_lock(kvm, range->start, range->end); + if (IS_KVM_NULL_FN(range->handler)) + break; } ret |= range->handler(kvm, &gfn_range); } @@ -546,7 +531,6 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, if (range->flush_on_ret && (ret || kvm->tlbs_dirty)) kvm_flush_remote_tlbs(kvm); -out_unlock: if (locked) KVM_MMU_UNLOCK(kvm); @@ -604,16 +588,20 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, trace_kvm_set_spte_hva(address); /* - * .change_pte() must be surrounded by .invalidate_range_{start,end}(), - * and so always runs with an elevated notifier count. This obviates - * the need to bump the sequence count. + * .change_pte() must be surrounded by .invalidate_range_{start,end}(). + * If mmu_notifier_count is zero, then no in-progress invalidations, + * including this one, found a relevant memslot at start(); rechecking + * memslots here is unnecessary. Note, a false positive (count elevated + * by a different invalidation) is sub-optimal but functionally ok. */ - WARN_ON_ONCE(!kvm->mmu_notifier_count); + WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count)); + if (!READ_ONCE(kvm->mmu_notifier_count)) + return; kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn); } -static void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start, +void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start, unsigned long end) { /* @@ -658,12 +646,24 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, trace_kvm_unmap_hva_range(range->start, range->end); + /* + * Prevent memslot modification between range_start() and range_end() + * so that conditionally locking provides the same result in both + * functions. Without that guarantee, the mmu_notifier_count + * adjustments will be imbalanced. + * + * Pairs with the decrement in range_end(). + */ + spin_lock(&kvm->mn_invalidate_lock); + kvm->mn_active_invalidate_count++; + spin_unlock(&kvm->mn_invalidate_lock); + __kvm_handle_hva_range(kvm, &hva_range); return 0; } -static void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start, +void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start, unsigned long end) { /* @@ -694,9 +694,22 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, .flush_on_ret = false, .may_block = mmu_notifier_range_blockable(range), }; + bool wake; __kvm_handle_hva_range(kvm, &hva_range); + /* Pairs with the increment in range_start(). */ + spin_lock(&kvm->mn_invalidate_lock); + wake = (--kvm->mn_active_invalidate_count == 0); + spin_unlock(&kvm->mn_invalidate_lock); + + /* + * There can only be one waiter, since the wait happens under + * slots_lock. + */ + if (wake) + rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait); + BUG_ON(kvm->mmu_notifier_count < 0); } @@ -897,7 +910,7 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) char dir_name[ITOA_MAX_LEN * 2]; struct kvm_stat_data *stat_data; const struct _kvm_stats_desc *pdesc; - int i; + int i, ret; int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc + kvm_vcpu_stats_header.num_desc; @@ -954,6 +967,13 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) kvm->debugfs_dentry, stat_data, &stat_fops_per_vm); } + + ret = kvm_arch_create_vm_debugfs(kvm); + if (ret) { + kvm_destroy_vm_debugfs(kvm); + return i; + } + return 0; } @@ -974,6 +994,17 @@ void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm) { } +/* + * Called after per-vm debugfs created. When called kvm->debugfs_dentry should + * be setup already, so we can create arch-specific debugfs entries under it. + * Cleanup should be automatic done in kvm_destroy_vm_debugfs() recursively, so + * a per-arch destroy interface is not needed. + */ +int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm) +{ + return 0; +} + static struct kvm *kvm_create_vm(unsigned long type) { struct kvm *kvm = kvm_arch_alloc_vm(); @@ -991,6 +1022,9 @@ static struct kvm *kvm_create_vm(unsigned long type) mutex_init(&kvm->irq_lock); mutex_init(&kvm->slots_lock); mutex_init(&kvm->slots_arch_lock); + spin_lock_init(&kvm->mn_invalidate_lock); + rcuwait_init(&kvm->mn_memslots_update_rcuwait); + INIT_LIST_HEAD(&kvm->devices); BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); @@ -1113,6 +1147,16 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm_coalesced_mmio_free(kvm); #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); + /* + * At this point, pending calls to invalidate_range_start() + * have completed but no more MMU notifiers will run, so + * mn_active_invalidate_count may remain unbalanced. + * No threads can be waiting in install_new_memslots as the + * last reference on KVM has been dropped, but freeing + * memslots would deadlock without this manual intervention. + */ + WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait)); + kvm->mn_active_invalidate_count = 0; #else kvm_arch_flush_shadow_all(kvm); #endif @@ -1134,6 +1178,16 @@ void kvm_get_kvm(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_get_kvm); +/* + * Make sure the vm is not during destruction, which is a safe version of + * kvm_get_kvm(). Return true if kvm referenced successfully, false otherwise. + */ +bool kvm_get_kvm_safe(struct kvm *kvm) +{ + return refcount_inc_not_zero(&kvm->users_count); +} +EXPORT_SYMBOL_GPL(kvm_get_kvm_safe); + void kvm_put_kvm(struct kvm *kvm) { if (refcount_dec_and_test(&kvm->users_count)) @@ -1194,8 +1248,8 @@ static inline void kvm_memslot_delete(struct kvm_memslots *slots, slots->used_slots--; - if (atomic_read(&slots->lru_slot) >= slots->used_slots) - atomic_set(&slots->lru_slot, 0); + if (atomic_read(&slots->last_used_slot) >= slots->used_slots) + atomic_set(&slots->last_used_slot, 0); for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) { mslots[i] = mslots[i + 1]; @@ -1364,7 +1418,22 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; + /* + * Do not store the new memslots while there are invalidations in + * progress, otherwise the locking in invalidate_range_start and + * invalidate_range_end will be unbalanced. + */ + spin_lock(&kvm->mn_invalidate_lock); + prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait); + while (kvm->mn_active_invalidate_count) { + set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(&kvm->mn_invalidate_lock); + schedule(); + spin_lock(&kvm->mn_invalidate_lock); + } + finish_rcuwait(&kvm->mn_memslots_update_rcuwait); rcu_assign_pointer(kvm->memslots[as_id], slots); + spin_unlock(&kvm->mn_invalidate_lock); /* * Acquired in kvm_set_memslot. Must be released before synchronize @@ -1980,7 +2049,26 @@ EXPORT_SYMBOL_GPL(gfn_to_memslot); struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn) { - return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn); + struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu); + struct kvm_memory_slot *slot; + int slot_index; + + slot = try_get_memslot(slots, vcpu->last_used_slot, gfn); + if (slot) + return slot; + + /* + * Fall back to searching all memslots. We purposely use + * search_memslots() instead of __gfn_to_memslot() to avoid + * thrashing the VM-wide last_used_index in kvm_memslots. + */ + slot = search_memslots(slots, gfn, &slot_index); + if (slot) { + vcpu->last_used_slot = slot_index; + return slot; + } + + return NULL; } EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot); @@ -2239,7 +2327,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, * Get a reference here because callers of *hva_to_pfn* and * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the * returned pfn. This is only needed if the VMA has VM_MIXEDMAP - * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will + * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will * simply do nothing for reserved pfns. * * Whoever called remap_pfn_range is also going to call e.g. @@ -2636,13 +2724,6 @@ void kvm_set_pfn_accessed(kvm_pfn_t pfn) } EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); -void kvm_get_pfn(kvm_pfn_t pfn) -{ - if (!kvm_is_reserved_pfn(pfn)) - get_page(pfn_to_page(pfn)); -} -EXPORT_SYMBOL_GPL(kvm_get_pfn); - static int next_segment(unsigned long len, int offset) { if (len > PAGE_SIZE - offset) @@ -3122,13 +3203,23 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) ++vcpu->stat.generic.halt_successful_poll; if (!vcpu_valid_wakeup(vcpu)) ++vcpu->stat.generic.halt_poll_invalid; + + KVM_STATS_LOG_HIST_UPDATE( + vcpu->stat.generic.halt_poll_success_hist, + ktime_to_ns(ktime_get()) - + ktime_to_ns(start)); goto out; } cpu_relax(); poll_end = cur = ktime_get(); } while (kvm_vcpu_can_poll(cur, stop)); + + KVM_STATS_LOG_HIST_UPDATE( + vcpu->stat.generic.halt_poll_fail_hist, + ktime_to_ns(ktime_get()) - ktime_to_ns(start)); } + prepare_to_rcuwait(&vcpu->wait); for (;;) { set_current_state(TASK_INTERRUPTIBLE); @@ -3141,6 +3232,12 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) } finish_rcuwait(&vcpu->wait); cur = ktime_get(); + if (waited) { + vcpu->stat.generic.halt_wait_ns += + ktime_to_ns(cur) - ktime_to_ns(poll_end); + KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist, + ktime_to_ns(cur) - ktime_to_ns(poll_end)); + } out: kvm_arch_vcpu_unblocking(vcpu); block_ns = ktime_to_ns(cur) - ktime_to_ns(start); @@ -3612,7 +3709,7 @@ static long kvm_vcpu_ioctl(struct file *filp, struct kvm_fpu *fpu = NULL; struct kvm_sregs *kvm_sregs = NULL; - if (vcpu->kvm->mm != current->mm) + if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged) return -EIO; if (unlikely(_IOC_TYPE(ioctl) != KVMIO)) @@ -3822,7 +3919,7 @@ static long kvm_vcpu_compat_ioctl(struct file *filp, void __user *argp = compat_ptr(arg); int r; - if (vcpu->kvm->mm != current->mm) + if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged) return -EIO; switch (ioctl) { @@ -3888,7 +3985,7 @@ static long kvm_device_ioctl(struct file *filp, unsigned int ioctl, { struct kvm_device *dev = filp->private_data; - if (dev->kvm->mm != current->mm) + if (dev->kvm->mm != current->mm || dev->kvm->vm_bugged) return -EIO; switch (ioctl) { @@ -4210,7 +4307,7 @@ static long kvm_vm_ioctl(struct file *filp, void __user *argp = (void __user *)arg; int r; - if (kvm->mm != current->mm) + if (kvm->mm != current->mm || kvm->vm_bugged) return -EIO; switch (ioctl) { case KVM_CREATE_VCPU: @@ -4421,7 +4518,7 @@ static long kvm_vm_compat_ioctl(struct file *filp, struct kvm *kvm = filp->private_data; int r; - if (kvm->mm != current->mm) + if (kvm->mm != current->mm || kvm->vm_bugged) return -EIO; switch (ioctl) { #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT @@ -4983,12 +5080,12 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file, struct kvm_stat_data *stat_data = (struct kvm_stat_data *) inode->i_private; - /* The debugfs files are a reference to the kvm struct which - * is still valid when kvm_destroy_vm is called. - * To avoid the race between open and the removal of the debugfs - * directory we test against the users count. + /* + * The debugfs files are a reference to the kvm struct which + * is still valid when kvm_destroy_vm is called. kvm_get_kvm_safe + * avoids the race between open and the removal of the debugfs directory. */ - if (!refcount_inc_not_zero(&stat_data->kvm->users_count)) + if (!kvm_get_kvm_safe(stat_data->kvm)) return -ENOENT; if (simple_attr_open(inode, file, get, |