diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-05-01 19:14:08 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-05-01 19:14:08 +0200 |
commit | 152d32aa846835987966fd20ee1143b0e05036a0 (patch) | |
tree | 728cfb095b62bb3cb3ede5ff12f70d0042db49d4 /virt/kvm/kvm_main.c | |
parent | Merge tag 'iommu-updates-v5.13' of git://git.kernel.org/pub/scm/linux/kernel/... (diff) | |
parent | KVM: selftests: Speed up set_memory_region_test (diff) | |
download | linux-152d32aa846835987966fd20ee1143b0e05036a0.tar.xz linux-152d32aa846835987966fd20ee1143b0e05036a0.zip |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Paolo Bonzini:
"This is a large update by KVM standards, including AMD PSP (Platform
Security Processor, aka "AMD Secure Technology") and ARM CoreSight
(debug and trace) changes.
ARM:
- CoreSight: Add support for ETE and TRBE
- Stage-2 isolation for the host kernel when running in protected
mode
- Guest SVE support when running in nVHE mode
- Force W^X hypervisor mappings in nVHE mode
- ITS save/restore for guests using direct injection with GICv4.1
- nVHE panics now produce readable backtraces
- Guest support for PTP using the ptp_kvm driver
- Performance improvements in the S2 fault handler
x86:
- AMD PSP driver changes
- Optimizations and cleanup of nested SVM code
- AMD: Support for virtual SPEC_CTRL
- Optimizations of the new MMU code: fast invalidation, zap under
read lock, enable/disably dirty page logging under read lock
- /dev/kvm API for AMD SEV live migration (guest API coming soon)
- support SEV virtual machines sharing the same encryption context
- support SGX in virtual machines
- add a few more statistics
- improved directed yield heuristics
- Lots and lots of cleanups
Generic:
- Rework of MMU notifier interface, simplifying and optimizing the
architecture-specific code
- a handful of "Get rid of oprofile leftovers" patches
- Some selftests improvements"
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (379 commits)
KVM: selftests: Speed up set_memory_region_test
selftests: kvm: Fix the check of return value
KVM: x86: Take advantage of kvm_arch_dy_has_pending_interrupt()
KVM: SVM: Skip SEV cache flush if no ASIDs have been used
KVM: SVM: Remove an unnecessary prototype declaration of sev_flush_asids()
KVM: SVM: Drop redundant svm_sev_enabled() helper
KVM: SVM: Move SEV VMCB tracking allocation to sev.c
KVM: SVM: Explicitly check max SEV ASID during sev_hardware_setup()
KVM: SVM: Unconditionally invoke sev_hardware_teardown()
KVM: SVM: Enable SEV/SEV-ES functionality by default (when supported)
KVM: SVM: Condition sev_enabled and sev_es_enabled on CONFIG_KVM_AMD_SEV=y
KVM: SVM: Append "_enabled" to module-scoped SEV/SEV-ES control variables
KVM: SEV: Mask CPUID[0x8000001F].eax according to supported features
KVM: SVM: Move SEV module params/variables to sev.c
KVM: SVM: Disable SEV/SEV-ES if NPT is disabled
KVM: SVM: Free sev_asid_bitmap during init if SEV setup fails
KVM: SVM: Zero out the VMCB array used to track SEV ASID association
x86/sev: Drop redundant and potentially misleading 'sev_enabled'
KVM: x86: Move reverse CPUID helpers to separate header file
KVM: x86: Rename GPR accessors to make mode-aware variants the defaults
...
Diffstat (limited to 'virt/kvm/kvm_main.c')
-rw-r--r-- | virt/kvm/kvm_main.c | 303 |
1 files changed, 228 insertions, 75 deletions
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 383df23514b9..2799c6660cce 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -451,35 +451,170 @@ static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn, srcu_read_unlock(&kvm->srcu, idx); } -static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long address, - pte_t pte) +typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range); + +typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start, + unsigned long end); + +struct kvm_hva_range { + unsigned long start; + unsigned long end; + pte_t pte; + hva_handler_t handler; + on_lock_fn_t on_lock; + bool flush_on_ret; + bool may_block; +}; + +/* + * Use a dedicated stub instead of NULL to indicate that there is no callback + * function/handler. The compiler technically can't guarantee that a real + * function will have a non-zero address, and so it will generate code to + * check for !NULL, whereas comparing against a stub will be elided at compile + * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9). + */ +static void kvm_null_fn(void) { - struct kvm *kvm = mmu_notifier_to_kvm(mn); - int idx; + +} +#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn) + +static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, + const struct kvm_hva_range *range) +{ + bool ret = false, locked = false; + struct kvm_gfn_range gfn_range; + struct kvm_memory_slot *slot; + struct kvm_memslots *slots; + int i, idx; + + /* A null handler is allowed if and only if on_lock() is provided. */ + if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) && + IS_KVM_NULL_FN(range->handler))) + return 0; idx = srcu_read_lock(&kvm->srcu); - KVM_MMU_LOCK(kvm); + /* The on_lock() path does not yet support lock elision. */ + if (!IS_KVM_NULL_FN(range->on_lock)) { + locked = true; + KVM_MMU_LOCK(kvm); - kvm->mmu_notifier_seq++; + range->on_lock(kvm, range->start, range->end); + + if (IS_KVM_NULL_FN(range->handler)) + goto out_unlock; + } - if (kvm_set_spte_hva(kvm, address, pte)) + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(slot, slots) { + unsigned long hva_start, hva_end; + + hva_start = max(range->start, slot->userspace_addr); + hva_end = min(range->end, slot->userspace_addr + + (slot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + + /* + * To optimize for the likely case where the address + * range is covered by zero or one memslots, don't + * bother making these conditional (to avoid writes on + * the second or later invocation of the handler). + */ + gfn_range.pte = range->pte; + gfn_range.may_block = range->may_block; + + /* + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn_start, gfn_start+1, ..., gfn_end-1}. + */ + gfn_range.start = hva_to_gfn_memslot(hva_start, slot); + gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot); + gfn_range.slot = slot; + + if (!locked) { + locked = true; + KVM_MMU_LOCK(kvm); + } + ret |= range->handler(kvm, &gfn_range); + } + } + + if (range->flush_on_ret && (ret || kvm->tlbs_dirty)) kvm_flush_remote_tlbs(kvm); - KVM_MMU_UNLOCK(kvm); +out_unlock: + if (locked) + KVM_MMU_UNLOCK(kvm); + srcu_read_unlock(&kvm->srcu, idx); + + /* The notifiers are averse to booleans. :-( */ + return (int)ret; } -static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, - const struct mmu_notifier_range *range) +static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn, + unsigned long start, + unsigned long end, + pte_t pte, + hva_handler_t handler) { struct kvm *kvm = mmu_notifier_to_kvm(mn); - int need_tlb_flush = 0, idx; + const struct kvm_hva_range range = { + .start = start, + .end = end, + .pte = pte, + .handler = handler, + .on_lock = (void *)kvm_null_fn, + .flush_on_ret = true, + .may_block = false, + }; + + return __kvm_handle_hva_range(kvm, &range); +} + +static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn, + unsigned long start, + unsigned long end, + hva_handler_t handler) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + const struct kvm_hva_range range = { + .start = start, + .end = end, + .pte = __pte(0), + .handler = handler, + .on_lock = (void *)kvm_null_fn, + .flush_on_ret = false, + .may_block = false, + }; + + return __kvm_handle_hva_range(kvm, &range); +} +static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address, + pte_t pte) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + + trace_kvm_set_spte_hva(address); - idx = srcu_read_lock(&kvm->srcu); - KVM_MMU_LOCK(kvm); + /* + * .change_pte() must be surrounded by .invalidate_range_{start,end}(), + * and so always runs with an elevated notifier count. This obviates + * the need to bump the sequence count. + */ + WARN_ON_ONCE(!kvm->mmu_notifier_count); + + kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn); +} + +static void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start, + unsigned long end) +{ /* * The count increase must become visible at unlock time as no * spte can be established without taking the mmu_lock and @@ -487,8 +622,8 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, */ kvm->mmu_notifier_count++; if (likely(kvm->mmu_notifier_count == 1)) { - kvm->mmu_notifier_range_start = range->start; - kvm->mmu_notifier_range_end = range->end; + kvm->mmu_notifier_range_start = start; + kvm->mmu_notifier_range_end = end; } else { /* * Fully tracking multiple concurrent ranges has dimishing @@ -500,28 +635,36 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, * complete. */ kvm->mmu_notifier_range_start = - min(kvm->mmu_notifier_range_start, range->start); + min(kvm->mmu_notifier_range_start, start); kvm->mmu_notifier_range_end = - max(kvm->mmu_notifier_range_end, range->end); + max(kvm->mmu_notifier_range_end, end); } - need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end, - range->flags); - /* we've to flush the tlb before the pages can be freed */ - if (need_tlb_flush || kvm->tlbs_dirty) - kvm_flush_remote_tlbs(kvm); - - KVM_MMU_UNLOCK(kvm); - srcu_read_unlock(&kvm->srcu, idx); - - return 0; } -static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, +static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, const struct mmu_notifier_range *range) { struct kvm *kvm = mmu_notifier_to_kvm(mn); + const struct kvm_hva_range hva_range = { + .start = range->start, + .end = range->end, + .pte = __pte(0), + .handler = kvm_unmap_gfn_range, + .on_lock = kvm_inc_notifier_count, + .flush_on_ret = true, + .may_block = mmu_notifier_range_blockable(range), + }; - KVM_MMU_LOCK(kvm); + trace_kvm_unmap_hva_range(range->start, range->end); + + __kvm_handle_hva_range(kvm, &hva_range); + + return 0; +} + +static void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start, + unsigned long end) +{ /* * This sequence increase will notify the kvm page fault that * the page that is going to be mapped in the spte could have @@ -535,7 +678,23 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, * in conjunction with the smp_rmb in mmu_notifier_retry(). */ kvm->mmu_notifier_count--; - KVM_MMU_UNLOCK(kvm); +} + +static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, + const struct mmu_notifier_range *range) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + const struct kvm_hva_range hva_range = { + .start = range->start, + .end = range->end, + .pte = __pte(0), + .handler = (void *)kvm_null_fn, + .on_lock = kvm_dec_notifier_count, + .flush_on_ret = false, + .may_block = mmu_notifier_range_blockable(range), + }; + + __kvm_handle_hva_range(kvm, &hva_range); BUG_ON(kvm->mmu_notifier_count < 0); } @@ -545,20 +704,9 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, unsigned long start, unsigned long end) { - struct kvm *kvm = mmu_notifier_to_kvm(mn); - int young, idx; + trace_kvm_age_hva(start, end); - idx = srcu_read_lock(&kvm->srcu); - KVM_MMU_LOCK(kvm); - - young = kvm_age_hva(kvm, start, end); - if (young) - kvm_flush_remote_tlbs(kvm); - - KVM_MMU_UNLOCK(kvm); - srcu_read_unlock(&kvm->srcu, idx); - - return young; + return kvm_handle_hva_range(mn, start, end, __pte(0), kvm_age_gfn); } static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, @@ -566,11 +714,8 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, unsigned long start, unsigned long end) { - struct kvm *kvm = mmu_notifier_to_kvm(mn); - int young, idx; + trace_kvm_age_hva(start, end); - idx = srcu_read_lock(&kvm->srcu); - KVM_MMU_LOCK(kvm); /* * Even though we do not flush TLB, this will still adversely * affect performance on pre-Haswell Intel EPT, where there is @@ -584,27 +729,17 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, * cadence. If we find this inaccurate, we might come up with a * more sophisticated heuristic later. */ - young = kvm_age_hva(kvm, start, end); - KVM_MMU_UNLOCK(kvm); - srcu_read_unlock(&kvm->srcu, idx); - - return young; + return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn); } static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long address) { - struct kvm *kvm = mmu_notifier_to_kvm(mn); - int young, idx; + trace_kvm_test_age_hva(address); - idx = srcu_read_lock(&kvm->srcu); - KVM_MMU_LOCK(kvm); - young = kvm_test_age_hva(kvm, address); - KVM_MMU_UNLOCK(kvm); - srcu_read_unlock(&kvm->srcu, idx); - - return young; + return kvm_handle_hva_range_no_flush(mn, address, address + 1, + kvm_test_age_gfn); } static void kvm_mmu_notifier_release(struct mmu_notifier *mn, @@ -3002,6 +3137,11 @@ static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu) return false; } +bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) +{ + return false; +} + void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) { struct kvm *kvm = me->kvm; @@ -3035,7 +3175,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) !vcpu_dy_runnable(vcpu)) continue; if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode && - !kvm_arch_vcpu_in_kernel(vcpu)) + !kvm_arch_dy_has_pending_interrupt(vcpu) && + !kvm_arch_vcpu_in_kernel(vcpu)) continue; if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) continue; @@ -3182,7 +3323,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) if (r) goto vcpu_decrement; - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); if (!vcpu) { r = -ENOMEM; goto vcpu_decrement; @@ -4062,6 +4203,12 @@ static struct file_operations kvm_vm_fops = { KVM_COMPAT(kvm_vm_compat_ioctl), }; +bool file_is_kvm(struct file *file) +{ + return file && file->f_op == &kvm_vm_fops; +} +EXPORT_SYMBOL_GPL(file_is_kvm); + static int kvm_dev_ioctl_create_vm(unsigned long type) { int r; @@ -4485,24 +4632,26 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, return 0; } -/* Caller must hold slots_lock. */ -void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, - struct kvm_io_device *dev) +int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev) { int i, j; struct kvm_io_bus *new_bus, *bus; + lockdep_assert_held(&kvm->slots_lock); + bus = kvm_get_bus(kvm, bus_idx); if (!bus) - return; + return 0; - for (i = 0; i < bus->dev_count; i++) + for (i = 0; i < bus->dev_count; i++) { if (bus->range[i].dev == dev) { break; } + } if (i == bus->dev_count) - return; + return 0; new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1), GFP_KERNEL_ACCOUNT); @@ -4511,7 +4660,13 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, new_bus->dev_count--; memcpy(new_bus->range + i, bus->range + i + 1, flex_array_size(new_bus, range, new_bus->dev_count - i)); - } else { + } + + rcu_assign_pointer(kvm->buses[bus_idx], new_bus); + synchronize_srcu_expedited(&kvm->srcu); + + /* Destroy the old bus _after_ installing the (null) bus. */ + if (!new_bus) { pr_err("kvm: failed to shrink bus, removing it completely\n"); for (j = 0; j < bus->dev_count; j++) { if (j == i) @@ -4520,10 +4675,8 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, } } - rcu_assign_pointer(kvm->buses[bus_idx], new_bus); - synchronize_srcu_expedited(&kvm->srcu); kfree(bus); - return; + return new_bus ? 0 : -ENOMEM; } struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, |