author     Paolo Bonzini <pbonzini@redhat.com>    2024-11-13 12:24:19 +0100
committer  Paolo Bonzini <pbonzini@redhat.com>    2024-11-13 12:24:19 +0100
commit     b39d1578d504122527e88127fdafb6109c3916be (patch)
tree       43e4e2b931cb161e2eeaa4ff47e1a89bb5e9f636 /virt/kvm
parent     Merge tag 'kvm-s390-next-6.13-1' of https://git.kernel.org/pub/scm/linux/kern... (diff)
parent     KVM: Protect vCPU's "last run PID" with rwlock, not RCU (diff)
Merge tag 'kvm-x86-generic-6.13' of https://github.com/kvm-x86/linux into HEAD
KVM generic changes for 6.13
- Rework kvm_vcpu_on_spin() to use a single for-loop instead of making two
  partial passes over "all" vCPUs. Opportunistically expand the comment
  to better explain the motivation and logic. (The loop shape is illustrated
  in the first sketch below this list.)
- Protect vcpu->pid accesses outside of vcpu->mutex with an rwlock instead
  of RCU, so that running a vCPU on a different task doesn't encounter
  long stalls due to having to wait for all CPUs to become quiescent. (The
  locking pattern is illustrated in the second sketch below this list.)
Diffstat (limited to 'virt/kvm')
-rw-r--r--   virt/kvm/kvm_main.c | 143
1 file changed, 83 insertions(+), 60 deletions(-)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b684c332782c..ec6fc8164f66 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -447,6 +447,7 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	vcpu->kvm = kvm;
 	vcpu->vcpu_id = id;
 	vcpu->pid = NULL;
+	rwlock_init(&vcpu->pid_lock);
 #ifndef __KVM_HAVE_ARCH_WQP
 	rcuwait_init(&vcpu->wait);
 #endif
@@ -474,7 +475,7 @@ static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
 	 * the vcpu->pid pointer, and at destruction time all file descriptors
 	 * are already gone.
 	 */
-	put_pid(rcu_dereference_protected(vcpu->pid, 1));
+	put_pid(vcpu->pid);
 
 	free_page((unsigned long)vcpu->run);
 	kmem_cache_free(kvm_vcpu_cache, vcpu);
@@ -3770,17 +3771,19 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
 
 int kvm_vcpu_yield_to(struct kvm_vcpu *target)
 {
-	struct pid *pid;
 	struct task_struct *task = NULL;
-	int ret = 0;
+	int ret;
+
+	if (!read_trylock(&target->pid_lock))
+		return 0;
+
+	if (target->pid)
+		task = get_pid_task(target->pid, PIDTYPE_PID);
+
+	read_unlock(&target->pid_lock);
 
-	rcu_read_lock();
-	pid = rcu_dereference(target->pid);
-	if (pid)
-		task = get_pid_task(pid, PIDTYPE_PID);
-	rcu_read_unlock();
 	if (!task)
-		return ret;
+		return 0;
 	ret = yield_to(task, 1);
 	put_task_struct(task);
 
@@ -3869,59 +3872,71 @@ bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
 
 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
 {
+	int nr_vcpus, start, i, idx, yielded;
 	struct kvm *kvm = me->kvm;
 	struct kvm_vcpu *vcpu;
-	int last_boosted_vcpu;
-	unsigned long i;
-	int yielded = 0;
 	int try = 3;
-	int pass;
 
-	last_boosted_vcpu = READ_ONCE(kvm->last_boosted_vcpu);
+	nr_vcpus = atomic_read(&kvm->online_vcpus);
+	if (nr_vcpus < 2)
+		return;
+
+	/* Pairs with the smp_wmb() in kvm_vm_ioctl_create_vcpu(). */
+	smp_rmb();
+
 	kvm_vcpu_set_in_spin_loop(me, true);
+
 	/*
-	 * We boost the priority of a VCPU that is runnable but not
-	 * currently running, because it got preempted by something
-	 * else and called schedule in __vcpu_run.  Hopefully that
-	 * VCPU is holding the lock that we need and will release it.
-	 * We approximate round-robin by starting at the last boosted VCPU.
+	 * The current vCPU ("me") is spinning in kernel mode, i.e. is likely
+	 * waiting for a resource to become available.  Attempt to yield to a
+	 * vCPU that is runnable, but not currently running, e.g. because the
+	 * vCPU was preempted by a higher priority task.  With luck, the vCPU
+	 * that was preempted is holding a lock or some other resource that the
+	 * current vCPU is waiting to acquire, and yielding to the other vCPU
+	 * will allow it to make forward progress and release the lock (or kick
+	 * the spinning vCPU, etc).
+	 *
+	 * Since KVM has no insight into what exactly the guest is doing,
+	 * approximate a round-robin selection by iterating over all vCPUs,
+	 * starting at the last boosted vCPU.  I.e. if N=kvm->last_boosted_vcpu,
+	 * iterate over vCPU[N+1]..vCPU[N-1], wrapping as needed.
+	 *
+	 * Note, this is inherently racy, e.g. if multiple vCPUs are spinning,
+	 * they may all try to yield to the same vCPU(s).  But as above, this
+	 * is all best effort due to KVM's lack of visibility into the guest.
 	 */
-	for (pass = 0; pass < 2 && !yielded && try; pass++) {
-		kvm_for_each_vcpu(i, vcpu, kvm) {
-			if (!pass && i <= last_boosted_vcpu) {
-				i = last_boosted_vcpu;
-				continue;
-			} else if (pass && i > last_boosted_vcpu)
-				break;
-			if (!READ_ONCE(vcpu->ready))
-				continue;
-			if (vcpu == me)
-				continue;
-			if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
-				continue;
+	start = READ_ONCE(kvm->last_boosted_vcpu) + 1;
+	for (i = 0; i < nr_vcpus; i++) {
+		idx = (start + i) % nr_vcpus;
+		if (idx == me->vcpu_idx)
+			continue;
 
-			/*
-			 * Treat the target vCPU as being in-kernel if it has a
-			 * pending interrupt, as the vCPU trying to yield may
-			 * be spinning waiting on IPI delivery, i.e. the target
-			 * vCPU is in-kernel for the purposes of directed yield.
-			 */
-			if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
-			    !kvm_arch_dy_has_pending_interrupt(vcpu) &&
-			    !kvm_arch_vcpu_preempted_in_kernel(vcpu))
-				continue;
-			if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
-				continue;
+		vcpu = xa_load(&kvm->vcpu_array, idx);
+		if (!READ_ONCE(vcpu->ready))
+			continue;
+		if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
+			continue;
 
-			yielded = kvm_vcpu_yield_to(vcpu);
-			if (yielded > 0) {
-				WRITE_ONCE(kvm->last_boosted_vcpu, i);
-				break;
-			} else if (yielded < 0) {
-				try--;
-				if (!try)
-					break;
-			}
+		/*
+		 * Treat the target vCPU as being in-kernel if it has a pending
+		 * interrupt, as the vCPU trying to yield may be spinning
+		 * waiting on IPI delivery, i.e. the target vCPU is in-kernel
+		 * for the purposes of directed yield.
+		 */
+		if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
+		    !kvm_arch_dy_has_pending_interrupt(vcpu) &&
+		    !kvm_arch_vcpu_preempted_in_kernel(vcpu))
+			continue;
+
+		if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
+			continue;
+
+		yielded = kvm_vcpu_yield_to(vcpu);
+		if (yielded > 0) {
+			WRITE_ONCE(kvm->last_boosted_vcpu, i);
+			break;
+		} else if (yielded < 0 && !--try) {
+			break;
 		}
 	}
 	kvm_vcpu_set_in_spin_loop(me, false);
@@ -4018,9 +4033,9 @@ static int vcpu_get_pid(void *data, u64 *val)
 {
 	struct kvm_vcpu *vcpu = data;
 
-	rcu_read_lock();
-	*val = pid_nr(rcu_dereference(vcpu->pid));
-	rcu_read_unlock();
+	read_lock(&vcpu->pid_lock);
+	*val = pid_nr(vcpu->pid);
+	read_unlock(&vcpu->pid_lock);
 	return 0;
 }
 
@@ -4306,7 +4321,14 @@ static long kvm_vcpu_ioctl(struct file *filp,
 		r = -EINVAL;
 		if (arg)
 			goto out;
-		oldpid = rcu_access_pointer(vcpu->pid);
+
+		/*
+		 * Note, vcpu->pid is primarily protected by vcpu->mutex. The
+		 * dedicated r/w lock allows other tasks, e.g. other vCPUs, to
+		 * read vcpu->pid while this vCPU is in KVM_RUN, e.g. to yield
+		 * directly to this vCPU.
+		 */
+		oldpid = vcpu->pid;
 		if (unlikely(oldpid != task_pid(current))) {
 			/* The thread running this VCPU changed. */
 			struct pid *newpid;
@@ -4316,9 +4338,10 @@ static long kvm_vcpu_ioctl(struct file *filp,
 				break;
 
 			newpid = get_task_pid(current, PIDTYPE_PID);
-			rcu_assign_pointer(vcpu->pid, newpid);
-			if (oldpid)
-				synchronize_rcu();
+			write_lock(&vcpu->pid_lock);
+			vcpu->pid = newpid;
+			write_unlock(&vcpu->pid_lock);
+			put_pid(oldpid);
 		}
 
 		vcpu->wants_to_run = !READ_ONCE(vcpu->run->immediate_exit__unsafe);