diff options
author | Ingo Molnar <mingo@elte.hu> | 2009-09-15 12:18:15 +0200 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2009-09-15 12:18:15 +0200 |
commit | dca2d6ac09d9ef59ff46820d4f0c94b08a671202 (patch) | |
tree | fdec753b842dad09e3a4151954fab3eb5c43500d /virt/kvm | |
parent | tracing: Fix ring-buffer and ksym tracer merge interaction (diff) | |
parent | Merge branch 'for-linus3' of git://git.kernel.org/pub/scm/linux/kernel/git/jm... (diff) | |
download | linux-dca2d6ac09d9ef59ff46820d4f0c94b08a671202.tar.xz linux-dca2d6ac09d9ef59ff46820d4f0c94b08a671202.zip |
Merge branch 'linus' into tracing/hw-breakpoints
Conflicts:
arch/x86/kernel/process_64.c
Semantic conflict fixed in:
arch/x86/kvm/x86.c
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'virt/kvm')
-rw-r--r-- | virt/kvm/Kconfig | 14 | ||||
-rw-r--r-- | virt/kvm/coalesced_mmio.c | 74 | ||||
-rw-r--r-- | virt/kvm/coalesced_mmio.h | 1 | ||||
-rw-r--r-- | virt/kvm/eventfd.c | 578 | ||||
-rw-r--r-- | virt/kvm/ioapic.c | 78 | ||||
-rw-r--r-- | virt/kvm/iodev.h | 55 | ||||
-rw-r--r-- | virt/kvm/irq_comm.c | 51 | ||||
-rw-r--r-- | virt/kvm/kvm_main.c | 298 | ||||
-rw-r--r-- | virt/kvm/kvm_trace.c | 285 |
9 files changed, 963 insertions, 471 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig new file mode 100644 index 000000000000..daece36c0a57 --- /dev/null +++ b/virt/kvm/Kconfig @@ -0,0 +1,14 @@ +# KVM common configuration items and defaults + +config HAVE_KVM + bool + +config HAVE_KVM_IRQCHIP + bool + +config HAVE_KVM_EVENTFD + bool + select EVENTFD + +config KVM_APIC_ARCHITECTURE + bool diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index 5ae620d32fac..04d69cd7049b 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c @@ -14,32 +14,28 @@ #include "coalesced_mmio.h" -static int coalesced_mmio_in_range(struct kvm_io_device *this, - gpa_t addr, int len, int is_write) +static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev) +{ + return container_of(dev, struct kvm_coalesced_mmio_dev, dev); +} + +static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev, + gpa_t addr, int len) { - struct kvm_coalesced_mmio_dev *dev = - (struct kvm_coalesced_mmio_dev*)this->private; struct kvm_coalesced_mmio_zone *zone; - int next; + struct kvm_coalesced_mmio_ring *ring; + unsigned avail; int i; - if (!is_write) - return 0; - - /* kvm->lock is taken by the caller and must be not released before - * dev.read/write - */ - /* Are we able to batch it ? */ /* last is the first free entry * check if we don't meet the first used entry * there is always one unused entry in the buffer */ - - next = (dev->kvm->coalesced_mmio_ring->last + 1) % - KVM_COALESCED_MMIO_MAX; - if (next == dev->kvm->coalesced_mmio_ring->first) { + ring = dev->kvm->coalesced_mmio_ring; + avail = (ring->first - ring->last - 1) % KVM_COALESCED_MMIO_MAX; + if (avail < KVM_MAX_VCPUS) { /* full */ return 0; } @@ -60,14 +56,15 @@ static int coalesced_mmio_in_range(struct kvm_io_device *this, return 0; } -static void coalesced_mmio_write(struct kvm_io_device *this, - gpa_t addr, int len, const void *val) +static int coalesced_mmio_write(struct kvm_io_device *this, + gpa_t addr, int len, const void *val) { - struct kvm_coalesced_mmio_dev *dev = - (struct kvm_coalesced_mmio_dev*)this->private; + struct kvm_coalesced_mmio_dev *dev = to_mmio(this); struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring; + if (!coalesced_mmio_in_range(dev, addr, len)) + return -EOPNOTSUPP; - /* kvm->lock must be taken by caller before call to in_range()*/ + spin_lock(&dev->lock); /* copy data in first free entry of the ring */ @@ -76,29 +73,40 @@ static void coalesced_mmio_write(struct kvm_io_device *this, memcpy(ring->coalesced_mmio[ring->last].data, val, len); smp_wmb(); ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX; + spin_unlock(&dev->lock); + return 0; } static void coalesced_mmio_destructor(struct kvm_io_device *this) { - kfree(this); + struct kvm_coalesced_mmio_dev *dev = to_mmio(this); + + kfree(dev); } +static const struct kvm_io_device_ops coalesced_mmio_ops = { + .write = coalesced_mmio_write, + .destructor = coalesced_mmio_destructor, +}; + int kvm_coalesced_mmio_init(struct kvm *kvm) { struct kvm_coalesced_mmio_dev *dev; + int ret; dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL); if (!dev) return -ENOMEM; - dev->dev.write = coalesced_mmio_write; - dev->dev.in_range = coalesced_mmio_in_range; - dev->dev.destructor = coalesced_mmio_destructor; - dev->dev.private = dev; + spin_lock_init(&dev->lock); + kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops); dev->kvm = kvm; kvm->coalesced_mmio_dev = dev; - kvm_io_bus_register_dev(&kvm->mmio_bus, &dev->dev); - return 0; + ret = kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &dev->dev); + if (ret < 0) + kfree(dev); + + return ret; } int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, @@ -109,16 +117,16 @@ int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, if (dev == NULL) return -EINVAL; - mutex_lock(&kvm->lock); + down_write(&kvm->slots_lock); if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) { - mutex_unlock(&kvm->lock); + up_write(&kvm->slots_lock); return -ENOBUFS; } dev->zone[dev->nb_zones] = *zone; dev->nb_zones++; - mutex_unlock(&kvm->lock); + up_write(&kvm->slots_lock); return 0; } @@ -132,7 +140,7 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, if (dev == NULL) return -EINVAL; - mutex_lock(&kvm->lock); + down_write(&kvm->slots_lock); i = dev->nb_zones; while(i) { @@ -150,7 +158,7 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, i--; } - mutex_unlock(&kvm->lock); + up_write(&kvm->slots_lock); return 0; } diff --git a/virt/kvm/coalesced_mmio.h b/virt/kvm/coalesced_mmio.h index 5ac0ec628461..4b49f27fa31e 100644 --- a/virt/kvm/coalesced_mmio.h +++ b/virt/kvm/coalesced_mmio.h @@ -12,6 +12,7 @@ struct kvm_coalesced_mmio_dev { struct kvm_io_device dev; struct kvm *kvm; + spinlock_t lock; int nb_zones; struct kvm_coalesced_mmio_zone zone[KVM_COALESCED_MMIO_ZONE_MAX]; }; diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c new file mode 100644 index 000000000000..bb4ebd89b9ff --- /dev/null +++ b/virt/kvm/eventfd.c @@ -0,0 +1,578 @@ +/* + * kvm eventfd support - use eventfd objects to signal various KVM events + * + * Copyright 2009 Novell. All Rights Reserved. + * + * Author: + * Gregory Haskins <ghaskins@novell.com> + * + * This file is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <linux/kvm_host.h> +#include <linux/kvm.h> +#include <linux/workqueue.h> +#include <linux/syscalls.h> +#include <linux/wait.h> +#include <linux/poll.h> +#include <linux/file.h> +#include <linux/list.h> +#include <linux/eventfd.h> +#include <linux/kernel.h> + +#include "iodev.h" + +/* + * -------------------------------------------------------------------- + * irqfd: Allows an fd to be used to inject an interrupt to the guest + * + * Credit goes to Avi Kivity for the original idea. + * -------------------------------------------------------------------- + */ + +struct _irqfd { + struct kvm *kvm; + struct eventfd_ctx *eventfd; + int gsi; + struct list_head list; + poll_table pt; + wait_queue_head_t *wqh; + wait_queue_t wait; + struct work_struct inject; + struct work_struct shutdown; +}; + +static struct workqueue_struct *irqfd_cleanup_wq; + +static void +irqfd_inject(struct work_struct *work) +{ + struct _irqfd *irqfd = container_of(work, struct _irqfd, inject); + struct kvm *kvm = irqfd->kvm; + + mutex_lock(&kvm->irq_lock); + kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1); + kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0); + mutex_unlock(&kvm->irq_lock); +} + +/* + * Race-free decouple logic (ordering is critical) + */ +static void +irqfd_shutdown(struct work_struct *work) +{ + struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown); + + /* + * Synchronize with the wait-queue and unhook ourselves to prevent + * further events. + */ + remove_wait_queue(irqfd->wqh, &irqfd->wait); + + /* + * We know no new events will be scheduled at this point, so block + * until all previously outstanding events have completed + */ + flush_work(&irqfd->inject); + + /* + * It is now safe to release the object's resources + */ + eventfd_ctx_put(irqfd->eventfd); + kfree(irqfd); +} + + +/* assumes kvm->irqfds.lock is held */ +static bool +irqfd_is_active(struct _irqfd *irqfd) +{ + return list_empty(&irqfd->list) ? false : true; +} + +/* + * Mark the irqfd as inactive and schedule it for removal + * + * assumes kvm->irqfds.lock is held + */ +static void +irqfd_deactivate(struct _irqfd *irqfd) +{ + BUG_ON(!irqfd_is_active(irqfd)); + + list_del_init(&irqfd->list); + + queue_work(irqfd_cleanup_wq, &irqfd->shutdown); +} + +/* + * Called with wqh->lock held and interrupts disabled + */ +static int +irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait); + unsigned long flags = (unsigned long)key; + + if (flags & POLLIN) + /* An event has been signaled, inject an interrupt */ + schedule_work(&irqfd->inject); + + if (flags & POLLHUP) { + /* The eventfd is closing, detach from KVM */ + struct kvm *kvm = irqfd->kvm; + unsigned long flags; + + spin_lock_irqsave(&kvm->irqfds.lock, flags); + + /* + * We must check if someone deactivated the irqfd before + * we could acquire the irqfds.lock since the item is + * deactivated from the KVM side before it is unhooked from + * the wait-queue. If it is already deactivated, we can + * simply return knowing the other side will cleanup for us. + * We cannot race against the irqfd going away since the + * other side is required to acquire wqh->lock, which we hold + */ + if (irqfd_is_active(irqfd)) + irqfd_deactivate(irqfd); + + spin_unlock_irqrestore(&kvm->irqfds.lock, flags); + } + + return 0; +} + +static void +irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, + poll_table *pt) +{ + struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt); + + irqfd->wqh = wqh; + add_wait_queue(wqh, &irqfd->wait); +} + +static int +kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) +{ + struct _irqfd *irqfd; + struct file *file = NULL; + struct eventfd_ctx *eventfd = NULL; + int ret; + unsigned int events; + + irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL); + if (!irqfd) + return -ENOMEM; + + irqfd->kvm = kvm; + irqfd->gsi = gsi; + INIT_LIST_HEAD(&irqfd->list); + INIT_WORK(&irqfd->inject, irqfd_inject); + INIT_WORK(&irqfd->shutdown, irqfd_shutdown); + + file = eventfd_fget(fd); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + goto fail; + } + + eventfd = eventfd_ctx_fileget(file); + if (IS_ERR(eventfd)) { + ret = PTR_ERR(eventfd); + goto fail; + } + + irqfd->eventfd = eventfd; + + /* + * Install our own custom wake-up handling so we are notified via + * a callback whenever someone signals the underlying eventfd + */ + init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); + init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc); + + events = file->f_op->poll(file, &irqfd->pt); + + spin_lock_irq(&kvm->irqfds.lock); + list_add_tail(&irqfd->list, &kvm->irqfds.items); + spin_unlock_irq(&kvm->irqfds.lock); + + /* + * Check if there was an event already pending on the eventfd + * before we registered, and trigger it as if we didn't miss it. + */ + if (events & POLLIN) + schedule_work(&irqfd->inject); + + /* + * do not drop the file until the irqfd is fully initialized, otherwise + * we might race against the POLLHUP + */ + fput(file); + + return 0; + +fail: + if (eventfd && !IS_ERR(eventfd)) + eventfd_ctx_put(eventfd); + + if (!IS_ERR(file)) + fput(file); + + kfree(irqfd); + return ret; +} + +void +kvm_eventfd_init(struct kvm *kvm) +{ + spin_lock_init(&kvm->irqfds.lock); + INIT_LIST_HEAD(&kvm->irqfds.items); + INIT_LIST_HEAD(&kvm->ioeventfds); +} + +/* + * shutdown any irqfd's that match fd+gsi + */ +static int +kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi) +{ + struct _irqfd *irqfd, *tmp; + struct eventfd_ctx *eventfd; + + eventfd = eventfd_ctx_fdget(fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + spin_lock_irq(&kvm->irqfds.lock); + + list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { + if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) + irqfd_deactivate(irqfd); + } + + spin_unlock_irq(&kvm->irqfds.lock); + eventfd_ctx_put(eventfd); + + /* + * Block until we know all outstanding shutdown jobs have completed + * so that we guarantee there will not be any more interrupts on this + * gsi once this deassign function returns. + */ + flush_workqueue(irqfd_cleanup_wq); + + return 0; +} + +int +kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) +{ + if (flags & KVM_IRQFD_FLAG_DEASSIGN) + return kvm_irqfd_deassign(kvm, fd, gsi); + + return kvm_irqfd_assign(kvm, fd, gsi); +} + +/* + * This function is called as the kvm VM fd is being released. Shutdown all + * irqfds that still remain open + */ +void +kvm_irqfd_release(struct kvm *kvm) +{ + struct _irqfd *irqfd, *tmp; + + spin_lock_irq(&kvm->irqfds.lock); + + list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) + irqfd_deactivate(irqfd); + + spin_unlock_irq(&kvm->irqfds.lock); + + /* + * Block until we know all outstanding shutdown jobs have completed + * since we do not take a kvm* reference. + */ + flush_workqueue(irqfd_cleanup_wq); + +} + +/* + * create a host-wide workqueue for issuing deferred shutdown requests + * aggregated from all vm* instances. We need our own isolated single-thread + * queue to prevent deadlock against flushing the normal work-queue. + */ +static int __init irqfd_module_init(void) +{ + irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup"); + if (!irqfd_cleanup_wq) + return -ENOMEM; + + return 0; +} + +static void __exit irqfd_module_exit(void) +{ + destroy_workqueue(irqfd_cleanup_wq); +} + +module_init(irqfd_module_init); +module_exit(irqfd_module_exit); + +/* + * -------------------------------------------------------------------- + * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal. + * + * userspace can register a PIO/MMIO address with an eventfd for receiving + * notification when the memory has been touched. + * -------------------------------------------------------------------- + */ + +struct _ioeventfd { + struct list_head list; + u64 addr; + int length; + struct eventfd_ctx *eventfd; + u64 datamatch; + struct kvm_io_device dev; + bool wildcard; +}; + +static inline struct _ioeventfd * +to_ioeventfd(struct kvm_io_device *dev) +{ + return container_of(dev, struct _ioeventfd, dev); +} + +static void +ioeventfd_release(struct _ioeventfd *p) +{ + eventfd_ctx_put(p->eventfd); + list_del(&p->list); + kfree(p); +} + +static bool +ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val) +{ + u64 _val; + + if (!(addr == p->addr && len == p->length)) + /* address-range must be precise for a hit */ + return false; + + if (p->wildcard) + /* all else equal, wildcard is always a hit */ + return true; + + /* otherwise, we have to actually compare the data */ + + BUG_ON(!IS_ALIGNED((unsigned long)val, len)); + + switch (len) { + case 1: + _val = *(u8 *)val; + break; + case 2: + _val = *(u16 *)val; + break; + case 4: + _val = *(u32 *)val; + break; + case 8: + _val = *(u64 *)val; + break; + default: + return false; + } + + return _val == p->datamatch ? true : false; +} + +/* MMIO/PIO writes trigger an event if the addr/val match */ +static int +ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len, + const void *val) +{ + struct _ioeventfd *p = to_ioeventfd(this); + + if (!ioeventfd_in_range(p, addr, len, val)) + return -EOPNOTSUPP; + + eventfd_signal(p->eventfd, 1); + return 0; +} + +/* + * This function is called as KVM is completely shutting down. We do not + * need to worry about locking just nuke anything we have as quickly as possible + */ +static void +ioeventfd_destructor(struct kvm_io_device *this) +{ + struct _ioeventfd *p = to_ioeventfd(this); + + ioeventfd_release(p); +} + +static const struct kvm_io_device_ops ioeventfd_ops = { + .write = ioeventfd_write, + .destructor = ioeventfd_destructor, +}; + +/* assumes kvm->slots_lock held */ +static bool +ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p) +{ + struct _ioeventfd *_p; + + list_for_each_entry(_p, &kvm->ioeventfds, list) + if (_p->addr == p->addr && _p->length == p->length && + (_p->wildcard || p->wildcard || + _p->datamatch == p->datamatch)) + return true; + + return false; +} + +static int +kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) +{ + int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO; + struct kvm_io_bus *bus = pio ? &kvm->pio_bus : &kvm->mmio_bus; + struct _ioeventfd *p; + struct eventfd_ctx *eventfd; + int ret; + + /* must be natural-word sized */ + switch (args->len) { + case 1: + case 2: + case 4: + case 8: + break; + default: + return -EINVAL; + } + + /* check for range overflow */ + if (args->addr + args->len < args->addr) + return -EINVAL; + + /* check for extra flags that we don't understand */ + if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK) + return -EINVAL; + + eventfd = eventfd_ctx_fdget(args->fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) { + ret = -ENOMEM; + goto fail; + } + + INIT_LIST_HEAD(&p->list); + p->addr = args->addr; + p->length = args->len; + p->eventfd = eventfd; + + /* The datamatch feature is optional, otherwise this is a wildcard */ + if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH) + p->datamatch = args->datamatch; + else + p->wildcard = true; + + down_write(&kvm->slots_lock); + + /* Verify that there isnt a match already */ + if (ioeventfd_check_collision(kvm, p)) { + ret = -EEXIST; + goto unlock_fail; + } + + kvm_iodevice_init(&p->dev, &ioeventfd_ops); + + ret = __kvm_io_bus_register_dev(bus, &p->dev); + if (ret < 0) + goto unlock_fail; + + list_add_tail(&p->list, &kvm->ioeventfds); + + up_write(&kvm->slots_lock); + + return 0; + +unlock_fail: + up_write(&kvm->slots_lock); + +fail: + kfree(p); + eventfd_ctx_put(eventfd); + + return ret; +} + +static int +kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) +{ + int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO; + struct kvm_io_bus *bus = pio ? &kvm->pio_bus : &kvm->mmio_bus; + struct _ioeventfd *p, *tmp; + struct eventfd_ctx *eventfd; + int ret = -ENOENT; + + eventfd = eventfd_ctx_fdget(args->fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + down_write(&kvm->slots_lock); + + list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) { + bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH); + + if (p->eventfd != eventfd || + p->addr != args->addr || + p->length != args->len || + p->wildcard != wildcard) + continue; + + if (!p->wildcard && p->datamatch != args->datamatch) + continue; + + __kvm_io_bus_unregister_dev(bus, &p->dev); + ioeventfd_release(p); + ret = 0; + break; + } + + up_write(&kvm->slots_lock); + + eventfd_ctx_put(eventfd); + + return ret; +} + +int +kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) +{ + if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN) + return kvm_deassign_ioeventfd(kvm, args); + + return kvm_assign_ioeventfd(kvm, args); +} diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 1150c6d5c7b8..9fe140bb38ec 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -36,6 +36,7 @@ #include <asm/processor.h> #include <asm/page.h> #include <asm/current.h> +#include <trace/events/kvm.h> #include "ioapic.h" #include "lapic.h" @@ -103,6 +104,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) { unsigned index; bool mask_before, mask_after; + union kvm_ioapic_redirect_entry *e; switch (ioapic->ioregsel) { case IOAPIC_REG_VERSION: @@ -122,19 +124,20 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) ioapic_debug("change redir index %x val %x\n", index, val); if (index >= IOAPIC_NUM_PINS) return; - mask_before = ioapic->redirtbl[index].fields.mask; + e = &ioapic->redirtbl[index]; + mask_before = e->fields.mask; if (ioapic->ioregsel & 1) { - ioapic->redirtbl[index].bits &= 0xffffffff; - ioapic->redirtbl[index].bits |= (u64) val << 32; + e->bits &= 0xffffffff; + e->bits |= (u64) val << 32; } else { - ioapic->redirtbl[index].bits &= ~0xffffffffULL; - ioapic->redirtbl[index].bits |= (u32) val; - ioapic->redirtbl[index].fields.remote_irr = 0; + e->bits &= ~0xffffffffULL; + e->bits |= (u32) val; + e->fields.remote_irr = 0; } - mask_after = ioapic->redirtbl[index].fields.mask; + mask_after = e->fields.mask; if (mask_before != mask_after) kvm_fire_mask_notifiers(ioapic->kvm, index, mask_after); - if (ioapic->redirtbl[index].fields.trig_mode == IOAPIC_LEVEL_TRIG + if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG && ioapic->irr & (1 << index)) ioapic_service(ioapic, index); break; @@ -164,7 +167,9 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) /* Always delivery PIT interrupt to vcpu 0 */ if (irq == 0) { irqe.dest_mode = 0; /* Physical mode. */ - irqe.dest_id = ioapic->kvm->vcpus[0]->vcpu_id; + /* need to read apic_id from apic regiest since + * it can be rewritten */ + irqe.dest_id = ioapic->kvm->bsp_vcpu->vcpu_id; } #endif return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe); @@ -188,7 +193,10 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) if ((edge && old_irr != ioapic->irr) || (!edge && !entry.fields.remote_irr)) ret = ioapic_service(ioapic, irq); + else + ret = 0; /* report coalesced interrupt */ } + trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0); } return ret; } @@ -220,24 +228,29 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode) __kvm_ioapic_update_eoi(ioapic, i, trigger_mode); } -static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr, - int len, int is_write) +static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev) { - struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; + return container_of(dev, struct kvm_ioapic, dev); +} +static inline int ioapic_in_range(struct kvm_ioapic *ioapic, gpa_t addr) +{ return ((addr >= ioapic->base_address && (addr < ioapic->base_address + IOAPIC_MEM_LENGTH))); } -static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, - void *val) +static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, + void *val) { - struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; + struct kvm_ioapic *ioapic = to_ioapic(this); u32 result; + if (!ioapic_in_range(ioapic, addr)) + return -EOPNOTSUPP; ioapic_debug("addr %lx\n", (unsigned long)addr); ASSERT(!(addr & 0xf)); /* check alignment */ + mutex_lock(&ioapic->kvm->irq_lock); addr &= 0xff; switch (addr) { case IOAPIC_REG_SELECT: @@ -264,22 +277,28 @@ static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, default: printk(KERN_WARNING "ioapic: wrong length %d\n", len); } + mutex_unlock(&ioapic->kvm->irq_lock); + return 0; } -static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, - const void *val) +static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, + const void *val) { - struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; + struct kvm_ioapic *ioapic = to_ioapic(this); u32 data; + if (!ioapic_in_range(ioapic, addr)) + return -EOPNOTSUPP; ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n", (void*)addr, len, val); ASSERT(!(addr & 0xf)); /* check alignment */ + + mutex_lock(&ioapic->kvm->irq_lock); if (len == 4 || len == 8) data = *(u32 *) val; else { printk(KERN_WARNING "ioapic: Unsupported size %d\n", len); - return; + goto unlock; } addr &= 0xff; @@ -300,6 +319,9 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, default: break; } +unlock: + mutex_unlock(&ioapic->kvm->irq_lock); + return 0; } void kvm_ioapic_reset(struct kvm_ioapic *ioapic) @@ -314,21 +336,27 @@ void kvm_ioapic_reset(struct kvm_ioapic *ioapic) ioapic->id = 0; } +static const struct kvm_io_device_ops ioapic_mmio_ops = { + .read = ioapic_mmio_read, + .write = ioapic_mmio_write, +}; + int kvm_ioapic_init(struct kvm *kvm) { struct kvm_ioapic *ioapic; + int ret; ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); if (!ioapic) return -ENOMEM; kvm->arch.vioapic = ioapic; kvm_ioapic_reset(ioapic); - ioapic->dev.read = ioapic_mmio_read; - ioapic->dev.write = ioapic_mmio_write; - ioapic->dev.in_range = ioapic_in_range; - ioapic->dev.private = ioapic; + kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops); ioapic->kvm = kvm; - kvm_io_bus_register_dev(&kvm->mmio_bus, &ioapic->dev); - return 0; + ret = kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &ioapic->dev); + if (ret < 0) + kfree(ioapic); + + return ret; } diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h index 55e8846ac3a6..12fd3caffd2b 100644 --- a/virt/kvm/iodev.h +++ b/virt/kvm/iodev.h @@ -17,49 +17,54 @@ #define __KVM_IODEV_H__ #include <linux/kvm_types.h> +#include <asm/errno.h> -struct kvm_io_device { - void (*read)(struct kvm_io_device *this, +struct kvm_io_device; + +/** + * kvm_io_device_ops are called under kvm slots_lock. + * read and write handlers return 0 if the transaction has been handled, + * or non-zero to have it passed to the next device. + **/ +struct kvm_io_device_ops { + int (*read)(struct kvm_io_device *this, + gpa_t addr, + int len, + void *val); + int (*write)(struct kvm_io_device *this, gpa_t addr, int len, - void *val); - void (*write)(struct kvm_io_device *this, - gpa_t addr, - int len, - const void *val); - int (*in_range)(struct kvm_io_device *this, gpa_t addr, int len, - int is_write); + const void *val); void (*destructor)(struct kvm_io_device *this); +}; - void *private; + +struct kvm_io_device { + const struct kvm_io_device_ops *ops; }; -static inline void kvm_iodevice_read(struct kvm_io_device *dev, - gpa_t addr, - int len, - void *val) +static inline void kvm_iodevice_init(struct kvm_io_device *dev, + const struct kvm_io_device_ops *ops) { - dev->read(dev, addr, len, val); + dev->ops = ops; } -static inline void kvm_iodevice_write(struct kvm_io_device *dev, - gpa_t addr, - int len, - const void *val) +static inline int kvm_iodevice_read(struct kvm_io_device *dev, + gpa_t addr, int l, void *v) { - dev->write(dev, addr, len, val); + return dev->ops->read ? dev->ops->read(dev, addr, l, v) : -EOPNOTSUPP; } -static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, - gpa_t addr, int len, int is_write) +static inline int kvm_iodevice_write(struct kvm_io_device *dev, + gpa_t addr, int l, const void *v) { - return dev->in_range(dev, addr, len, is_write); + return dev->ops->write ? dev->ops->write(dev, addr, l, v) : -EOPNOTSUPP; } static inline void kvm_iodevice_destructor(struct kvm_io_device *dev) { - if (dev->destructor) - dev->destructor(dev); + if (dev->ops->destructor) + dev->ops->destructor(dev); } #endif /* __KVM_IODEV_H__ */ diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index ddc17f0e2f35..001663ff401a 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -20,6 +20,7 @@ */ #include <linux/kvm_host.h> +#include <trace/events/kvm.h> #include <asm/msidef.h> #ifdef CONFIG_IA64 @@ -62,14 +63,14 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, int i, r = -1; struct kvm_vcpu *vcpu, *lowest = NULL; + WARN_ON(!mutex_is_locked(&kvm->irq_lock)); + if (irq->dest_mode == 0 && irq->dest_id == 0xff && kvm_is_dm_lowest_prio(irq)) printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); - for (i = 0; i < KVM_MAX_VCPUS; i++) { - vcpu = kvm->vcpus[i]; - - if (!vcpu || !kvm_apic_present(vcpu)) + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_apic_present(vcpu)) continue; if (!kvm_apic_match_dest(vcpu, src, irq->shorthand, @@ -99,6 +100,8 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, { struct kvm_lapic_irq irq; + trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); + irq.dest_id = (e->msi.address_lo & MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; irq.vector = (e->msi.data & @@ -113,7 +116,7 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, return kvm_irq_delivery_to_apic(kvm, NULL, &irq); } -/* This should be called with the kvm->lock mutex held +/* This should be called with the kvm->irq_lock mutex held * Return value: * < 0 Interrupt was ignored (masked or not delivered for other reasons) * = 0 Interrupt was coalesced (previous irq is still pending) @@ -125,6 +128,10 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) unsigned long *irq_state, sig_level; int ret = -1; + trace_kvm_set_irq(irq, level, irq_source_id); + + WARN_ON(!mutex_is_locked(&kvm->irq_lock)); + if (irq < KVM_IOAPIC_NUM_PINS) { irq_state = (unsigned long *)&kvm->arch.irq_states[irq]; @@ -134,7 +141,9 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) else clear_bit(irq_source_id, irq_state); sig_level = !!(*irq_state); - } else /* Deal with MSI/MSI-X */ + } else if (!level) + return ret; + else /* Deal with MSI/MSI-X */ sig_level = 1; /* Not possible to detect if the guest uses the PIC or the @@ -159,6 +168,8 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) struct hlist_node *n; unsigned gsi = pin; + trace_kvm_ack_irq(irqchip, pin); + list_for_each_entry(e, &kvm->irq_routing, link) if (e->type == KVM_IRQ_ROUTING_IRQCHIP && e->irqchip.irqchip == irqchip && @@ -175,19 +186,26 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) void kvm_register_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian) { + mutex_lock(&kvm->irq_lock); hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list); + mutex_unlock(&kvm->irq_lock); } -void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian) +void kvm_unregister_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian) { + mutex_lock(&kvm->irq_lock); hlist_del_init(&kian->link); + mutex_unlock(&kvm->irq_lock); } -/* The caller must hold kvm->lock mutex */ int kvm_request_irq_source_id(struct kvm *kvm) { unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; - int irq_source_id = find_first_zero_bit(bitmap, + int irq_source_id; + + mutex_lock(&kvm->irq_lock); + irq_source_id = find_first_zero_bit(bitmap, sizeof(kvm->arch.irq_sources_bitmap)); if (irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) { @@ -197,6 +215,7 @@ int kvm_request_irq_source_id(struct kvm *kvm) ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); set_bit(irq_source_id, bitmap); + mutex_unlock(&kvm->irq_lock); return irq_source_id; } @@ -207,6 +226,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); + mutex_lock(&kvm->irq_lock); if (irq_source_id < 0 || irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) { printk(KERN_ERR "kvm: IRQ source ID out of range!\n"); @@ -215,19 +235,24 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) clear_bit(irq_source_id, &kvm->arch.irq_states[i]); clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); + mutex_unlock(&kvm->irq_lock); } void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, struct kvm_irq_mask_notifier *kimn) { + mutex_lock(&kvm->irq_lock); kimn->irq = irq; hlist_add_head(&kimn->link, &kvm->mask_notifier_list); + mutex_unlock(&kvm->irq_lock); } void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, struct kvm_irq_mask_notifier *kimn) { + mutex_lock(&kvm->irq_lock); hlist_del(&kimn->link); + mutex_unlock(&kvm->irq_lock); } void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask) @@ -235,6 +260,8 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask) struct kvm_irq_mask_notifier *kimn; struct hlist_node *n; + WARN_ON(!mutex_is_locked(&kvm->irq_lock)); + hlist_for_each_entry(kimn, n, &kvm->mask_notifier_list, link) if (kimn->irq == irq) kimn->func(kimn, mask); @@ -250,7 +277,9 @@ static void __kvm_free_irq_routing(struct list_head *irq_routing) void kvm_free_irq_routing(struct kvm *kvm) { + mutex_lock(&kvm->irq_lock); __kvm_free_irq_routing(&kvm->irq_routing); + mutex_unlock(&kvm->irq_lock); } static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e, @@ -325,13 +354,13 @@ int kvm_set_irq_routing(struct kvm *kvm, e = NULL; } - mutex_lock(&kvm->lock); + mutex_lock(&kvm->irq_lock); list_splice(&kvm->irq_routing, &tmp); INIT_LIST_HEAD(&kvm->irq_routing); list_splice(&irq_list, &kvm->irq_routing); INIT_LIST_HEAD(&irq_list); list_splice(&tmp, &irq_list); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->irq_lock); r = 0; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2884baf1d5f9..897bff3b7df9 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -59,9 +59,18 @@ #include "irq.h" #endif +#define CREATE_TRACE_POINTS +#include <trace/events/kvm.h> + MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); +/* + * Ordering of locks: + * + * kvm->slots_lock --> kvm->lock --> kvm->irq_lock + */ + DEFINE_SPINLOCK(kvm_lock); LIST_HEAD(vm_list); @@ -79,6 +88,8 @@ static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, static bool kvm_rebooting; +static bool largepages_enabled = true; + #ifdef KVM_CAP_DEVICE_ASSIGNMENT static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, int assigned_dev_id) @@ -120,17 +131,13 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) { struct kvm_assigned_dev_kernel *assigned_dev; struct kvm *kvm; - int irq, i; + int i; assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, interrupt_work); kvm = assigned_dev->kvm; - /* This is taken to safely inject irq inside the guest. When - * the interrupt injection (or the ioapic code) uses a - * finer-grained lock, update this - */ - mutex_lock(&kvm->lock); + mutex_lock(&kvm->irq_lock); spin_lock_irq(&assigned_dev->assigned_dev_lock); if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { struct kvm_guest_msix_entry *guest_entries = @@ -143,23 +150,13 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, guest_entries[i].vector, 1); - irq = assigned_dev->host_msix_entries[i].vector; - if (irq != 0) - enable_irq(irq); - assigned_dev->host_irq_disabled = false; } - } else { + } else kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, assigned_dev->guest_irq, 1); - if (assigned_dev->irq_requested_type & - KVM_DEV_IRQ_GUEST_MSI) { - enable_irq(assigned_dev->host_irq); - assigned_dev->host_irq_disabled = false; - } - } spin_unlock_irq(&assigned_dev->assigned_dev_lock); - mutex_unlock(&assigned_dev->kvm->lock); + mutex_unlock(&assigned_dev->kvm->irq_lock); } static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) @@ -179,8 +176,10 @@ static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) schedule_work(&assigned_dev->interrupt_work); - disable_irq_nosync(irq); - assigned_dev->host_irq_disabled = true; + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { + disable_irq_nosync(irq); + assigned_dev->host_irq_disabled = true; + } out: spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags); @@ -215,7 +214,7 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) static void deassign_guest_irq(struct kvm *kvm, struct kvm_assigned_dev_kernel *assigned_dev) { - kvm_unregister_irq_ack_notifier(&assigned_dev->ack_notifier); + kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier); assigned_dev->ack_notifier.gsi = -1; if (assigned_dev->irq_source_id != -1) @@ -417,6 +416,7 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm, { dev->guest_irq = irq->guest_irq; dev->ack_notifier.gsi = -1; + dev->host_irq_disabled = false; return 0; } #endif @@ -427,6 +427,7 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm, { dev->guest_irq = irq->guest_irq; dev->ack_notifier.gsi = -1; + dev->host_irq_disabled = false; return 0; } #endif @@ -693,11 +694,6 @@ out: } #endif -static inline int valid_vcpu(int n) -{ - return likely(n >= 0 && n < KVM_MAX_VCPUS); -} - inline int kvm_is_mmio_pfn(pfn_t pfn) { if (pfn_valid(pfn)) { @@ -745,12 +741,9 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) if (alloc_cpumask_var(&cpus, GFP_ATOMIC)) cpumask_clear(cpus); - me = get_cpu(); spin_lock(&kvm->requests_lock); - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = kvm->vcpus[i]; - if (!vcpu) - continue; + me = smp_processor_id(); + kvm_for_each_vcpu(i, vcpu, kvm) { if (test_and_set_bit(req, &vcpu->requests)) continue; cpu = vcpu->cpu; @@ -764,7 +757,6 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) else called = false; spin_unlock(&kvm->requests_lock); - put_cpu(); free_cpumask_var(cpus); return called; } @@ -986,7 +978,9 @@ static struct kvm *kvm_create_vm(void) spin_lock_init(&kvm->mmu_lock); spin_lock_init(&kvm->requests_lock); kvm_io_bus_init(&kvm->pio_bus); + kvm_eventfd_init(kvm); mutex_init(&kvm->lock); + mutex_init(&kvm->irq_lock); kvm_io_bus_init(&kvm->mmio_bus); init_rwsem(&kvm->slots_lock); atomic_set(&kvm->users_count, 1); @@ -1006,19 +1000,25 @@ out: static void kvm_free_physmem_slot(struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { + int i; + if (!dont || free->rmap != dont->rmap) vfree(free->rmap); if (!dont || free->dirty_bitmap != dont->dirty_bitmap) vfree(free->dirty_bitmap); - if (!dont || free->lpage_info != dont->lpage_info) - vfree(free->lpage_info); + + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + if (!dont || free->lpage_info[i] != dont->lpage_info[i]) { + vfree(free->lpage_info[i]); + free->lpage_info[i] = NULL; + } + } free->npages = 0; free->dirty_bitmap = NULL; free->rmap = NULL; - free->lpage_info = NULL; } void kvm_free_physmem(struct kvm *kvm) @@ -1071,6 +1071,8 @@ static int kvm_vm_release(struct inode *inode, struct file *filp) { struct kvm *kvm = filp->private_data; + kvm_irqfd_release(kvm); + kvm_put_kvm(kvm); return 0; } @@ -1089,8 +1091,8 @@ int __kvm_set_memory_region(struct kvm *kvm, { int r; gfn_t base_gfn; - unsigned long npages, ugfn; - unsigned long largepages, i; + unsigned long npages; + unsigned long i; struct kvm_memory_slot *memslot; struct kvm_memory_slot old, new; @@ -1164,31 +1166,51 @@ int __kvm_set_memory_region(struct kvm *kvm, else new.userspace_addr = 0; } - if (npages && !new.lpage_info) { - largepages = 1 + (base_gfn + npages - 1) / KVM_PAGES_PER_HPAGE; - largepages -= base_gfn / KVM_PAGES_PER_HPAGE; + if (!npages) + goto skip_lpage; - new.lpage_info = vmalloc(largepages * sizeof(*new.lpage_info)); + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + unsigned long ugfn; + unsigned long j; + int lpages; + int level = i + 2; - if (!new.lpage_info) + /* Avoid unused variable warning if no large pages */ + (void)level; + + if (new.lpage_info[i]) + continue; + + lpages = 1 + (base_gfn + npages - 1) / + KVM_PAGES_PER_HPAGE(level); + lpages -= base_gfn / KVM_PAGES_PER_HPAGE(level); + + new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i])); + + if (!new.lpage_info[i]) goto out_free; - memset(new.lpage_info, 0, largepages * sizeof(*new.lpage_info)); + memset(new.lpage_info[i], 0, + lpages * sizeof(*new.lpage_info[i])); - if (base_gfn % KVM_PAGES_PER_HPAGE) - new.lpage_info[0].write_count = 1; - if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE) - new.lpage_info[largepages-1].write_count = 1; + if (base_gfn % KVM_PAGES_PER_HPAGE(level)) + new.lpage_info[i][0].write_count = 1; + if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE(level)) + new.lpage_info[i][lpages - 1].write_count = 1; ugfn = new.userspace_addr >> PAGE_SHIFT; /* * If the gfn and userspace address are not aligned wrt each - * other, disable large page support for this slot + * other, or if explicitly asked to, disable large page + * support for this slot */ - if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE - 1)) - for (i = 0; i < largepages; ++i) - new.lpage_info[i].write_count = 1; + if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || + !largepages_enabled) + for (j = 0; j < lpages; ++j) + new.lpage_info[i][j].write_count = 1; } +skip_lpage: + /* Allocate page dirty bitmap if needed */ if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; @@ -1200,6 +1222,10 @@ int __kvm_set_memory_region(struct kvm *kvm, if (old.npages) kvm_arch_flush_shadow(kvm); } +#else /* not defined CONFIG_S390 */ + new.user_alloc = user_alloc; + if (user_alloc) + new.userspace_addr = mem->userspace_addr; #endif /* not defined CONFIG_S390 */ if (!npages) @@ -1299,6 +1325,12 @@ out: return r; } +void kvm_disable_largepages(void) +{ + largepages_enabled = false; +} +EXPORT_SYMBOL_GPL(kvm_disable_largepages); + int is_error_page(struct page *page) { return page == bad_page; @@ -1635,9 +1667,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) for (;;) { prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); - if ((kvm_arch_interrupt_allowed(vcpu) && - kvm_cpu_has_interrupt(vcpu)) || - kvm_arch_vcpu_runnable(vcpu)) { + if (kvm_arch_vcpu_runnable(vcpu)) { set_bit(KVM_REQ_UNHALT, &vcpu->requests); break; } @@ -1714,24 +1744,18 @@ static struct file_operations kvm_vcpu_fops = { */ static int create_vcpu_fd(struct kvm_vcpu *vcpu) { - int fd = anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, 0); - if (fd < 0) - kvm_put_kvm(vcpu->kvm); - return fd; + return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, 0); } /* * Creates some virtual cpus. Good luck creating more than one. */ -static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) +static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) { int r; - struct kvm_vcpu *vcpu; - - if (!valid_vcpu(n)) - return -EINVAL; + struct kvm_vcpu *vcpu, *v; - vcpu = kvm_arch_vcpu_create(kvm, n); + vcpu = kvm_arch_vcpu_create(kvm, id); if (IS_ERR(vcpu)) return PTR_ERR(vcpu); @@ -1742,23 +1766,38 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) return r; mutex_lock(&kvm->lock); - if (kvm->vcpus[n]) { - r = -EEXIST; + if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { + r = -EINVAL; goto vcpu_destroy; } - kvm->vcpus[n] = vcpu; - mutex_unlock(&kvm->lock); + + kvm_for_each_vcpu(r, v, kvm) + if (v->vcpu_id == id) { + r = -EEXIST; + goto vcpu_destroy; + } + + BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); /* Now it's all set up, let userspace reach it */ kvm_get_kvm(kvm); r = create_vcpu_fd(vcpu); - if (r < 0) - goto unlink; + if (r < 0) { + kvm_put_kvm(kvm); + goto vcpu_destroy; + } + + kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; + smp_wmb(); + atomic_inc(&kvm->online_vcpus); + +#ifdef CONFIG_KVM_APIC_ARCHITECTURE + if (kvm->bsp_vcpu_id == id) + kvm->bsp_vcpu = vcpu; +#endif + mutex_unlock(&kvm->lock); return r; -unlink: - mutex_lock(&kvm->lock); - kvm->vcpus[n] = NULL; vcpu_destroy: mutex_unlock(&kvm->lock); kvm_arch_vcpu_destroy(vcpu); @@ -2199,6 +2238,7 @@ static long kvm_vm_ioctl(struct file *filp, vfree(entries); break; } +#endif /* KVM_CAP_IRQ_ROUTING */ #ifdef __KVM_HAVE_MSIX case KVM_ASSIGN_SET_MSIX_NR: { struct kvm_assigned_msix_nr entry_nr; @@ -2221,7 +2261,35 @@ static long kvm_vm_ioctl(struct file *filp, break; } #endif -#endif /* KVM_CAP_IRQ_ROUTING */ + case KVM_IRQFD: { + struct kvm_irqfd data; + + r = -EFAULT; + if (copy_from_user(&data, argp, sizeof data)) + goto out; + r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags); + break; + } + case KVM_IOEVENTFD: { + struct kvm_ioeventfd data; + + r = -EFAULT; + if (copy_from_user(&data, argp, sizeof data)) + goto out; + r = kvm_ioeventfd(kvm, &data); + break; + } +#ifdef CONFIG_KVM_APIC_ARCHITECTURE + case KVM_SET_BOOT_CPU_ID: + r = 0; + mutex_lock(&kvm->lock); + if (atomic_read(&kvm->online_vcpus) != 0) + r = -EBUSY; + else + kvm->bsp_vcpu_id = arg; + mutex_unlock(&kvm->lock); + break; +#endif default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); } @@ -2288,6 +2356,9 @@ static long kvm_dev_ioctl_check_extension_generic(long arg) case KVM_CAP_USER_MEMORY: case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: +#ifdef CONFIG_KVM_APIC_ARCHITECTURE + case KVM_CAP_SET_BOOT_CPU_ID: +#endif return 1; #ifdef CONFIG_HAVE_KVM_IRQCHIP case KVM_CAP_IRQ_ROUTING: @@ -2335,7 +2406,7 @@ static long kvm_dev_ioctl(struct file *filp, case KVM_TRACE_ENABLE: case KVM_TRACE_PAUSE: case KVM_TRACE_DISABLE: - r = kvm_trace_ioctl(ioctl, arg); + r = -EOPNOTSUPP; break; default: return kvm_arch_dev_ioctl(filp, ioctl, arg); @@ -2449,26 +2520,71 @@ void kvm_io_bus_destroy(struct kvm_io_bus *bus) } } -struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, - gpa_t addr, int len, int is_write) +/* kvm_io_bus_write - called under kvm->slots_lock */ +int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr, + int len, const void *val) { int i; + for (i = 0; i < bus->dev_count; i++) + if (!kvm_iodevice_write(bus->devs[i], addr, len, val)) + return 0; + return -EOPNOTSUPP; +} - for (i = 0; i < bus->dev_count; i++) { - struct kvm_io_device *pos = bus->devs[i]; +/* kvm_io_bus_read - called under kvm->slots_lock */ +int kvm_io_bus_read(struct kvm_io_bus *bus, gpa_t addr, int len, void *val) +{ + int i; + for (i = 0; i < bus->dev_count; i++) + if (!kvm_iodevice_read(bus->devs[i], addr, len, val)) + return 0; + return -EOPNOTSUPP; +} - if (pos->in_range(pos, addr, len, is_write)) - return pos; - } +int kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus, + struct kvm_io_device *dev) +{ + int ret; - return NULL; + down_write(&kvm->slots_lock); + ret = __kvm_io_bus_register_dev(bus, dev); + up_write(&kvm->slots_lock); + + return ret; } -void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev) +/* An unlocked version. Caller must have write lock on slots_lock. */ +int __kvm_io_bus_register_dev(struct kvm_io_bus *bus, + struct kvm_io_device *dev) { - BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1)); + if (bus->dev_count > NR_IOBUS_DEVS-1) + return -ENOSPC; bus->devs[bus->dev_count++] = dev; + + return 0; +} + +void kvm_io_bus_unregister_dev(struct kvm *kvm, + struct kvm_io_bus *bus, + struct kvm_io_device *dev) +{ + down_write(&kvm->slots_lock); + __kvm_io_bus_unregister_dev(bus, dev); + up_write(&kvm->slots_lock); +} + +/* An unlocked version. Caller must have write lock on slots_lock. */ +void __kvm_io_bus_unregister_dev(struct kvm_io_bus *bus, + struct kvm_io_device *dev) +{ + int i; + + for (i = 0; i < bus->dev_count; i++) + if (bus->devs[i] == dev) { + bus->devs[i] = bus->devs[--bus->dev_count]; + break; + } } static struct notifier_block kvm_cpu_notifier = { @@ -2501,11 +2617,9 @@ static int vcpu_stat_get(void *_offset, u64 *val) *val = 0; spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = kvm->vcpus[i]; - if (vcpu) - *val += *(u32 *)((void *)vcpu + offset); - } + kvm_for_each_vcpu(i, vcpu, kvm) + *val += *(u32 *)((void *)vcpu + offset); + spin_unlock(&kvm_lock); return 0; } @@ -2679,15 +2793,15 @@ out_free_0: __free_page(bad_page); out: kvm_arch_exit(); - kvm_exit_debug(); out_fail: + kvm_exit_debug(); return r; } EXPORT_SYMBOL_GPL(kvm_init); void kvm_exit(void) { - kvm_trace_cleanup(); + tracepoint_synchronize_unregister(); misc_deregister(&kvm_dev); kmem_cache_destroy(kvm_vcpu_cache); sysdev_unregister(&kvm_sysdev); diff --git a/virt/kvm/kvm_trace.c b/virt/kvm/kvm_trace.c deleted file mode 100644 index f59874446440..000000000000 --- a/virt/kvm/kvm_trace.c +++ /dev/null @@ -1,285 +0,0 @@ -/* - * kvm trace - * - * It is designed to allow debugging traces of kvm to be generated - * on UP / SMP machines. Each trace entry can be timestamped so that - * it's possible to reconstruct a chronological record of trace events. - * The implementation refers to blktrace kernel support. - * - * Copyright (c) 2008 Intel Corporation - * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk> - * - * Authors: Feng(Eric) Liu, eric.e.liu@intel.com - * - * Date: Feb 2008 - */ - -#include <linux/module.h> -#include <linux/relay.h> -#include <linux/debugfs.h> -#include <linux/ktime.h> - -#include <linux/kvm_host.h> - -#define KVM_TRACE_STATE_RUNNING (1 << 0) -#define KVM_TRACE_STATE_PAUSE (1 << 1) -#define KVM_TRACE_STATE_CLEARUP (1 << 2) - -struct kvm_trace { - int trace_state; - struct rchan *rchan; - struct dentry *lost_file; - atomic_t lost_records; -}; -static struct kvm_trace *kvm_trace; - -struct kvm_trace_probe { - const char *name; - const char *format; - u32 timestamp_in; - marker_probe_func *probe_func; -}; - -static inline int calc_rec_size(int timestamp, int extra) -{ - int rec_size = KVM_TRC_HEAD_SIZE; - - rec_size += extra; - return timestamp ? rec_size += KVM_TRC_CYCLE_SIZE : rec_size; -} - -static void kvm_add_trace(void *probe_private, void *call_data, - const char *format, va_list *args) -{ - struct kvm_trace_probe *p = probe_private; - struct kvm_trace *kt = kvm_trace; - struct kvm_trace_rec rec; - struct kvm_vcpu *vcpu; - int i, size; - u32 extra; - - if (unlikely(kt->trace_state != KVM_TRACE_STATE_RUNNING)) - return; - - rec.rec_val = TRACE_REC_EVENT_ID(va_arg(*args, u32)); - vcpu = va_arg(*args, struct kvm_vcpu *); - rec.pid = current->tgid; - rec.vcpu_id = vcpu->vcpu_id; - - extra = va_arg(*args, u32); - WARN_ON(!(extra <= KVM_TRC_EXTRA_MAX)); - extra = min_t(u32, extra, KVM_TRC_EXTRA_MAX); - - rec.rec_val |= TRACE_REC_TCS(p->timestamp_in) - | TRACE_REC_NUM_DATA_ARGS(extra); - - if (p->timestamp_in) { - rec.u.timestamp.timestamp = ktime_to_ns(ktime_get()); - - for (i = 0; i < extra; i++) - rec.u.timestamp.extra_u32[i] = va_arg(*args, u32); - } else { - for (i = 0; i < extra; i++) - rec.u.notimestamp.extra_u32[i] = va_arg(*args, u32); - } - - size = calc_rec_size(p->timestamp_in, extra * sizeof(u32)); - relay_write(kt->rchan, &rec, size); -} - -static struct kvm_trace_probe kvm_trace_probes[] = { - { "kvm_trace_entryexit", "%u %p %u %u %u %u %u %u", 1, kvm_add_trace }, - { "kvm_trace_handler", "%u %p %u %u %u %u %u %u", 0, kvm_add_trace }, -}; - -static int lost_records_get(void *data, u64 *val) -{ - struct kvm_trace *kt = data; - - *val = atomic_read(&kt->lost_records); - return 0; -} - -DEFINE_SIMPLE_ATTRIBUTE(kvm_trace_lost_ops, lost_records_get, NULL, "%llu\n"); - -/* - * The relay channel is used in "no-overwrite" mode, it keeps trace of how - * many times we encountered a full subbuffer, to tell user space app the - * lost records there were. - */ -static int kvm_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) -{ - struct kvm_trace *kt; - - if (!relay_buf_full(buf)) { - if (!prev_subbuf) { - /* - * executed only once when the channel is opened - * save metadata as first record - */ - subbuf_start_reserve(buf, sizeof(u32)); - *(u32 *)subbuf = 0x12345678; - } - - return 1; - } - - kt = buf->chan->private_data; - atomic_inc(&kt->lost_records); - - return 0; -} - -static struct dentry *kvm_create_buf_file_callack(const char *filename, - struct dentry *parent, - int mode, - struct rchan_buf *buf, - int *is_global) -{ - return debugfs_create_file(filename, mode, parent, buf, - &relay_file_operations); -} - -static int kvm_remove_buf_file_callback(struct dentry *dentry) -{ - debugfs_remove(dentry); - return 0; -} - -static struct rchan_callbacks kvm_relay_callbacks = { - .subbuf_start = kvm_subbuf_start_callback, - .create_buf_file = kvm_create_buf_file_callack, - .remove_buf_file = kvm_remove_buf_file_callback, -}; - -static int do_kvm_trace_enable(struct kvm_user_trace_setup *kuts) -{ - struct kvm_trace *kt; - int i, r = -ENOMEM; - - if (!kuts->buf_size || !kuts->buf_nr) - return -EINVAL; - - kt = kzalloc(sizeof(*kt), GFP_KERNEL); - if (!kt) - goto err; - - r = -EIO; - atomic_set(&kt->lost_records, 0); - kt->lost_file = debugfs_create_file("lost_records", 0444, kvm_debugfs_dir, - kt, &kvm_trace_lost_ops); - if (!kt->lost_file) - goto err; - - kt->rchan = relay_open("trace", kvm_debugfs_dir, kuts->buf_size, - kuts->buf_nr, &kvm_relay_callbacks, kt); - if (!kt->rchan) - goto err; - - kvm_trace = kt; - - for (i = 0; i < ARRAY_SIZE(kvm_trace_probes); i++) { - struct kvm_trace_probe *p = &kvm_trace_probes[i]; - - r = marker_probe_register(p->name, p->format, p->probe_func, p); - if (r) - printk(KERN_INFO "Unable to register probe %s\n", - p->name); - } - - kvm_trace->trace_state = KVM_TRACE_STATE_RUNNING; - - return 0; -err: - if (kt) { - if (kt->lost_file) - debugfs_remove(kt->lost_file); - if (kt->rchan) - relay_close(kt->rchan); - kfree(kt); - } - return r; -} - -static int kvm_trace_enable(char __user *arg) -{ - struct kvm_user_trace_setup kuts; - int ret; - - ret = copy_from_user(&kuts, arg, sizeof(kuts)); - if (ret) - return -EFAULT; - - ret = do_kvm_trace_enable(&kuts); - if (ret) - return ret; - - return 0; -} - -static int kvm_trace_pause(void) -{ - struct kvm_trace *kt = kvm_trace; - int r = -EINVAL; - - if (kt == NULL) - return r; - - if (kt->trace_state == KVM_TRACE_STATE_RUNNING) { - kt->trace_state = KVM_TRACE_STATE_PAUSE; - relay_flush(kt->rchan); - r = 0; - } - - return r; -} - -void kvm_trace_cleanup(void) -{ - struct kvm_trace *kt = kvm_trace; - int i; - - if (kt == NULL) - return; - - if (kt->trace_state == KVM_TRACE_STATE_RUNNING || - kt->trace_state == KVM_TRACE_STATE_PAUSE) { - - kt->trace_state = KVM_TRACE_STATE_CLEARUP; - - for (i = 0; i < ARRAY_SIZE(kvm_trace_probes); i++) { - struct kvm_trace_probe *p = &kvm_trace_probes[i]; - marker_probe_unregister(p->name, p->probe_func, p); - } - marker_synchronize_unregister(); - - relay_close(kt->rchan); - debugfs_remove(kt->lost_file); - kfree(kt); - } -} - -int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg) -{ - void __user *argp = (void __user *)arg; - long r = -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - switch (ioctl) { - case KVM_TRACE_ENABLE: - r = kvm_trace_enable(argp); - break; - case KVM_TRACE_PAUSE: - r = kvm_trace_pause(); - break; - case KVM_TRACE_DISABLE: - r = 0; - kvm_trace_cleanup(); - break; - } - - return r; -} |