author     Linus Torvalds <torvalds@linux-foundation.org>  2013-05-02 23:14:04 +0200
committer  Linus Torvalds <torvalds@linux-foundation.org>  2013-05-02 23:14:04 +0200
commit     736a2dd2571ac56b11ed95a7814d838d5311be04 (patch)
tree       de10d107025970c6e51d5b6faeba799ed4b9caae /drivers
parent     Merge tag 'vfio-for-v3.10' of git://github.com/awilliam/linux-vfio (diff)
parent     caif_virtio: Remove bouncing email addresses (diff)
Merge tag 'virtio-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux
Pull virtio & lguest updates from Rusty Russell:
 "Lots of virtio work which wasn't quite ready for last merge window.

  Plus I dived into lguest again, reworking the pagetable code so we can
  move the switcher page: our fixmaps sometimes take more than 2MB now..."

Ugh.  Annoying conflicts with the tcm_vhost -> vhost_scsi rename.
Hopefully correctly resolved.

* tag 'virtio-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux: (57 commits)
  caif_virtio: Remove bouncing email addresses
  lguest: improve code readability in lg_cpu_start.
  virtio-net: fill only rx queues which are being used
  lguest: map Switcher below fixmap.
  lguest: cache last cpu we ran on.
  lguest: map Switcher text whenever we allocate a new pagetable.
  lguest: don't share Switcher PTE pages between guests.
  lguest: expost switcher_pages array (as lg_switcher_pages).
  lguest: extract shadow PTE walking / allocating.
  lguest: make check_gpte et. al return bool.
  lguest: assume Switcher text is a single page.
  lguest: rename switcher_page to switcher_pages.
  lguest: remove RESERVE_MEM constant.
  lguest: check vaddr not pgd for Switcher protection.
  lguest: prepare to make SWITCHER_ADDR a variable.
  virtio: console: replace EMFILE with EBUSY for already-open port
  virtio-scsi: reset virtqueue affinity when doing cpu hotplug
  virtio-scsi: introduce multiqueue support
  virtio-scsi: push vq lock/unlock into virtscsi_vq_done
  virtio-scsi: pass struct virtio_scsi to virtqueue completion function
  ...
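Most of the virtio churn in the diffs below is the conversion from the old virtqueue_add_buf() call (one scatterlist array with "out" entries followed by "in" entries) to the new virtqueue_add_sgs(), virtqueue_add_outbuf() and virtqueue_add_inbuf() helpers. A minimal sketch of the new calling convention follows; it is not taken from the patch, the signatures simply follow the call sites visible in the diffs, and example_add_request(), hdr_buf, status_buf and token are made-up names (kernel context assumed: <linux/virtio.h>, <linux/scatterlist.h>).

	static int example_add_request(struct virtqueue *vq,
				       void *hdr_buf, unsigned int hdr_len,
				       u8 *status_buf, void *token)
	{
		struct scatterlist hdr, status, *sgs[2];
		unsigned int num_out = 0, num_in = 0;

		/* Device-readable entries come first in sgs[]... */
		sg_init_one(&hdr, hdr_buf, hdr_len);
		sgs[num_out++] = &hdr;

		/* ...then device-writable entries. */
		sg_init_one(&status, status_buf, sizeof(*status_buf));
		sgs[num_out + num_in++] = &status;

		/* All entries become one buffer for the device; < 0 means the ring is full. */
		return virtqueue_add_sgs(vq, sgs, num_out, num_in, token, GFP_ATOMIC);
	}

virtqueue_add_outbuf() and virtqueue_add_inbuf() are the single-direction shorthands, used below by virtio_console and virtio-rng respectively.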
Diffstat (limited to 'drivers')
-rw-r--r--  drivers/Makefile                     |    2
-rw-r--r--  drivers/block/virtio_blk.c           |  148
-rw-r--r--  drivers/char/hw_random/virtio-rng.c  |    2
-rw-r--r--  drivers/char/virtio_console.c        |   14
-rw-r--r--  drivers/lguest/Kconfig               |    5
-rw-r--r--  drivers/lguest/core.c                |   67
-rw-r--r--  drivers/lguest/lg.h                  |    6
-rw-r--r--  drivers/lguest/lguest_user.c         |    6
-rw-r--r--  drivers/lguest/page_tables.c         |  567
-rw-r--r--  drivers/lguest/x86/core.c            |    7
-rw-r--r--  drivers/net/caif/Kconfig             |   14
-rw-r--r--  drivers/net/caif/Makefile            |    3
-rw-r--r--  drivers/net/caif/caif_virtio.c       |  790
-rw-r--r--  drivers/net/virtio_net.c             |   77
-rw-r--r--  drivers/rpmsg/virtio_rpmsg_bus.c     |    8
-rw-r--r--  drivers/scsi/virtio_scsi.c           |  487
-rw-r--r--  drivers/vhost/Kconfig                |    8
-rw-r--r--  drivers/vhost/Makefile               |    2
-rw-r--r--  drivers/vhost/test.c                 |    4
-rw-r--r--  drivers/vhost/vringh.c               | 1007
-rw-r--r--  drivers/virtio/virtio_balloon.c      |    6
-rw-r--r--  drivers/virtio/virtio_ring.c         |  297
22 files changed, 2850 insertions(+), 677 deletions(-)
diff --git a/drivers/Makefile b/drivers/Makefile
index 33360de63650..8e57688ebd95 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -124,7 +124,7 @@ obj-$(CONFIG_PPC_PS3) += ps3/
obj-$(CONFIG_OF) += of/
obj-$(CONFIG_SSB) += ssb/
obj-$(CONFIG_BCMA) += bcma/
-obj-$(CONFIG_VHOST_NET) += vhost/
+obj-$(CONFIG_VHOST_RING) += vhost/
obj-$(CONFIG_VLYNQ) += vlynq/
obj-$(CONFIG_STAGING) += staging/
obj-y += platform/
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 8ad21a25bc0d..64723953e1c9 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -100,96 +100,103 @@ static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk,
return vbr;
}
-static void virtblk_add_buf_wait(struct virtio_blk *vblk,
- struct virtblk_req *vbr,
- unsigned long out,
- unsigned long in)
+static int __virtblk_add_req(struct virtqueue *vq,
+ struct virtblk_req *vbr,
+ struct scatterlist *data_sg,
+ bool have_data)
{
- DEFINE_WAIT(wait);
+ struct scatterlist hdr, status, cmd, sense, inhdr, *sgs[6];
+ unsigned int num_out = 0, num_in = 0;
+ int type = vbr->out_hdr.type & ~VIRTIO_BLK_T_OUT;
- for (;;) {
- prepare_to_wait_exclusive(&vblk->queue_wait, &wait,
- TASK_UNINTERRUPTIBLE);
+ sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
+ sgs[num_out++] = &hdr;
- spin_lock_irq(vblk->disk->queue->queue_lock);
- if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr,
- GFP_ATOMIC) < 0) {
- spin_unlock_irq(vblk->disk->queue->queue_lock);
- io_schedule();
- } else {
- virtqueue_kick(vblk->vq);
- spin_unlock_irq(vblk->disk->queue->queue_lock);
- break;
- }
+ /*
+ * If this is a packet command we need a couple of additional headers.
+ * Behind the normal outhdr we put a segment with the scsi command
+ * block, and before the normal inhdr we put the sense data and the
+ * inhdr with additional status information.
+ */
+ if (type == VIRTIO_BLK_T_SCSI_CMD) {
+ sg_init_one(&cmd, vbr->req->cmd, vbr->req->cmd_len);
+ sgs[num_out++] = &cmd;
+ }
+ if (have_data) {
+ if (vbr->out_hdr.type & VIRTIO_BLK_T_OUT)
+ sgs[num_out++] = data_sg;
+ else
+ sgs[num_out + num_in++] = data_sg;
}
- finish_wait(&vblk->queue_wait, &wait);
+ if (type == VIRTIO_BLK_T_SCSI_CMD) {
+ sg_init_one(&sense, vbr->req->sense, SCSI_SENSE_BUFFERSIZE);
+ sgs[num_out + num_in++] = &sense;
+ sg_init_one(&inhdr, &vbr->in_hdr, sizeof(vbr->in_hdr));
+ sgs[num_out + num_in++] = &inhdr;
+ }
+
+ sg_init_one(&status, &vbr->status, sizeof(vbr->status));
+ sgs[num_out + num_in++] = &status;
+
+ return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
}
-static inline void virtblk_add_req(struct virtblk_req *vbr,
- unsigned int out, unsigned int in)
+static void virtblk_add_req(struct virtblk_req *vbr, bool have_data)
{
struct virtio_blk *vblk = vbr->vblk;
+ DEFINE_WAIT(wait);
+ int ret;
spin_lock_irq(vblk->disk->queue->queue_lock);
- if (unlikely(virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr,
- GFP_ATOMIC) < 0)) {
+ while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr, vbr->sg,
+ have_data)) < 0)) {
+ prepare_to_wait_exclusive(&vblk->queue_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+
spin_unlock_irq(vblk->disk->queue->queue_lock);
- virtblk_add_buf_wait(vblk, vbr, out, in);
- return;
+ io_schedule();
+ spin_lock_irq(vblk->disk->queue->queue_lock);
+
+ finish_wait(&vblk->queue_wait, &wait);
}
+
virtqueue_kick(vblk->vq);
spin_unlock_irq(vblk->disk->queue->queue_lock);
}
-static int virtblk_bio_send_flush(struct virtblk_req *vbr)
+static void virtblk_bio_send_flush(struct virtblk_req *vbr)
{
- unsigned int out = 0, in = 0;
-
vbr->flags |= VBLK_IS_FLUSH;
vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
vbr->out_hdr.sector = 0;
vbr->out_hdr.ioprio = 0;
- sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
- sg_set_buf(&vbr->sg[out + in++], &vbr->status, sizeof(vbr->status));
-
- virtblk_add_req(vbr, out, in);
- return 0;
+ virtblk_add_req(vbr, false);
}
-static int virtblk_bio_send_data(struct virtblk_req *vbr)
+static void virtblk_bio_send_data(struct virtblk_req *vbr)
{
struct virtio_blk *vblk = vbr->vblk;
- unsigned int num, out = 0, in = 0;
struct bio *bio = vbr->bio;
+ bool have_data;
vbr->flags &= ~VBLK_IS_FLUSH;
vbr->out_hdr.type = 0;
vbr->out_hdr.sector = bio->bi_sector;
vbr->out_hdr.ioprio = bio_prio(bio);
- sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
-
- num = blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg + out);
-
- sg_set_buf(&vbr->sg[num + out + in++], &vbr->status,
- sizeof(vbr->status));
-
- if (num) {
- if (bio->bi_rw & REQ_WRITE) {
+ if (blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg)) {
+ have_data = true;
+ if (bio->bi_rw & REQ_WRITE)
vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
- out += num;
- } else {
+ else
vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
- in += num;
- }
- }
+ } else
+ have_data = false;
- virtblk_add_req(vbr, out, in);
-
- return 0;
+ virtblk_add_req(vbr, have_data);
}
static void virtblk_bio_send_data_work(struct work_struct *work)
@@ -298,7 +305,7 @@ static void virtblk_done(struct virtqueue *vq)
static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
struct request *req)
{
- unsigned long num, out = 0, in = 0;
+ unsigned int num;
struct virtblk_req *vbr;
vbr = virtblk_alloc_req(vblk, GFP_ATOMIC);
@@ -335,40 +342,15 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
}
}
- sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
-
- /*
- * If this is a packet command we need a couple of additional headers.
- * Behind the normal outhdr we put a segment with the scsi command
- * block, and before the normal inhdr we put the sense data and the
- * inhdr with additional status information before the normal inhdr.
- */
- if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC)
- sg_set_buf(&vblk->sg[out++], vbr->req->cmd, vbr->req->cmd_len);
-
- num = blk_rq_map_sg(q, vbr->req, vblk->sg + out);
-
- if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC) {
- sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, SCSI_SENSE_BUFFERSIZE);
- sg_set_buf(&vblk->sg[num + out + in++], &vbr->in_hdr,
- sizeof(vbr->in_hdr));
- }
-
- sg_set_buf(&vblk->sg[num + out + in++], &vbr->status,
- sizeof(vbr->status));
-
+ num = blk_rq_map_sg(q, vbr->req, vblk->sg);
if (num) {
- if (rq_data_dir(vbr->req) == WRITE) {
+ if (rq_data_dir(vbr->req) == WRITE)
vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
- out += num;
- } else {
+ else
vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
- in += num;
- }
}
- if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr,
- GFP_ATOMIC) < 0) {
+ if (__virtblk_add_req(vblk->vq, vbr, vblk->sg, num) < 0) {
mempool_free(vbr, vblk->pool);
return false;
}
@@ -539,6 +521,7 @@ static void virtblk_config_changed_work(struct work_struct *work)
struct virtio_device *vdev = vblk->vdev;
struct request_queue *q = vblk->disk->queue;
char cap_str_2[10], cap_str_10[10];
+ char *envp[] = { "RESIZE=1", NULL };
u64 capacity, size;
mutex_lock(&vblk->config_lock);
@@ -568,6 +551,7 @@ static void virtblk_config_changed_work(struct work_struct *work)
set_capacity(vblk->disk, capacity);
revalidate_disk(vblk->disk);
+ kobject_uevent_env(&disk_to_dev(vblk->disk)->kobj, KOBJ_CHANGE, envp);
done:
mutex_unlock(&vblk->config_lock);
}
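The reworked virtblk_add_req() above keeps submission under the queue lock and simply retries whenever the virtqueue is full, sleeping between attempts. Stripped of the driver specifics, the pattern looks roughly like this sketch (not part of the patch: "lock", "wq" and add_request() stand in for the driver's queue lock, wait queue and its virtqueue_add_sgs() wrapper):

	static void example_add_and_kick(struct virtqueue *vq, spinlock_t *lock,
					 wait_queue_head_t *wq)
	{
		DEFINE_WAIT(wait);

		spin_lock_irq(lock);
		while (add_request(vq) < 0) {		/* virtqueue is full */
			prepare_to_wait_exclusive(wq, &wait, TASK_UNINTERRUPTIBLE);
			spin_unlock_irq(lock);
			io_schedule();			/* the completion path wakes us */
			spin_lock_irq(lock);
			finish_wait(wq, &wait);
		}
		virtqueue_kick(vq);			/* notify the host */
		spin_unlock_irq(lock);
	}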
diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c
index 6bf4d47324eb..ef46a9cfd832 100644
--- a/drivers/char/hw_random/virtio-rng.c
+++ b/drivers/char/hw_random/virtio-rng.c
@@ -47,7 +47,7 @@ static void register_buffer(u8 *buf, size_t size)
sg_init_one(&sg, buf, size);
/* There should always be room for one buffer. */
- if (virtqueue_add_buf(vq, &sg, 0, 1, buf, GFP_KERNEL) < 0)
+ if (virtqueue_add_inbuf(vq, &sg, 1, buf, GFP_KERNEL) < 0)
BUG();
virtqueue_kick(vq);
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index ce5f3fc25d6d..1b456fe9b87a 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -78,8 +78,8 @@ struct ports_driver_data {
};
static struct ports_driver_data pdrvdata;
-DEFINE_SPINLOCK(pdrvdata_lock);
-DECLARE_COMPLETION(early_console_added);
+static DEFINE_SPINLOCK(pdrvdata_lock);
+static DECLARE_COMPLETION(early_console_added);
/* This struct holds information that's relevant only for console ports */
struct console {
@@ -503,7 +503,7 @@ static int add_inbuf(struct virtqueue *vq, struct port_buffer *buf)
sg_init_one(sg, buf->buf, buf->size);
- ret = virtqueue_add_buf(vq, sg, 0, 1, buf, GFP_ATOMIC);
+ ret = virtqueue_add_inbuf(vq, sg, 1, buf, GFP_ATOMIC);
virtqueue_kick(vq);
if (!ret)
ret = vq->num_free;
@@ -572,7 +572,7 @@ static ssize_t __send_control_msg(struct ports_device *portdev, u32 port_id,
sg_init_one(sg, &cpkt, sizeof(cpkt));
spin_lock(&portdev->c_ovq_lock);
- if (virtqueue_add_buf(vq, sg, 1, 0, &cpkt, GFP_ATOMIC) == 0) {
+ if (virtqueue_add_outbuf(vq, sg, 1, &cpkt, GFP_ATOMIC) == 0) {
virtqueue_kick(vq);
while (!virtqueue_get_buf(vq, &len))
cpu_relax();
@@ -622,7 +622,7 @@ static ssize_t __send_to_port(struct port *port, struct scatterlist *sg,
reclaim_consumed_buffers(port);
- err = virtqueue_add_buf(out_vq, sg, nents, 0, data, GFP_ATOMIC);
+ err = virtqueue_add_outbuf(out_vq, sg, nents, data, GFP_ATOMIC);
/* Tell Host to go! */
virtqueue_kick(out_vq);
@@ -1040,7 +1040,7 @@ static int port_fops_open(struct inode *inode, struct file *filp)
spin_lock_irq(&port->inbuf_lock);
if (port->guest_connected) {
spin_unlock_irq(&port->inbuf_lock);
- ret = -EMFILE;
+ ret = -EBUSY;
goto out;
}
@@ -1202,7 +1202,7 @@ int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int))
return hvc_instantiate(0, 0, &hv_ops);
}
-int init_port_console(struct port *port)
+static int init_port_console(struct port *port)
{
int ret;
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
index 89875ea19ade..ee035ec4526b 100644
--- a/drivers/lguest/Kconfig
+++ b/drivers/lguest/Kconfig
@@ -5,10 +5,9 @@ config LGUEST
---help---
This is a very simple module which allows you to run
multiple instances of the same Linux kernel, using the
- "lguest" command found in the Documentation/virtual/lguest
- directory.
+ "lguest" command found in the tools/lguest directory.
Note that "lguest" is pronounced to rhyme with "fell quest",
- not "rustyvisor". See Documentation/virtual/lguest/lguest.txt.
+ not "rustyvisor". See tools/lguest/lguest.txt.
If unsure, say N. If curious, say M. If masochistic, say Y.
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index a5ebc0083d87..0bf1e4edf04d 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -20,9 +20,9 @@
#include <asm/asm-offsets.h>
#include "lg.h"
-
+unsigned long switcher_addr;
+struct page **lg_switcher_pages;
static struct vm_struct *switcher_vma;
-static struct page **switcher_page;
/* This One Big lock protects all inter-guest data structures. */
DEFINE_MUTEX(lguest_lock);
@@ -52,13 +52,21 @@ static __init int map_switcher(void)
* easy.
*/
+ /* We assume Switcher text fits into a single page. */
+ if (end_switcher_text - start_switcher_text > PAGE_SIZE) {
+ printk(KERN_ERR "lguest: switcher text too large (%zu)\n",
+ end_switcher_text - start_switcher_text);
+ return -EINVAL;
+ }
+
/*
* We allocate an array of struct page pointers. map_vm_area() wants
* this, rather than just an array of pages.
*/
- switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES,
- GFP_KERNEL);
- if (!switcher_page) {
+ lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0])
+ * TOTAL_SWITCHER_PAGES,
+ GFP_KERNEL);
+ if (!lg_switcher_pages) {
err = -ENOMEM;
goto out;
}
@@ -68,32 +76,29 @@ static __init int map_switcher(void)
* so we make sure they're zeroed.
*/
for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
- switcher_page[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
- if (!switcher_page[i]) {
+ lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
+ if (!lg_switcher_pages[i]) {
err = -ENOMEM;
goto free_some_pages;
}
}
/*
- * First we check that the Switcher won't overlap the fixmap area at
- * the top of memory. It's currently nowhere near, but it could have
- * very strange effects if it ever happened.
+ * We place the Switcher underneath the fixmap area, which is the
+ * highest virtual address we can get. This is important, since we
+ * tell the Guest it can't access this memory, so we want its ceiling
+ * as high as possible.
*/
- if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){
- err = -ENOMEM;
- printk("lguest: mapping switcher would thwack fixmap\n");
- goto free_pages;
- }
+ switcher_addr = FIXADDR_START - (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE;
/*
- * Now we reserve the "virtual memory area" we want: 0xFFC00000
- * (SWITCHER_ADDR). We might not get it in theory, but in practice
- * it's worked so far. The end address needs +1 because __get_vm_area
- * allocates an extra guard page, so we need space for that.
+ * Now we reserve the "virtual memory area" we want. We might
+ * not get it in theory, but in practice it's worked so far.
+ * The end address needs +1 because __get_vm_area allocates an
+ * extra guard page, so we need space for that.
*/
switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
- VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR
+ VM_ALLOC, switcher_addr, switcher_addr
+ (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE);
if (!switcher_vma) {
err = -ENOMEM;
@@ -103,12 +108,12 @@ static __init int map_switcher(void)
/*
* This code actually sets up the pages we've allocated to appear at
- * SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the
+ * switcher_addr. map_vm_area() takes the vma we allocated above, the
* kind of pages we're mapping (kernel pages), and a pointer to our
* array of struct pages. It increments that pointer, but we don't
* care.
*/
- pagep = switcher_page;
+ pagep = lg_switcher_pages;
err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep);
if (err) {
printk("lguest: map_vm_area failed: %i\n", err);
@@ -133,8 +138,8 @@ free_pages:
i = TOTAL_SWITCHER_PAGES;
free_some_pages:
for (--i; i >= 0; i--)
- __free_pages(switcher_page[i], 0);
- kfree(switcher_page);
+ __free_pages(lg_switcher_pages[i], 0);
+ kfree(lg_switcher_pages);
out:
return err;
}
@@ -149,8 +154,8 @@ static void unmap_switcher(void)
vunmap(switcher_vma->addr);
/* Now we just need to free the pages we copied the switcher into */
for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
- __free_pages(switcher_page[i], 0);
- kfree(switcher_page);
+ __free_pages(lg_switcher_pages[i], 0);
+ kfree(lg_switcher_pages);
}
/*H:032
@@ -323,15 +328,10 @@ static int __init init(void)
if (err)
goto out;
- /* Now we set up the pagetable implementation for the Guests. */
- err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES);
- if (err)
- goto unmap;
-
/* We might need to reserve an interrupt vector. */
err = init_interrupts();
if (err)
- goto free_pgtables;
+ goto unmap;
/* /dev/lguest needs to be registered. */
err = lguest_device_init();
@@ -346,8 +346,6 @@ static int __init init(void)
free_interrupts:
free_interrupts();
-free_pgtables:
- free_pagetables();
unmap:
unmap_switcher();
out:
@@ -359,7 +357,6 @@ static void __exit fini(void)
{
lguest_device_remove();
free_interrupts();
- free_pagetables();
unmap_switcher();
lguest_arch_host_fini();
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 295df06e6590..2eef40be4c04 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -14,11 +14,10 @@
#include <asm/lguest.h>
-void free_pagetables(void);
-int init_pagetables(struct page **switcher_page, unsigned int pages);
-
struct pgdir {
unsigned long gpgdir;
+ bool switcher_mapped;
+ int last_host_cpu;
pgd_t *pgdir;
};
@@ -124,6 +123,7 @@ bool lguest_address_ok(const struct lguest *lg,
unsigned long addr, unsigned long len);
void __lgread(struct lg_cpu *, void *, unsigned long, unsigned);
void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned);
+extern struct page **lg_switcher_pages;
/*H:035
* Using memory-copy operations like that is usually inconvient, so we
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index ff4a0bc9904d..4263f4cc8c55 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -250,13 +250,13 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
*/
static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
{
- /* We have a limited number the number of CPUs in the lguest struct. */
+ /* We have a limited number of CPUs in the lguest struct. */
if (id >= ARRAY_SIZE(cpu->lg->cpus))
return -EINVAL;
/* Set up this CPU's id, and pointer back to the lguest struct. */
cpu->id = id;
- cpu->lg = container_of((cpu - id), struct lguest, cpus[0]);
+ cpu->lg = container_of(cpu, struct lguest, cpus[id]);
cpu->lg->nr_cpus++;
/* Each CPU has a timer it can set. */
@@ -270,7 +270,7 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
if (!cpu->regs_page)
return -ENOMEM;
- /* We actually put the registers at the bottom of the page. */
+ /* We actually put the registers at the end of the page. */
cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs);
/*
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 864baabaee25..699187ab3800 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -7,7 +7,7 @@
* converted Guest pages when running the Guest.
:*/
-/* Copyright (C) Rusty Russell IBM Corporation 2006.
+/* Copyright (C) Rusty Russell IBM Corporation 2013.
* GPL v2 and any later version */
#include <linux/mm.h>
#include <linux/gfp.h>
@@ -62,22 +62,11 @@
* will need the last pmd entry of the last pmd page.
*/
#ifdef CONFIG_X86_PAE
-#define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1)
-#define RESERVE_MEM 2U
#define CHECK_GPGD_MASK _PAGE_PRESENT
#else
-#define RESERVE_MEM 4U
#define CHECK_GPGD_MASK _PAGE_TABLE
#endif
-/*
- * We actually need a separate PTE page for each CPU. Remember that after the
- * Switcher code itself comes two pages for each CPU, and we don't want this
- * CPU's guest to see the pages of any other CPU.
- */
-static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
-#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
-
/*H:320
* The page table code is curly enough to need helper functions to keep it
* clear and clean. The kernel itself provides many of them; one advantage
@@ -95,13 +84,6 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
{
unsigned int index = pgd_index(vaddr);
-#ifndef CONFIG_X86_PAE
- /* We kill any Guest trying to touch the Switcher addresses. */
- if (index >= SWITCHER_PGD_INDEX) {
- kill_guest(cpu, "attempt to access switcher pages");
- index = 0;
- }
-#endif
/* Return a pointer index'th pgd entry for the i'th page table. */
return &cpu->lg->pgdirs[i].pgdir[index];
}
@@ -117,13 +99,6 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
unsigned int index = pmd_index(vaddr);
pmd_t *page;
- /* We kill any Guest trying to touch the Switcher addresses. */
- if (pgd_index(vaddr) == SWITCHER_PGD_INDEX &&
- index >= SWITCHER_PMD_INDEX) {
- kill_guest(cpu, "attempt to access switcher pages");
- index = 0;
- }
-
/* You should never call this if the PGD entry wasn't valid */
BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
@@ -275,122 +250,177 @@ static void release_pte(pte_t pte)
}
/*:*/
-static void check_gpte(struct lg_cpu *cpu, pte_t gpte)
+static bool check_gpte(struct lg_cpu *cpu, pte_t gpte)
{
if ((pte_flags(gpte) & _PAGE_PSE) ||
- pte_pfn(gpte) >= cpu->lg->pfn_limit)
+ pte_pfn(gpte) >= cpu->lg->pfn_limit) {
kill_guest(cpu, "bad page table entry");
+ return false;
+ }
+ return true;
}
-static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
+static bool check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
{
if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) ||
- (pgd_pfn(gpgd) >= cpu->lg->pfn_limit))
+ (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) {
kill_guest(cpu, "bad page directory entry");
+ return false;
+ }
+ return true;
}
#ifdef CONFIG_X86_PAE
-static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
+static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
{
if ((pmd_flags(gpmd) & ~_PAGE_TABLE) ||
- (pmd_pfn(gpmd) >= cpu->lg->pfn_limit))
+ (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) {
kill_guest(cpu, "bad page middle directory entry");
+ return false;
+ }
+ return true;
}
#endif
-/*H:330
- * (i) Looking up a page table entry when the Guest faults.
- *
- * We saw this call in run_guest(): when we see a page fault in the Guest, we
- * come here. That's because we only set up the shadow page tables lazily as
- * they're needed, so we get page faults all the time and quietly fix them up
- * and return to the Guest without it knowing.
+/*H:331
+ * This is the core routine to walk the shadow page tables and find the page
+ * table entry for a specific address.
*
- * If we fixed up the fault (ie. we mapped the address), this routine returns
- * true. Otherwise, it was a real fault and we need to tell the Guest.
+ * If allocate is set, then we allocate any missing levels, setting the flags
+ * on the new page directory and mid-level directories using the arguments
+ * (which are copied from the Guest's page table entries).
*/
-bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
+static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate,
+ int pgd_flags, int pmd_flags)
{
- pgd_t gpgd;
pgd_t *spgd;
- unsigned long gpte_ptr;
- pte_t gpte;
- pte_t *spte;
-
/* Mid level for PAE. */
#ifdef CONFIG_X86_PAE
pmd_t *spmd;
- pmd_t gpmd;
#endif
- /* First step: get the top-level Guest page table entry. */
- if (unlikely(cpu->linear_pages)) {
- /* Faking up a linear mapping. */
- gpgd = __pgd(CHECK_GPGD_MASK);
- } else {
- gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
- /* Toplevel not present? We can't map it in. */
- if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
- return false;
- }
-
- /* Now look at the matching shadow entry. */
+ /* Get top level entry. */
spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
/* No shadow entry: allocate a new shadow PTE page. */
- unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
+ unsigned long ptepage;
+
+ /* If they didn't want us to allocate anything, stop. */
+ if (!allocate)
+ return NULL;
+
+ ptepage = get_zeroed_page(GFP_KERNEL);
/*
* This is not really the Guest's fault, but killing it is
* simple for this corner case.
*/
if (!ptepage) {
kill_guest(cpu, "out of memory allocating pte page");
- return false;
+ return NULL;
}
- /* We check that the Guest pgd is OK. */
- check_gpgd(cpu, gpgd);
/*
* And we copy the flags to the shadow PGD entry. The page
* number in the shadow PGD is the page we just allocated.
*/
- set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd)));
+ set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags));
}
+ /*
+ * Intel's Physical Address Extension actually uses three levels of
+ * page tables, so we need to look in the mid-level.
+ */
#ifdef CONFIG_X86_PAE
- if (unlikely(cpu->linear_pages)) {
- /* Faking up a linear mapping. */
- gpmd = __pmd(_PAGE_TABLE);
- } else {
- gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
- /* Middle level not present? We can't map it in. */
- if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
- return false;
- }
-
- /* Now look at the matching shadow entry. */
+ /* Now look at the mid-level shadow entry. */
spmd = spmd_addr(cpu, *spgd, vaddr);
if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
/* No shadow entry: allocate a new shadow PTE page. */
- unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
+ unsigned long ptepage;
+
+ /* If they didn't want us to allocate anything, stop. */
+ if (!allocate)
+ return NULL;
+
+ ptepage = get_zeroed_page(GFP_KERNEL);
/*
* This is not really the Guest's fault, but killing it is
* simple for this corner case.
*/
if (!ptepage) {
- kill_guest(cpu, "out of memory allocating pte page");
- return false;
+ kill_guest(cpu, "out of memory allocating pmd page");
+ return NULL;
}
- /* We check that the Guest pmd is OK. */
- check_gpmd(cpu, gpmd);
-
/*
* And we copy the flags to the shadow PMD entry. The page
* number in the shadow PMD is the page we just allocated.
*/
- set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd)));
+ set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags));
+ }
+#endif
+
+ /* Get the pointer to the shadow PTE entry we're going to set. */
+ return spte_addr(cpu, *spgd, vaddr);
+}
+
+/*H:330
+ * (i) Looking up a page table entry when the Guest faults.
+ *
+ * We saw this call in run_guest(): when we see a page fault in the Guest, we
+ * come here. That's because we only set up the shadow page tables lazily as
+ * they're needed, so we get page faults all the time and quietly fix them up
+ * and return to the Guest without it knowing.
+ *
+ * If we fixed up the fault (ie. we mapped the address), this routine returns
+ * true. Otherwise, it was a real fault and we need to tell the Guest.
+ */
+bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
+{
+ unsigned long gpte_ptr;
+ pte_t gpte;
+ pte_t *spte;
+ pmd_t gpmd;
+ pgd_t gpgd;
+
+ /* We never demand page the Switcher, so trying is a mistake. */
+ if (vaddr >= switcher_addr)
+ return false;
+
+ /* First step: get the top-level Guest page table entry. */
+ if (unlikely(cpu->linear_pages)) {
+ /* Faking up a linear mapping. */
+ gpgd = __pgd(CHECK_GPGD_MASK);
+ } else {
+ gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
+ /* Toplevel not present? We can't map it in. */
+ if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
+ return false;
+
+ /*
+ * This kills the Guest if it has weird flags or tries to
+ * refer to a "physical" address outside the bounds.
+ */
+ if (!check_gpgd(cpu, gpgd))
+ return false;
+ }
+
+ /* This "mid-level" entry is only used for non-linear, PAE mode. */
+ gpmd = __pmd(_PAGE_TABLE);
+
+#ifdef CONFIG_X86_PAE
+ if (likely(!cpu->linear_pages)) {
+ gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
+ /* Middle level not present? We can't map it in. */
+ if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
+ return false;
+
+ /*
+ * This kills the Guest if it has weird flags or tries to
+ * refer to a "physical" address outside the bounds.
+ */
+ if (!check_gpmd(cpu, gpmd))
+ return false;
}
/*
@@ -433,7 +463,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
* Check that the Guest PTE flags are OK, and the page number is below
* the pfn_limit (ie. not mapping the Launcher binary).
*/
- check_gpte(cpu, gpte);
+ if (!check_gpte(cpu, gpte))
+ return false;
/* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
gpte = pte_mkyoung(gpte);
@@ -441,7 +472,9 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
gpte = pte_mkdirty(gpte);
/* Get the pointer to the shadow PTE entry we're going to set. */
- spte = spte_addr(cpu, *spgd, vaddr);
+ spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd));
+ if (!spte)
+ return false;
/*
* If there was a valid shadow PTE entry here before, we release it.
@@ -493,29 +526,23 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
*/
static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
{
- pgd_t *spgd;
+ pte_t *spte;
unsigned long flags;
-#ifdef CONFIG_X86_PAE
- pmd_t *spmd;
-#endif
- /* Look at the current top level entry: is it present? */
- spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
- if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
+ /* You can't put your stack in the Switcher! */
+ if (vaddr >= switcher_addr)
return false;
-#ifdef CONFIG_X86_PAE
- spmd = spmd_addr(cpu, *spgd, vaddr);
- if (!(pmd_flags(*spmd) & _PAGE_PRESENT))
+ /* If there's no shadow PTE, it's not writable. */
+ spte = find_spte(cpu, vaddr, false, 0, 0);
+ if (!spte)
return false;
-#endif
/*
* Check the flags on the pte entry itself: it must be present and
* writable.
*/
- flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr)));
-
+ flags = pte_flags(*spte);
return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
}
@@ -678,9 +705,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
int *blank_pgdir)
{
unsigned int next;
-#ifdef CONFIG_X86_PAE
- pmd_t *pmd_table;
-#endif
/*
* We pick one entry at random to throw out. Choosing the Least
@@ -695,29 +719,11 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
if (!cpu->lg->pgdirs[next].pgdir)
next = cpu->cpu_pgd;
else {
-#ifdef CONFIG_X86_PAE
/*
- * In PAE mode, allocate a pmd page and populate the
- * last pgd entry.
+ * This is a blank page, so there are no kernel
+ * mappings: caller must map the stack!
*/
- pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL);
- if (!pmd_table) {
- free_page((long)cpu->lg->pgdirs[next].pgdir);
- set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0));
- next = cpu->cpu_pgd;
- } else {
- set_pgd(cpu->lg->pgdirs[next].pgdir +
- SWITCHER_PGD_INDEX,
- __pgd(__pa(pmd_table) | _PAGE_PRESENT));
- /*
- * This is a blank page, so there are no kernel
- * mappings: caller must map the stack!
- */
- *blank_pgdir = 1;
- }
-#else
*blank_pgdir = 1;
-#endif
}
}
/* Record which Guest toplevel this shadows. */
@@ -725,9 +731,50 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
/* Release all the non-kernel mappings. */
flush_user_mappings(cpu->lg, next);
+ /* This hasn't run on any CPU at all. */
+ cpu->lg->pgdirs[next].last_host_cpu = -1;
+
return next;
}
+/*H:501
+ * We do need the Switcher code mapped at all times, so we allocate that
+ * part of the Guest page table here. We map the Switcher code immediately,
+ * but defer mapping of the guest register page and IDT/LDT etc page until
+ * just before we run the guest in map_switcher_in_guest().
+ *
+ * We *could* do this setup in map_switcher_in_guest(), but at that point
+ * we've interrupts disabled, and allocating pages like that is fraught: we
+ * can't sleep if we need to free up some memory.
+ */
+static bool allocate_switcher_mapping(struct lg_cpu *cpu)
+{
+ int i;
+
+ for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
+ pte_t *pte = find_spte(cpu, switcher_addr + i * PAGE_SIZE, true,
+ CHECK_GPGD_MASK, _PAGE_TABLE);
+ if (!pte)
+ return false;
+
+ /*
+ * Map the switcher page if not already there. It might
+ * already be there because we call allocate_switcher_mapping()
+ * in guest_set_pgd() just in case it did discard our Switcher
+ * mapping, but it probably didn't.
+ */
+ if (i == 0 && !(pte_flags(*pte) & _PAGE_PRESENT)) {
+ /* Get a reference to the Switcher page. */
+ get_page(lg_switcher_pages[0]);
+ /* Create a read-only, exectuable, kernel-style PTE */
+ set_pte(pte,
+ mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX));
+ }
+ }
+ cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped = true;
+ return true;
+}
+
/*H:470
* Finally, a routine which throws away everything: all PGD entries in all
* the shadow page tables, including the Guest's kernel mappings. This is used
@@ -738,28 +785,16 @@ static void release_all_pagetables(struct lguest *lg)
unsigned int i, j;
/* Every shadow pagetable this Guest has */
- for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
- if (lg->pgdirs[i].pgdir) {
-#ifdef CONFIG_X86_PAE
- pgd_t *spgd;
- pmd_t *pmdpage;
- unsigned int k;
-
- /* Get the last pmd page. */
- spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX;
- pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
-
- /*
- * And release the pmd entries of that pmd page,
- * except for the switcher pmd.
- */
- for (k = 0; k < SWITCHER_PMD_INDEX; k++)
- release_pmd(&pmdpage[k]);
-#endif
- /* Every PGD entry except the Switcher at the top */
- for (j = 0; j < SWITCHER_PGD_INDEX; j++)
- release_pgd(lg->pgdirs[i].pgdir + j);
- }
+ for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) {
+ if (!lg->pgdirs[i].pgdir)
+ continue;
+
+ /* Every PGD entry. */
+ for (j = 0; j < PTRS_PER_PGD; j++)
+ release_pgd(lg->pgdirs[i].pgdir + j);
+ lg->pgdirs[i].switcher_mapped = false;
+ lg->pgdirs[i].last_host_cpu = -1;
+ }
}
/*
@@ -773,6 +808,9 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu)
release_all_pagetables(cpu->lg);
/* We need the Guest kernel stack mapped again. */
pin_stack_pages(cpu);
+ /* And we need Switcher allocated. */
+ if (!allocate_switcher_mapping(cpu))
+ kill_guest(cpu, "Cannot populate switcher mapping");
}
/*H:430
@@ -808,9 +846,17 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
newpgdir = new_pgdir(cpu, pgtable, &repin);
/* Change the current pgd index to the new one. */
cpu->cpu_pgd = newpgdir;
- /* If it was completely blank, we map in the Guest kernel stack */
+ /*
+ * If it was completely blank, we map in the Guest kernel stack and
+ * the Switcher.
+ */
if (repin)
pin_stack_pages(cpu);
+
+ if (!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped) {
+ if (!allocate_switcher_mapping(cpu))
+ kill_guest(cpu, "Cannot populate switcher mapping");
+ }
}
/*:*/
@@ -865,7 +911,8 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,
* micro-benchmark.
*/
if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
- check_gpte(cpu, gpte);
+ if (!check_gpte(cpu, gpte))
+ return;
set_pte(spte,
gpte_to_spte(cpu, gpte,
pte_flags(gpte) & _PAGE_DIRTY));
@@ -897,6 +944,12 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,
void guest_set_pte(struct lg_cpu *cpu,
unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
{
+ /* We don't let you remap the Switcher; we need it to get back! */
+ if (vaddr >= switcher_addr) {
+ kill_guest(cpu, "attempt to set pte into Switcher pages");
+ return;
+ }
+
/*
* Kernel mappings must be changed on all top levels. Slow, but doesn't
* happen often.
@@ -933,14 +986,23 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
{
int pgdir;
- if (idx >= SWITCHER_PGD_INDEX)
+ if (idx > PTRS_PER_PGD) {
+ kill_guest(&lg->cpus[0], "Attempt to set pgd %u/%u",
+ idx, PTRS_PER_PGD);
return;
+ }
/* If they're talking about a page table we have a shadow for... */
pgdir = find_pgdir(lg, gpgdir);
- if (pgdir < ARRAY_SIZE(lg->pgdirs))
+ if (pgdir < ARRAY_SIZE(lg->pgdirs)) {
/* ... throw it away. */
release_pgd(lg->pgdirs[pgdir].pgdir + idx);
+ /* That might have been the Switcher mapping, remap it. */
+ if (!allocate_switcher_mapping(&lg->cpus[0])) {
+ kill_guest(&lg->cpus[0],
+ "Cannot populate switcher mapping");
+ }
+ }
}
#ifdef CONFIG_X86_PAE
@@ -958,6 +1020,9 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
* we will populate on future faults. The Guest doesn't have any actual
* pagetables yet, so we set linear_pages to tell demand_page() to fake it
* for the moment.
+ *
+ * We do need the Switcher to be mapped at all times, so we allocate that
+ * part of the Guest page table here.
*/
int init_guest_pagetable(struct lguest *lg)
{
@@ -971,21 +1036,34 @@ int init_guest_pagetable(struct lguest *lg)
/* We start with a linear mapping until the initialize. */
cpu->linear_pages = true;
+
+ /* Allocate the page tables for the Switcher. */
+ if (!allocate_switcher_mapping(cpu)) {
+ release_all_pagetables(lg);
+ return -ENOMEM;
+ }
+
return 0;
}
/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
void page_table_guest_data_init(struct lg_cpu *cpu)
{
+ /*
+ * We tell the Guest that it can't use the virtual addresses
+ * used by the Switcher. This trick is equivalent to 4GB -
+ * switcher_addr.
+ */
+ u32 top = ~switcher_addr + 1;
+
/* We get the kernel address: above this is all kernel memory. */
if (get_user(cpu->lg->kernel_address,
- &cpu->lg->lguest_data->kernel_address)
+ &cpu->lg->lguest_data->kernel_address)
/*
- * We tell the Guest that it can't use the top 2 or 4 MB
- * of virtual addresses used by the Switcher.
+ * We tell the Guest that it can't use the top virtual
+ * addresses (used by the Switcher).
*/
- || put_user(RESERVE_MEM * 1024 * 1024,
- &cpu->lg->lguest_data->reserve_mem)) {
+ || put_user(top, &cpu->lg->lguest_data->reserve_mem)) {
kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
return;
}
@@ -995,12 +1073,7 @@ void page_table_guest_data_init(struct lg_cpu *cpu)
* "pgd_index(lg->kernel_address)". This assumes it won't hit the
* Switcher mappings, so check that now.
*/
-#ifdef CONFIG_X86_PAE
- if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX &&
- pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX)
-#else
- if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX)
-#endif
+ if (cpu->lg->kernel_address >= switcher_addr)
kill_guest(cpu, "bad kernel address %#lx",
cpu->lg->kernel_address);
}
@@ -1017,102 +1090,96 @@ void free_guest_pagetable(struct lguest *lg)
free_page((long)lg->pgdirs[i].pgdir);
}
-/*H:480
- * (vi) Mapping the Switcher when the Guest is about to run.
- *
- * The Switcher and the two pages for this CPU need to be visible in the
- * Guest (and not the pages for other CPUs). We have the appropriate PTE pages
- * for each CPU already set up, we just need to hook them in now we know which
- * Guest is about to run on this CPU.
+/*H:481
+ * This clears the Switcher mappings for cpu #i.
*/
-void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
+static void remove_switcher_percpu_map(struct lg_cpu *cpu, unsigned int i)
{
- pte_t *switcher_pte_page = __this_cpu_read(switcher_pte_pages);
- pte_t regs_pte;
+ unsigned long base = switcher_addr + PAGE_SIZE + i * PAGE_SIZE*2;
+ pte_t *pte;
-#ifdef CONFIG_X86_PAE
- pmd_t switcher_pmd;
- pmd_t *pmd_table;
-
- switcher_pmd = pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT,
- PAGE_KERNEL_EXEC);
-
- /* Figure out where the pmd page is, by reading the PGD, and converting
- * it to a virtual address. */
- pmd_table = __va(pgd_pfn(cpu->lg->
- pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX])
- << PAGE_SHIFT);
- /* Now write it into the shadow page table. */
- set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd);
-#else
- pgd_t switcher_pgd;
+ /* Clear the mappings for both pages. */
+ pte = find_spte(cpu, base, false, 0, 0);
+ release_pte(*pte);
+ set_pte(pte, __pte(0));
- /*
- * Make the last PGD entry for this Guest point to the Switcher's PTE
- * page for this CPU (with appropriate flags).
- */
- switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC);
-
- cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
-
-#endif
- /*
- * We also change the Switcher PTE page. When we're running the Guest,
- * we want the Guest's "regs" page to appear where the first Switcher
- * page for this CPU is. This is an optimization: when the Switcher
- * saves the Guest registers, it saves them into the first page of this
- * CPU's "struct lguest_pages": if we make sure the Guest's register
- * page is already mapped there, we don't have to copy them out
- * again.
- */
- regs_pte = pfn_pte(__pa(cpu->regs_page) >> PAGE_SHIFT, PAGE_KERNEL);
- set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], regs_pte);
+ pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
+ release_pte(*pte);
+ set_pte(pte, __pte(0));
}
-/*:*/
-static void free_switcher_pte_pages(void)
-{
- unsigned int i;
-
- for_each_possible_cpu(i)
- free_page((long)switcher_pte_page(i));
-}
-
-/*H:520
- * Setting up the Switcher PTE page for given CPU is fairly easy, given
- * the CPU number and the "struct page"s for the Switcher code itself.
+/*H:480
+ * (vi) Mapping the Switcher when the Guest is about to run.
+ *
+ * The Switcher and the two pages for this CPU need to be visible in the Guest
+ * (and not the pages for other CPUs).
*
- * Currently the Switcher is less than a page long, so "pages" is always 1.
+ * The pages for the pagetables have all been allocated before: we just need
+ * to make sure the actual PTEs are up-to-date for the CPU we're about to run
+ * on.
*/
-static __init void populate_switcher_pte_page(unsigned int cpu,
- struct page *switcher_page[],
- unsigned int pages)
+void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
{
- unsigned int i;
- pte_t *pte = switcher_pte_page(cpu);
+ unsigned long base;
+ struct page *percpu_switcher_page, *regs_page;
+ pte_t *pte;
+ struct pgdir *pgdir = &cpu->lg->pgdirs[cpu->cpu_pgd];
+
+ /* Switcher page should always be mapped by now! */
+ BUG_ON(!pgdir->switcher_mapped);
+
+ /*
+ * Remember that we have two pages for each Host CPU, so we can run a
+ * Guest on each CPU without them interfering. We need to make sure
+ * those pages are mapped correctly in the Guest, but since we usually
+ * run on the same CPU, we cache that, and only update the mappings
+ * when we move.
+ */
+ if (pgdir->last_host_cpu == raw_smp_processor_id())
+ return;
- /* The first entries are easy: they map the Switcher code. */
- for (i = 0; i < pages; i++) {
- set_pte(&pte[i], mk_pte(switcher_page[i],
- __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
+ /* -1 means unknown so we remove everything. */
+ if (pgdir->last_host_cpu == -1) {
+ unsigned int i;
+ for_each_possible_cpu(i)
+ remove_switcher_percpu_map(cpu, i);
+ } else {
+ /* We know exactly what CPU mapping to remove. */
+ remove_switcher_percpu_map(cpu, pgdir->last_host_cpu);
}
- /* The only other thing we map is this CPU's pair of pages. */
- i = pages + cpu*2;
-
- /* First page (Guest registers) is writable from the Guest */
- set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]),
- __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)));
+ /*
+ * When we're running the Guest, we want the Guest's "regs" page to
+ * appear where the first Switcher page for this CPU is. This is an
+ * optimization: when the Switcher saves the Guest registers, it saves
+ * them into the first page of this CPU's "struct lguest_pages": if we
+ * make sure the Guest's register page is already mapped there, we
+ * don't have to copy them out again.
+ */
+ /* Find the shadow PTE for this regs page. */
+ base = switcher_addr + PAGE_SIZE
+ + raw_smp_processor_id() * sizeof(struct lguest_pages);
+ pte = find_spte(cpu, base, false, 0, 0);
+ regs_page = pfn_to_page(__pa(cpu->regs_page) >> PAGE_SHIFT);
+ get_page(regs_page);
+ set_pte(pte, mk_pte(regs_page, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL)));
/*
- * The second page contains the "struct lguest_ro_state", and is
- * read-only.
+ * We map the second page of the struct lguest_pages read-only in
+ * the Guest: the IDT, GDT and other things it's not supposed to
+ * change.
*/
- set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]),
- __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
+ pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
+ percpu_switcher_page
+ = lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1];
+ get_page(percpu_switcher_page);
+ set_pte(pte, mk_pte(percpu_switcher_page,
+ __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)));
+
+ pgdir->last_host_cpu = raw_smp_processor_id();
}
-/*
+/*H:490
* We've made it through the page table code. Perhaps our tired brains are
* still processing the details, or perhaps we're simply glad it's over.
*
@@ -1124,29 +1191,3 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
*
* There is just one file remaining in the Host.
*/
-
-/*H:510
- * At boot or module load time, init_pagetables() allocates and populates
- * the Switcher PTE page for each CPU.
- */
-__init int init_pagetables(struct page **switcher_page, unsigned int pages)
-{
- unsigned int i;
-
- for_each_possible_cpu(i) {
- switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL);
- if (!switcher_pte_page(i)) {
- free_switcher_pte_pages();
- return -ENOMEM;
- }
- populate_switcher_pte_page(i, switcher_page, pages);
- }
- return 0;
-}
-/*:*/
-
-/* Cleaning up simply involves freeing the PTE page for each CPU. */
-void free_pagetables(void)
-{
- free_switcher_pte_pages();
-}
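A note on the "~switcher_addr + 1" trick in page_table_guest_data_init() above: in 32-bit unsigned arithmetic, ~x + 1 equals 2^32 - x, so "top" is exactly the distance from switcher_addr up to 4GB. Using the old fixed SWITCHER_ADDR of 0xffc00000 (quoted in the comment removed from map_switcher() in core.c above) as a worked example:

	~0xffc00000 + 1 = 0x003fffff + 1 = 0x00400000 = 4MB

which matches the 4MB (RESERVE_MEM = 4U) that the removed non-PAE code used to report to the Guest.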
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 4af12e1844d5..f0a3347b6441 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -59,14 +59,13 @@ static struct {
/* Offset from where switcher.S was compiled to where we've copied it */
static unsigned long switcher_offset(void)
{
- return SWITCHER_ADDR - (unsigned long)start_switcher_text;
+ return switcher_addr - (unsigned long)start_switcher_text;
}
-/* This cpu's struct lguest_pages. */
+/* This cpu's struct lguest_pages (after the Switcher text page) */
static struct lguest_pages *lguest_pages(unsigned int cpu)
{
- return &(((struct lguest_pages *)
- (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
+ return &(((struct lguest_pages *)(switcher_addr + PAGE_SIZE))[cpu]);
}
static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu);
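Putting lguest_pages() above together with remove_switcher_percpu_map() in page_tables.c (base = switcher_addr + PAGE_SIZE + i * PAGE_SIZE*2), the new Switcher layout, as inferred from the code rather than stated verbatim in the patch, works out to one shared text page followed by two pages per Host CPU:

	switcher_addr + 0*PAGE_SIZE : Switcher text (single page, shared, read-only)
	switcher_addr + 1*PAGE_SIZE : CPU 0 regs page         \ struct lguest_pages
	switcher_addr + 2*PAGE_SIZE : CPU 0 lguest_ro_state   /  (two pages per CPU)
	switcher_addr + 3*PAGE_SIZE : CPU 1 regs page
	...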
diff --git a/drivers/net/caif/Kconfig b/drivers/net/caif/Kconfig
index a966128c2a7a..7ffc756131a2 100644
--- a/drivers/net/caif/Kconfig
+++ b/drivers/net/caif/Kconfig
@@ -40,3 +40,17 @@ config CAIF_HSI
The caif low level driver for CAIF over HSI.
Be aware that if you enable this then you also need to
enable a low-level HSI driver.
+
+config CAIF_VIRTIO
+ tristate "CAIF virtio transport driver"
+ depends on CAIF
+ select VHOST_RING
+ select VIRTIO
+ select GENERIC_ALLOCATOR
+ default n
+ ---help---
+ The caif driver for CAIF over Virtio.
+
+if CAIF_VIRTIO
+source "drivers/vhost/Kconfig"
+endif
diff --git a/drivers/net/caif/Makefile b/drivers/net/caif/Makefile
index 15a9d2fc753d..9bbd45391f6c 100644
--- a/drivers/net/caif/Makefile
+++ b/drivers/net/caif/Makefile
@@ -9,3 +9,6 @@ obj-$(CONFIG_CAIF_SPI_SLAVE) += cfspi_slave.o
# HSI interface
obj-$(CONFIG_CAIF_HSI) += caif_hsi.o
+
+# Virtio interface
+obj-$(CONFIG_CAIF_VIRTIO) += caif_virtio.o
diff --git a/drivers/net/caif/caif_virtio.c b/drivers/net/caif/caif_virtio.c
new file mode 100644
index 000000000000..b9ed1288ce2d
--- /dev/null
+++ b/drivers/net/caif/caif_virtio.c
@@ -0,0 +1,790 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2013
+ * Authors: Vicram Arv
+ * Dmitry Tarnyagin <dmitry.tarnyagin@lockless.no>
+ * Sjur Brendeland
+ * License terms: GNU General Public License (GPL) version 2
+ */
+#include <linux/module.h>
+#include <linux/if_arp.h>
+#include <linux/virtio.h>
+#include <linux/vringh.h>
+#include <linux/debugfs.h>
+#include <linux/spinlock.h>
+#include <linux/genalloc.h>
+#include <linux/interrupt.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/virtio_ids.h>
+#include <linux/virtio_caif.h>
+#include <linux/virtio_ring.h>
+#include <linux/dma-mapping.h>
+#include <net/caif/caif_dev.h>
+#include <linux/virtio_config.h>
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Vicram Arv");
+MODULE_AUTHOR("Sjur Brendeland");
+MODULE_DESCRIPTION("Virtio CAIF Driver");
+
+/* NAPI schedule quota */
+#define CFV_DEFAULT_QUOTA 32
+
+/* Defaults used if virtio config space is unavailable */
+#define CFV_DEF_MTU_SIZE 4096
+#define CFV_DEF_HEADROOM 32
+#define CFV_DEF_TAILROOM 32
+
+/* Required IP header alignment */
+#define IP_HDR_ALIGN 4
+
+/* struct cfv_napi_contxt - NAPI context info
+ * @riov: IOV holding data read from the ring. Note that riov may
+ * still hold data when cfv_rx_poll() returns.
+ * @head: Last descriptor ID we received from vringh_getdesc_kern.
+ * We use this to put descriptor back on the used ring. USHRT_MAX is
+ * used to indicate invalid head-id.
+ */
+struct cfv_napi_context {
+ struct vringh_kiov riov;
+ unsigned short head;
+};
+
+/* struct cfv_stats - statistics for debugfs
+ * @rx_napi_complete: Number of NAPI completions (RX)
+ * @rx_napi_resched: Number of calls where the full quota was used (RX)
+ * @rx_nomem: Number of SKB alloc failures (RX)
+ * @rx_kicks: Number of RX kicks
+ * @tx_full_ring: Number times TX ring was full
+ * @tx_no_mem: Number of times TX went out of memory
+ * @tx_flow_on: Number of flow on (TX)
+ * @tx_kicks: Number of TX kicks
+ */
+struct cfv_stats {
+ u32 rx_napi_complete;
+ u32 rx_napi_resched;
+ u32 rx_nomem;
+ u32 rx_kicks;
+ u32 tx_full_ring;
+ u32 tx_no_mem;
+ u32 tx_flow_on;
+ u32 tx_kicks;
+};
+
+/* struct cfv_info - Caif Virtio control structure
+ * @cfdev: caif common header
+ * @vdev: Associated virtio device
+ * @vr_rx: rx/downlink host vring
+ * @vq_tx: tx/uplink virtqueue
+ * @ndev: CAIF link layer device
+ * @watermark_tx: indicates number of free descriptors we need
+ * to reopen the tx-queues after overload.
+ * @tx_lock: protects vq_tx from concurrent use
+ * @tx_release_tasklet: Tasklet for freeing consumed TX buffers
+ * @napi: Napi context used in cfv_rx_poll()
+ * @ctx: Context data used in cfv_rx_poll()
+ * @tx_hr: transmit headroom
+ * @rx_hr: receive headroom
+ * @tx_tr: transmit tail room
+ * @rx_tr: receive tail room
+ * @mtu: transmit max size
+ * @mru: receive max size
+ * @allocsz: size of dma memory reserved for TX buffers
+ * @alloc_addr: virtual address to dma memory for TX buffers
+ * @alloc_dma: dma address to dma memory for TX buffers
+ * @genpool: Gen Pool used for allocating TX buffers
+ * @reserved_mem: Pointer to memory reserve allocated from genpool
+ * @reserved_size: Size of memory reserve allocated from genpool
+ * @stats: Statistics exposed in sysfs
+ * @debugfs: Debugfs dentry for statistic counters
+ */
+struct cfv_info {
+ struct caif_dev_common cfdev;
+ struct virtio_device *vdev;
+ struct vringh *vr_rx;
+ struct virtqueue *vq_tx;
+ struct net_device *ndev;
+ unsigned int watermark_tx;
+ /* Protect access to vq_tx */
+ spinlock_t tx_lock;
+ struct tasklet_struct tx_release_tasklet;
+ struct napi_struct napi;
+ struct cfv_napi_context ctx;
+ u16 tx_hr;
+ u16 rx_hr;
+ u16 tx_tr;
+ u16 rx_tr;
+ u32 mtu;
+ u32 mru;
+ size_t allocsz;
+ void *alloc_addr;
+ dma_addr_t alloc_dma;
+ struct gen_pool *genpool;
+ unsigned long reserved_mem;
+ size_t reserved_size;
+ struct cfv_stats stats;
+ struct dentry *debugfs;
+};
+
+/* struct buf_info - maintains transmit buffer data handle
+ * @size: size of transmit buffer
+ * @dma_handle: handle to allocated dma device memory area
+ * @vaddr: virtual address mapping to allocated memory area
+ */
+struct buf_info {
+ size_t size;
+ u8 *vaddr;
+};
+
+/* Called from virtio device, in IRQ context */
+static void cfv_release_cb(struct virtqueue *vq_tx)
+{
+ struct cfv_info *cfv = vq_tx->vdev->priv;
+
+ ++cfv->stats.tx_kicks;
+ tasklet_schedule(&cfv->tx_release_tasklet);
+}
+
+static void free_buf_info(struct cfv_info *cfv, struct buf_info *buf_info)
+{
+ if (!buf_info)
+ return;
+ gen_pool_free(cfv->genpool, (unsigned long) buf_info->vaddr,
+ buf_info->size);
+ kfree(buf_info);
+}
+
+/* This is invoked whenever the remote processor completed processing
+ * a TX msg we just sent, and the buffer is put back to the used ring.
+ */
+static void cfv_release_used_buf(struct virtqueue *vq_tx)
+{
+ struct cfv_info *cfv = vq_tx->vdev->priv;
+ unsigned long flags;
+
+ BUG_ON(vq_tx != cfv->vq_tx);
+
+ for (;;) {
+ unsigned int len;
+ struct buf_info *buf_info;
+
+ /* Get used buffer from used ring to recycle used descriptors */
+ spin_lock_irqsave(&cfv->tx_lock, flags);
+ buf_info = virtqueue_get_buf(vq_tx, &len);
+ spin_unlock_irqrestore(&cfv->tx_lock, flags);
+
+ /* Stop looping if there are no more buffers to free */
+ if (!buf_info)
+ break;
+
+ free_buf_info(cfv, buf_info);
+
+ /* watermark_tx indicates if we previously stopped the tx
+ * queues. If we have enough free stots in the virtio ring,
+ * re-establish memory reserved and open up tx queues.
+ */
+ if (cfv->vq_tx->num_free <= cfv->watermark_tx)
+ continue;
+
+ /* Re-establish memory reserve */
+ if (cfv->reserved_mem == 0 && cfv->genpool)
+ cfv->reserved_mem =
+ gen_pool_alloc(cfv->genpool,
+ cfv->reserved_size);
+
+ /* Open up the tx queues */
+ if (cfv->reserved_mem) {
+ cfv->watermark_tx =
+ virtqueue_get_vring_size(cfv->vq_tx);
+ netif_tx_wake_all_queues(cfv->ndev);
+ /* Buffers are recycled in cfv_netdev_tx, so
+ * disable notifications when queues are opened.
+ */
+ virtqueue_disable_cb(cfv->vq_tx);
+ ++cfv->stats.tx_flow_on;
+ } else {
+ /* if no memory reserve, wait for more free slots */
+ WARN_ON(cfv->watermark_tx >
+ virtqueue_get_vring_size(cfv->vq_tx));
+ cfv->watermark_tx +=
+ virtqueue_get_vring_size(cfv->vq_tx) / 4;
+ }
+ }
+}
+
+/* Allocate a SKB and copy packet data to it */
+static struct sk_buff *cfv_alloc_and_copy_skb(int *err,
+ struct cfv_info *cfv,
+ u8 *frm, u32 frm_len)
+{
+ struct sk_buff *skb;
+ u32 cfpkt_len, pad_len;
+
+ *err = 0;
+ /* Verify that packet size with down-link header and mtu size */
+ if (frm_len > cfv->mru || frm_len <= cfv->rx_hr + cfv->rx_tr) {
+ netdev_err(cfv->ndev,
+ "Invalid frmlen:%u mtu:%u hr:%d tr:%d\n",
+ frm_len, cfv->mru, cfv->rx_hr,
+ cfv->rx_tr);
+ *err = -EPROTO;
+ return NULL;
+ }
+
+ cfpkt_len = frm_len - (cfv->rx_hr + cfv->rx_tr);
+ pad_len = (unsigned long)(frm + cfv->rx_hr) & (IP_HDR_ALIGN - 1);
+
+ skb = netdev_alloc_skb(cfv->ndev, frm_len + pad_len);
+ if (!skb) {
+ *err = -ENOMEM;
+ return NULL;
+ }
+
+ skb_reserve(skb, cfv->rx_hr + pad_len);
+
+ memcpy(skb_put(skb, cfpkt_len), frm + cfv->rx_hr, cfpkt_len);
+ return skb;
+}
+
+/* Get packets from the host vring */
+static int cfv_rx_poll(struct napi_struct *napi, int quota)
+{
+ struct cfv_info *cfv = container_of(napi, struct cfv_info, napi);
+ int rxcnt = 0;
+ int err = 0;
+ void *buf;
+ struct sk_buff *skb;
+ struct vringh_kiov *riov = &cfv->ctx.riov;
+ unsigned int skb_len;
+
+again:
+ do {
+ skb = NULL;
+
+ /* Put the previous iovec back on the used ring and
+ * fetch a new iovec if we have processed all elements.
+ */
+ if (riov->i == riov->used) {
+ if (cfv->ctx.head != USHRT_MAX) {
+ vringh_complete_kern(cfv->vr_rx,
+ cfv->ctx.head,
+ 0);
+ cfv->ctx.head = USHRT_MAX;
+ }
+
+ err = vringh_getdesc_kern(
+ cfv->vr_rx,
+ riov,
+ NULL,
+ &cfv->ctx.head,
+ GFP_ATOMIC);
+
+ if (err <= 0)
+ goto exit;
+ }
+
+ buf = phys_to_virt((unsigned long) riov->iov[riov->i].iov_base);
+ /* TODO: Add check on valid buffer address */
+
+ skb = cfv_alloc_and_copy_skb(&err, cfv, buf,
+ riov->iov[riov->i].iov_len);
+ if (unlikely(err))
+ goto exit;
+
+ /* Push received packet up the stack. */
+ skb_len = skb->len;
+ skb->protocol = htons(ETH_P_CAIF);
+ skb_reset_mac_header(skb);
+ skb->dev = cfv->ndev;
+ err = netif_receive_skb(skb);
+ if (unlikely(err)) {
+ ++cfv->ndev->stats.rx_dropped;
+ } else {
+ ++cfv->ndev->stats.rx_packets;
+ cfv->ndev->stats.rx_bytes += skb_len;
+ }
+
+ ++riov->i;
+ ++rxcnt;
+ } while (rxcnt < quota);
+
+ ++cfv->stats.rx_napi_resched;
+ goto out;
+
+exit:
+ switch (err) {
+ case 0:
+ ++cfv->stats.rx_napi_complete;
+
+ /* Really out of packets? (stolen from virtio_net) */
+ napi_complete(napi);
+ if (unlikely(!vringh_notify_enable_kern(cfv->vr_rx)) &&
+ napi_schedule_prep(napi)) {
+ vringh_notify_disable_kern(cfv->vr_rx);
+ __napi_schedule(napi);
+ goto again;
+ }
+ break;
+
+ case -ENOMEM:
+ ++cfv->stats.rx_nomem;
+ dev_kfree_skb(skb);
+ /* Stop NAPI poll on OOM, we hope to be polled later */
+ napi_complete(napi);
+ vringh_notify_enable_kern(cfv->vr_rx);
+ break;
+
+ default:
+ /* We're doomed, any modem fault is fatal */
+ netdev_warn(cfv->ndev, "Bad ring, disable device\n");
+ cfv->ndev->stats.rx_dropped = riov->used - riov->i;
+ napi_complete(napi);
+ vringh_notify_disable_kern(cfv->vr_rx);
+ netif_carrier_off(cfv->ndev);
+ break;
+ }
+out:
+ if (rxcnt && vringh_need_notify_kern(cfv->vr_rx) > 0)
+ vringh_notify(cfv->vr_rx);
+ return rxcnt;
+}
+
+static void cfv_recv(struct virtio_device *vdev, struct vringh *vr_rx)
+{
+ struct cfv_info *cfv = vdev->priv;
+
+ ++cfv->stats.rx_kicks;
+ vringh_notify_disable_kern(cfv->vr_rx);
+ napi_schedule(&cfv->napi);
+}
+
+static void cfv_destroy_genpool(struct cfv_info *cfv)
+{
+ if (cfv->alloc_addr)
+ dma_free_coherent(cfv->vdev->dev.parent->parent,
+ cfv->allocsz, cfv->alloc_addr,
+ cfv->alloc_dma);
+
+ if (!cfv->genpool)
+ return;
+ gen_pool_free(cfv->genpool, cfv->reserved_mem,
+ cfv->reserved_size);
+ gen_pool_destroy(cfv->genpool);
+ cfv->genpool = NULL;
+}
+
+static int cfv_create_genpool(struct cfv_info *cfv)
+{
+ int err;
+
+ /* dma_alloc can only allocate whole pages, and we need a finer
+ * grained allocation, so we use genpool. We ask for the space needed
+ * by IP and a full ring. If the dma allocation fails, we retry with a
+ * smaller allocation size.
+ */
+ err = -ENOMEM;
+ cfv->allocsz = (virtqueue_get_vring_size(cfv->vq_tx) *
+ (ETH_DATA_LEN + cfv->tx_hr + cfv->tx_tr) * 11)/10;
+ if (cfv->allocsz <= (num_possible_cpus() + 1) * cfv->ndev->mtu)
+ return -EINVAL;
+
+ for (;;) {
+ if (cfv->allocsz <= num_possible_cpus() * cfv->ndev->mtu) {
+ netdev_info(cfv->ndev, "Not enough device memory\n");
+ return -ENOMEM;
+ }
+
+ cfv->alloc_addr = dma_alloc_coherent(
+ cfv->vdev->dev.parent->parent,
+ cfv->allocsz, &cfv->alloc_dma,
+ GFP_ATOMIC);
+ if (cfv->alloc_addr)
+ break;
+
+ cfv->allocsz = (cfv->allocsz * 3) >> 2;
+ }
+
+ netdev_dbg(cfv->ndev, "Allocated %zd bytes from dma-memory\n",
+ cfv->allocsz);
+
+ /* Allocate on 128-byte boundaries (1 << 7) */
+ cfv->genpool = gen_pool_create(7, -1);
+ if (!cfv->genpool)
+ goto err;
+
+ err = gen_pool_add_virt(cfv->genpool, (unsigned long)cfv->alloc_addr,
+ (phys_addr_t)virt_to_phys(cfv->alloc_addr),
+ cfv->allocsz, -1);
+ if (err)
+ goto err;
+
+ /* Reserve some memory for low memory situations. If we hit the roof
+ * in the memory pool, we stop TX flow and release the reserve.
+ */
+ cfv->reserved_size = num_possible_cpus() * cfv->ndev->mtu;
+ cfv->reserved_mem = gen_pool_alloc(cfv->genpool,
+ cfv->reserved_size);
+ if (!cfv->reserved_mem) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ cfv->watermark_tx = virtqueue_get_vring_size(cfv->vq_tx);
+ return 0;
+err:
+ cfv_destroy_genpool(cfv);
+ return err;
+}
+
+/* Enable the CAIF interface and allocate the memory-pool */
+static int cfv_netdev_open(struct net_device *netdev)
+{
+ struct cfv_info *cfv = netdev_priv(netdev);
+
+ if (cfv_create_genpool(cfv))
+ return -ENOMEM;
+
+ netif_carrier_on(netdev);
+ napi_enable(&cfv->napi);
+
+ /* Schedule NAPI to read any pending packets */
+ napi_schedule(&cfv->napi);
+ return 0;
+}
+
+/* Disable the CAIF interface and free the memory-pool */
+static int cfv_netdev_close(struct net_device *netdev)
+{
+ struct cfv_info *cfv = netdev_priv(netdev);
+ unsigned long flags;
+ struct buf_info *buf_info;
+
+ /* Disable interrupts, queues and NAPI polling */
+ netif_carrier_off(netdev);
+ virtqueue_disable_cb(cfv->vq_tx);
+ vringh_notify_disable_kern(cfv->vr_rx);
+ napi_disable(&cfv->napi);
+
+ /* Release any TX buffers on both used and available rings */
+ cfv_release_used_buf(cfv->vq_tx);
+ spin_lock_irqsave(&cfv->tx_lock, flags);
+ while ((buf_info = virtqueue_detach_unused_buf(cfv->vq_tx)))
+ free_buf_info(cfv, buf_info);
+ spin_unlock_irqrestore(&cfv->tx_lock, flags);
+
+ /* Release all dma allocated memory and destroy the pool */
+ cfv_destroy_genpool(cfv);
+ return 0;
+}
+
+/* Allocate a buffer in dma-memory and copy skb to it */
+static struct buf_info *cfv_alloc_and_copy_to_shm(struct cfv_info *cfv,
+ struct sk_buff *skb,
+ struct scatterlist *sg)
+{
+ struct caif_payload_info *info = (void *)&skb->cb;
+ struct buf_info *buf_info = NULL;
+ u8 pad_len, hdr_ofs;
+
+ if (!cfv->genpool)
+ goto err;
+
+ if (unlikely(cfv->tx_hr + skb->len + cfv->tx_tr > cfv->mtu)) {
+ netdev_warn(cfv->ndev, "Invalid packet len (%d > %d)\n",
+ cfv->tx_hr + skb->len + cfv->tx_tr, cfv->mtu);
+ goto err;
+ }
+
+ buf_info = kmalloc(sizeof(struct buf_info), GFP_ATOMIC);
+ if (unlikely(!buf_info))
+ goto err;
+
+ /* Make the IP header aligned in the buffer */
+ hdr_ofs = cfv->tx_hr + info->hdr_len;
+ pad_len = hdr_ofs & (IP_HDR_ALIGN - 1);
+ buf_info->size = cfv->tx_hr + skb->len + cfv->tx_tr + pad_len;
+
+ /* allocate dma memory buffer */
+ buf_info->vaddr = (void *)gen_pool_alloc(cfv->genpool, buf_info->size);
+ if (unlikely(!buf_info->vaddr))
+ goto err;
+
+ /* copy skbuf contents to send buffer */
+ skb_copy_bits(skb, 0, buf_info->vaddr + cfv->tx_hr + pad_len, skb->len);
+ sg_init_one(sg, buf_info->vaddr + pad_len,
+ skb->len + cfv->tx_hr + cfv->rx_hr);
+
+ return buf_info;
+err:
+ kfree(buf_info);
+ return NULL;
+}
+
+/* Put the CAIF packet on the virtio ring and kick the receiver */
+static int cfv_netdev_tx(struct sk_buff *skb, struct net_device *netdev)
+{
+ struct cfv_info *cfv = netdev_priv(netdev);
+ struct buf_info *buf_info;
+ struct scatterlist sg;
+ unsigned long flags;
+ bool flow_off = false;
+ int ret;
+
+ /* garbage collect released buffers */
+ cfv_release_used_buf(cfv->vq_tx);
+ spin_lock_irqsave(&cfv->tx_lock, flags);
+
+ /* The flow-off check takes the number of CPUs into account to make
+ * sure the virtqueue cannot be overfilled under any possible SMP
+ * conditions.
+ *
+ * Flow-on is triggered when sufficient buffers have been freed.
+ */
+ if (unlikely(cfv->vq_tx->num_free <= num_present_cpus())) {
+ flow_off = true;
+ cfv->stats.tx_full_ring++;
+ }
+
+ /* If we run out of memory, we release the memory reserve and retry
+ * allocation.
+ */
+ buf_info = cfv_alloc_and_copy_to_shm(cfv, skb, &sg);
+ if (unlikely(!buf_info)) {
+ cfv->stats.tx_no_mem++;
+ flow_off = true;
+
+ if (cfv->reserved_mem && cfv->genpool) {
+ gen_pool_free(cfv->genpool, cfv->reserved_mem,
+ cfv->reserved_size);
+ cfv->reserved_mem = 0;
+ buf_info = cfv_alloc_and_copy_to_shm(cfv, skb, &sg);
+ }
+ }
+
+ if (unlikely(flow_off)) {
+ /* Turn flow on when 1/4 of the descriptors are released */
+ cfv->watermark_tx = virtqueue_get_vring_size(cfv->vq_tx) / 4;
+ /* Enable notifications of recycled TX buffers */
+ virtqueue_enable_cb(cfv->vq_tx);
+ netif_tx_stop_all_queues(netdev);
+ }
+
+ if (unlikely(!buf_info)) {
+ /* If the memory reserve does its job, this shouldn't happen */
+ netdev_warn(cfv->ndev, "Out of gen_pool memory\n");
+ goto err;
+ }
+
+ ret = virtqueue_add_outbuf(cfv->vq_tx, &sg, 1, buf_info, GFP_ATOMIC);
+ if (unlikely((ret < 0))) {
+ /* If flow control works, this shouldn't happen */
+ netdev_warn(cfv->ndev, "Failed adding buffer to TX vring:%d\n",
+ ret);
+ goto err;
+ }
+
+ /* update netdev statistics */
+ cfv->ndev->stats.tx_packets++;
+ cfv->ndev->stats.tx_bytes += skb->len;
+ spin_unlock_irqrestore(&cfv->tx_lock, flags);
+
+ /* tell the remote processor it has a pending message to read */
+ virtqueue_kick(cfv->vq_tx);
+
+ dev_kfree_skb(skb);
+ return NETDEV_TX_OK;
+err:
+ spin_unlock_irqrestore(&cfv->tx_lock, flags);
+ cfv->ndev->stats.tx_dropped++;
+ free_buf_info(cfv, buf_info);
+ dev_kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+static void cfv_tx_release_tasklet(unsigned long drv)
+{
+ struct cfv_info *cfv = (struct cfv_info *)drv;
+ cfv_release_used_buf(cfv->vq_tx);
+}
+
+static const struct net_device_ops cfv_netdev_ops = {
+ .ndo_open = cfv_netdev_open,
+ .ndo_stop = cfv_netdev_close,
+ .ndo_start_xmit = cfv_netdev_tx,
+};
+
+static void cfv_netdev_setup(struct net_device *netdev)
+{
+ netdev->netdev_ops = &cfv_netdev_ops;
+ netdev->type = ARPHRD_CAIF;
+ netdev->tx_queue_len = 100;
+ netdev->flags = IFF_POINTOPOINT | IFF_NOARP;
+ netdev->mtu = CFV_DEF_MTU_SIZE;
+ netdev->destructor = free_netdev;
+}
+
+/* Create debugfs counters for the device */
+static inline void debugfs_init(struct cfv_info *cfv)
+{
+ cfv->debugfs =
+ debugfs_create_dir(netdev_name(cfv->ndev), NULL);
+
+ if (IS_ERR(cfv->debugfs))
+ return;
+
+ debugfs_create_u32("rx-napi-complete", S_IRUSR, cfv->debugfs,
+ &cfv->stats.rx_napi_complete);
+ debugfs_create_u32("rx-napi-resched", S_IRUSR, cfv->debugfs,
+ &cfv->stats.rx_napi_resched);
+ debugfs_create_u32("rx-nomem", S_IRUSR, cfv->debugfs,
+ &cfv->stats.rx_nomem);
+ debugfs_create_u32("rx-kicks", S_IRUSR, cfv->debugfs,
+ &cfv->stats.rx_kicks);
+ debugfs_create_u32("tx-full-ring", S_IRUSR, cfv->debugfs,
+ &cfv->stats.tx_full_ring);
+ debugfs_create_u32("tx-no-mem", S_IRUSR, cfv->debugfs,
+ &cfv->stats.tx_no_mem);
+ debugfs_create_u32("tx-kicks", S_IRUSR, cfv->debugfs,
+ &cfv->stats.tx_kicks);
+ debugfs_create_u32("tx-flow-on", S_IRUSR, cfv->debugfs,
+ &cfv->stats.tx_flow_on);
+}
+
+/* Set up CAIF for a virtio device */
+static int cfv_probe(struct virtio_device *vdev)
+{
+ vq_callback_t *vq_cbs = cfv_release_cb;
+ vrh_callback_t *vrh_cbs = cfv_recv;
+ const char *names = "output";
+ const char *cfv_netdev_name = "cfvrt";
+ struct net_device *netdev;
+ struct cfv_info *cfv;
+ int err = -EINVAL;
+
+ netdev = alloc_netdev(sizeof(struct cfv_info), cfv_netdev_name,
+ cfv_netdev_setup);
+ if (!netdev)
+ return -ENOMEM;
+
+ cfv = netdev_priv(netdev);
+ cfv->vdev = vdev;
+ cfv->ndev = netdev;
+
+ spin_lock_init(&cfv->tx_lock);
+
+ /* Get the RX virtio ring. This is a "host side vring". */
+ err = -ENODEV;
+ if (!vdev->vringh_config || !vdev->vringh_config->find_vrhs)
+ goto err;
+
+ err = vdev->vringh_config->find_vrhs(vdev, 1, &cfv->vr_rx, &vrh_cbs);
+ if (err)
+ goto err;
+
+ /* Get the TX virtio ring. This is a "guest side vring". */
+ err = vdev->config->find_vqs(vdev, 1, &cfv->vq_tx, &vq_cbs, &names);
+ if (err)
+ goto err;
+
+ /* Get the CAIF configuration from virtio config space, if available */
+#define GET_VIRTIO_CONFIG_OPS(_v, _var, _f) \
+ ((_v)->config->get(_v, offsetof(struct virtio_caif_transf_config, _f), \
+ &_var, \
+ FIELD_SIZEOF(struct virtio_caif_transf_config, _f)))
+
+ if (vdev->config->get) {
+ GET_VIRTIO_CONFIG_OPS(vdev, cfv->tx_hr, headroom);
+ GET_VIRTIO_CONFIG_OPS(vdev, cfv->rx_hr, headroom);
+ GET_VIRTIO_CONFIG_OPS(vdev, cfv->tx_tr, tailroom);
+ GET_VIRTIO_CONFIG_OPS(vdev, cfv->rx_tr, tailroom);
+ GET_VIRTIO_CONFIG_OPS(vdev, cfv->mtu, mtu);
+ GET_VIRTIO_CONFIG_OPS(vdev, cfv->mru, mtu);
+ } else {
+ cfv->tx_hr = CFV_DEF_HEADROOM;
+ cfv->rx_hr = CFV_DEF_HEADROOM;
+ cfv->tx_tr = CFV_DEF_TAILROOM;
+ cfv->rx_tr = CFV_DEF_TAILROOM;
+ cfv->mtu = CFV_DEF_MTU_SIZE;
+ cfv->mru = CFV_DEF_MTU_SIZE;
+ }
+
+ netdev->needed_headroom = cfv->tx_hr;
+ netdev->needed_tailroom = cfv->tx_tr;
+
+ /* Disable buffer release interrupts unless we have stopped TX queues */
+ virtqueue_disable_cb(cfv->vq_tx);
+
+ netdev->mtu = cfv->mtu - cfv->tx_tr;
+ vdev->priv = cfv;
+
+ /* Initialize NAPI poll context data */
+ vringh_kiov_init(&cfv->ctx.riov, NULL, 0);
+ cfv->ctx.head = USHRT_MAX;
+ netif_napi_add(netdev, &cfv->napi, cfv_rx_poll, CFV_DEFAULT_QUOTA);
+
+ tasklet_init(&cfv->tx_release_tasklet,
+ cfv_tx_release_tasklet,
+ (unsigned long)cfv);
+
+ /* Carrier is off until netdevice is opened */
+ netif_carrier_off(netdev);
+
+ /* register Netdev */
+ err = register_netdev(netdev);
+ if (err) {
+ dev_err(&vdev->dev, "Unable to register netdev (%d)\n", err);
+ goto err;
+ }
+
+ debugfs_init(cfv);
+
+ return 0;
+err:
+ netdev_warn(cfv->ndev, "CAIF Virtio probe failed:%d\n", err);
+
+ if (cfv->vr_rx)
+ vdev->vringh_config->del_vrhs(cfv->vdev);
+ if (cfv->vdev)
+ vdev->config->del_vqs(cfv->vdev);
+ free_netdev(netdev);
+ return err;
+}
+
+static void cfv_remove(struct virtio_device *vdev)
+{
+ struct cfv_info *cfv = vdev->priv;
+
+ rtnl_lock();
+ dev_close(cfv->ndev);
+ rtnl_unlock();
+
+ tasklet_kill(&cfv->tx_release_tasklet);
+ debugfs_remove_recursive(cfv->debugfs);
+
+ vringh_kiov_cleanup(&cfv->ctx.riov);
+ vdev->config->reset(vdev);
+ vdev->vringh_config->del_vrhs(cfv->vdev);
+ cfv->vr_rx = NULL;
+ vdev->config->del_vqs(cfv->vdev);
+ unregister_netdev(cfv->ndev);
+}
+
+static struct virtio_device_id id_table[] = {
+ { VIRTIO_ID_CAIF, VIRTIO_DEV_ANY_ID },
+ { 0 },
+};
+
+static unsigned int features[] = {
+};
+
+static struct virtio_driver caif_virtio_driver = {
+ .feature_table = features,
+ .feature_table_size = ARRAY_SIZE(features),
+ .driver.name = KBUILD_MODNAME,
+ .driver.owner = THIS_MODULE,
+ .id_table = id_table,
+ .probe = cfv_probe,
+ .remove = cfv_remove,
+};
+
+module_virtio_driver(caif_virtio_driver);
+MODULE_DEVICE_TABLE(virtio, id_table);
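
cfv_create_genpool() above carves a single dma_alloc_coherent() region into 128-byte granules with genalloc and sets aside an emergency reserve that cfv_netdev_tx() hands back when an allocation fails under pressure. A minimal sketch of that pattern, not taken from the patch; the demo_* names are hypothetical and error handling is trimmed to the essentials:

#include <linux/dma-mapping.h>
#include <linux/genalloc.h>
#include <linux/io.h>

struct demo_pool {
	struct gen_pool *pool;
	void *cpu_addr;
	dma_addr_t dma_addr;
	size_t size;
	unsigned long reserve;
	size_t reserve_size;
};

static int demo_pool_create(struct device *dev, struct demo_pool *p,
			    size_t size, size_t reserve_size)
{
	p->size = size;
	p->reserve_size = reserve_size;

	p->cpu_addr = dma_alloc_coherent(dev, size, &p->dma_addr, GFP_KERNEL);
	if (!p->cpu_addr)
		return -ENOMEM;

	/* min_alloc_order = 7: every allocation rounds up to 128 bytes */
	p->pool = gen_pool_create(7, -1);
	if (!p->pool)
		goto free_dma;

	if (gen_pool_add_virt(p->pool, (unsigned long)p->cpu_addr,
			      virt_to_phys(p->cpu_addr), size, -1))
		goto free_pool;

	/* Keep a reserve back, released only when a TX allocation fails */
	p->reserve = gen_pool_alloc(p->pool, reserve_size);
	if (!p->reserve)
		goto free_pool;
	return 0;

free_pool:
	gen_pool_destroy(p->pool);
free_dma:
	dma_free_coherent(dev, size, p->cpu_addr, p->dma_addr);
	return -ENOMEM;
}

static void demo_pool_destroy(struct device *dev, struct demo_pool *p)
{
	if (p->reserve)
		gen_pool_free(p->pool, p->reserve, p->reserve_size);
	gen_pool_destroy(p->pool);
	dma_free_coherent(dev, p->size, p->cpu_addr, p->dma_addr);
}

As in the driver, everyday TX buffers come from gen_pool_alloc()/gen_pool_free() on the pool, while the reserve acts as the low-memory cushion that lets the TX queues be re-opened once it can be re-acquired.
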
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 50077753a0e5..3c23fdc27bf0 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -39,7 +39,6 @@ module_param(gso, bool, 0444);
#define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
#define GOOD_COPY_LEN 128
-#define VIRTNET_SEND_COMMAND_SG_MAX 2
#define VIRTNET_DRIVER_VERSION "1.0.0"
struct virtnet_stats {
@@ -444,7 +443,7 @@ static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp)
skb_to_sgvec(skb, rq->sg + 1, 0, skb->len);
- err = virtqueue_add_buf(rq->vq, rq->sg, 0, 2, skb, gfp);
+ err = virtqueue_add_inbuf(rq->vq, rq->sg, 2, skb, gfp);
if (err < 0)
dev_kfree_skb(skb);
@@ -489,8 +488,8 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
/* chain first in list head */
first->private = (unsigned long)list;
- err = virtqueue_add_buf(rq->vq, rq->sg, 0, MAX_SKB_FRAGS + 2,
- first, gfp);
+ err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
+ first, gfp);
if (err < 0)
give_pages(rq, first);
@@ -508,7 +507,7 @@ static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
sg_init_one(rq->sg, page_address(page), PAGE_SIZE);
- err = virtqueue_add_buf(rq->vq, rq->sg, 0, 1, page, gfp);
+ err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, page, gfp);
if (err < 0)
give_pages(rq, page);
@@ -582,7 +581,7 @@ static void refill_work(struct work_struct *work)
bool still_empty;
int i;
- for (i = 0; i < vi->max_queue_pairs; i++) {
+ for (i = 0; i < vi->curr_queue_pairs; i++) {
struct receive_queue *rq = &vi->rq[i];
napi_disable(&rq->napi);
@@ -637,7 +636,7 @@ static int virtnet_open(struct net_device *dev)
struct virtnet_info *vi = netdev_priv(dev);
int i;
- for (i = 0; i < vi->max_queue_pairs; i++) {
+ for (i = 0; i < vi->curr_queue_pairs; i++) {
/* Make sure we have some buffers: if oom use wq. */
if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
schedule_delayed_work(&vi->refill, 0);
@@ -711,8 +710,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
sg_set_buf(sq->sg, &hdr->hdr, sizeof hdr->hdr);
num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1;
- return virtqueue_add_buf(sq->vq, sq->sg, num_sg,
- 0, skb, GFP_ATOMIC);
+ return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
}
static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
@@ -767,32 +765,35 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
* never fail unless improperly formated.
*/
static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
- struct scatterlist *data, int out, int in)
+ struct scatterlist *out,
+ struct scatterlist *in)
{
- struct scatterlist *s, sg[VIRTNET_SEND_COMMAND_SG_MAX + 2];
+ struct scatterlist *sgs[4], hdr, stat;
struct virtio_net_ctrl_hdr ctrl;
virtio_net_ctrl_ack status = ~0;
- unsigned int tmp;
- int i;
+ unsigned out_num = 0, in_num = 0, tmp;
/* Caller should know better */
- BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) ||
- (out + in > VIRTNET_SEND_COMMAND_SG_MAX));
-
- out++; /* Add header */
- in++; /* Add return status */
+ BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
ctrl.class = class;
ctrl.cmd = cmd;
+ /* Add header */
+ sg_init_one(&hdr, &ctrl, sizeof(ctrl));
+ sgs[out_num++] = &hdr;
- sg_init_table(sg, out + in);
+ if (out)
+ sgs[out_num++] = out;
+ if (in)
+ sgs[out_num + in_num++] = in;
- sg_set_buf(&sg[0], &ctrl, sizeof(ctrl));
- for_each_sg(data, s, out + in - 2, i)
- sg_set_buf(&sg[i + 1], sg_virt(s), s->length);
- sg_set_buf(&sg[out + in - 1], &status, sizeof(status));
+ /* Add return status. */
+ sg_init_one(&stat, &status, sizeof(status));
+ sgs[out_num + in_num++] = &stat;
- BUG_ON(virtqueue_add_buf(vi->cvq, sg, out, in, vi, GFP_ATOMIC) < 0);
+ BUG_ON(out_num + in_num > ARRAY_SIZE(sgs));
+ BUG_ON(virtqueue_add_sgs(vi->cvq, sgs, out_num, in_num, vi, GFP_ATOMIC)
+ < 0);
virtqueue_kick(vi->cvq);
@@ -821,7 +822,7 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p)
sg_init_one(&sg, addr->sa_data, dev->addr_len);
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
VIRTIO_NET_CTRL_MAC_ADDR_SET,
- &sg, 1, 0)) {
+ &sg, NULL)) {
dev_warn(&vdev->dev,
"Failed to set mac address by vq command.\n");
return -EINVAL;
@@ -889,8 +890,7 @@ static void virtnet_ack_link_announce(struct virtnet_info *vi)
{
rtnl_lock();
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
- VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL,
- 0, 0))
+ VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL, NULL))
dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
rtnl_unlock();
}
@@ -900,6 +900,7 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
struct scatterlist sg;
struct virtio_net_ctrl_mq s;
struct net_device *dev = vi->dev;
+ int i;
if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
return 0;
@@ -908,12 +909,16 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
sg_init_one(&sg, &s, sizeof(s));
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
- VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg, 1, 0)){
+ VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg, NULL)) {
dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
queue_pairs);
return -EINVAL;
- } else
+ } else {
+ for (i = vi->curr_queue_pairs; i < queue_pairs; i++)
+ if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
+ schedule_delayed_work(&vi->refill, 0);
vi->curr_queue_pairs = queue_pairs;
+ }
return 0;
}
@@ -955,7 +960,7 @@ static void virtnet_set_rx_mode(struct net_device *dev)
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
VIRTIO_NET_CTRL_RX_PROMISC,
- sg, 1, 0))
+ sg, NULL))
dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
promisc ? "en" : "dis");
@@ -963,7 +968,7 @@ static void virtnet_set_rx_mode(struct net_device *dev)
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
VIRTIO_NET_CTRL_RX_ALLMULTI,
- sg, 1, 0))
+ sg, NULL))
dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
allmulti ? "en" : "dis");
@@ -1000,7 +1005,7 @@ static void virtnet_set_rx_mode(struct net_device *dev)
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
VIRTIO_NET_CTRL_MAC_TABLE_SET,
- sg, 2, 0))
+ sg, NULL))
dev_warn(&dev->dev, "Failed to set MAC fitler table.\n");
kfree(buf);
@@ -1015,7 +1020,7 @@ static int virtnet_vlan_rx_add_vid(struct net_device *dev,
sg_init_one(&sg, &vid, sizeof(vid));
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
- VIRTIO_NET_CTRL_VLAN_ADD, &sg, 1, 0))
+ VIRTIO_NET_CTRL_VLAN_ADD, &sg, NULL))
dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
return 0;
}
@@ -1029,7 +1034,7 @@ static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
sg_init_one(&sg, &vid, sizeof(vid));
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
- VIRTIO_NET_CTRL_VLAN_DEL, &sg, 1, 0))
+ VIRTIO_NET_CTRL_VLAN_DEL, &sg, NULL))
dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
return 0;
}
@@ -1570,7 +1575,7 @@ static int virtnet_probe(struct virtio_device *vdev)
}
/* Last of all, set up some receive buffers. */
- for (i = 0; i < vi->max_queue_pairs; i++) {
+ for (i = 0; i < vi->curr_queue_pairs; i++) {
try_fill_recv(&vi->rq[i], GFP_KERNEL);
/* If we didn't even get one input buffer, we're useless. */
@@ -1694,7 +1699,7 @@ static int virtnet_restore(struct virtio_device *vdev)
netif_device_attach(vi->dev);
- for (i = 0; i < vi->max_queue_pairs; i++)
+ for (i = 0; i < vi->curr_queue_pairs; i++)
if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
schedule_delayed_work(&vi->refill, 0);
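
The virtio_net.c hunks above replace virtqueue_add_buf(), which took one scatterlist holding the readable entries followed by the writable ones, with the virtqueue_add_outbuf()/virtqueue_add_inbuf()/virtqueue_add_sgs() helpers introduced in this series. A hedged sketch of the virtqueue_add_sgs() shape used by the reworked virtnet_send_command(); the demo_* names are hypothetical:

#include <linux/scatterlist.h>
#include <linux/virtio.h>

/* Queue a command: a header (plus optional payload) the device reads,
 * and a single ack byte the device writes back.
 */
static int demo_send_cmd(struct virtqueue *vq, void *hdr, size_t hdr_len,
			 struct scatterlist *payload, u8 *ack)
{
	struct scatterlist hdr_sg, ack_sg, *sgs[3];
	unsigned int out_num = 0, in_num = 0;
	int err;

	sg_init_one(&hdr_sg, hdr, hdr_len);
	sgs[out_num++] = &hdr_sg;		/* device-readable */

	if (payload)
		sgs[out_num++] = payload;	/* device-readable */

	sg_init_one(&ack_sg, ack, sizeof(*ack));
	sgs[out_num + in_num++] = &ack_sg;	/* device-writable */

	/* All readable sg lists must precede the writable ones in sgs[] */
	err = virtqueue_add_sgs(vq, sgs, out_num, in_num, hdr, GFP_ATOMIC);
	if (err < 0)
		return err;

	virtqueue_kick(vq);
	return 0;
}

The token passed as the fifth argument (hdr here) is what virtqueue_get_buf() later returns once the device has consumed the buffers, which is how callers reclaim the request.
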
diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/virtio_rpmsg_bus.c
index 7861f1119b7d..56fceafec9ec 100644
--- a/drivers/rpmsg/virtio_rpmsg_bus.c
+++ b/drivers/rpmsg/virtio_rpmsg_bus.c
@@ -757,14 +757,14 @@ int rpmsg_send_offchannel_raw(struct rpmsg_channel *rpdev, u32 src, u32 dst,
mutex_lock(&vrp->tx_lock);
/* add message to the remote processor's virtqueue */
- err = virtqueue_add_buf(vrp->svq, &sg, 1, 0, msg, GFP_KERNEL);
+ err = virtqueue_add_outbuf(vrp->svq, &sg, 1, msg, GFP_KERNEL);
if (err) {
/*
* need to reclaim the buffer here, otherwise it's lost
* (memory won't leak, but rpmsg won't use it again for TX).
* this will wait for a buffer management overhaul.
*/
- dev_err(dev, "virtqueue_add_buf failed: %d\n", err);
+ dev_err(dev, "virtqueue_add_outbuf failed: %d\n", err);
goto out;
}
@@ -839,7 +839,7 @@ static void rpmsg_recv_done(struct virtqueue *rvq)
sg_init_one(&sg, msg, RPMSG_BUF_SIZE);
/* add the buffer back to the remote processor's virtqueue */
- err = virtqueue_add_buf(vrp->rvq, &sg, 0, 1, msg, GFP_KERNEL);
+ err = virtqueue_add_inbuf(vrp->rvq, &sg, 1, msg, GFP_KERNEL);
if (err < 0) {
dev_err(dev, "failed to add a virtqueue buffer: %d\n", err);
return;
@@ -972,7 +972,7 @@ static int rpmsg_probe(struct virtio_device *vdev)
sg_init_one(&sg, cpu_addr, RPMSG_BUF_SIZE);
- err = virtqueue_add_buf(vrp->rvq, &sg, 0, 1, cpu_addr,
+ err = virtqueue_add_inbuf(vrp->rvq, &sg, 1, cpu_addr,
GFP_KERNEL);
WARN_ON(err); /* sanity check; this can't really happen */
}
diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 3449a1f8c656..2168258fb2c3 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -13,6 +13,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/mempool.h>
@@ -20,12 +22,14 @@
#include <linux/virtio_ids.h>
#include <linux/virtio_config.h>
#include <linux/virtio_scsi.h>
+#include <linux/cpu.h>
#include <scsi/scsi_host.h>
#include <scsi/scsi_device.h>
#include <scsi/scsi_cmnd.h>
#define VIRTIO_SCSI_MEMPOOL_SZ 64
#define VIRTIO_SCSI_EVENT_LEN 8
+#define VIRTIO_SCSI_VQ_BASE 2
/* Command queue element */
struct virtio_scsi_cmd {
@@ -57,27 +61,61 @@ struct virtio_scsi_vq {
struct virtqueue *vq;
};
-/* Per-target queue state */
+/*
+ * Per-target queue state.
+ *
+ * This struct holds the data needed by the queue steering policy. When a
+ * target is sent multiple requests, we need to drive them to the same queue so
+ * that FIFO processing order is kept. However, if a target was idle, we can
+ * choose a queue arbitrarily. In this case the queue is chosen according to
+ * the current VCPU, so the driver expects the number of request queues to be
+ * equal to the number of VCPUs. This makes it easy and fast to select the
+ * queue, and also lets the driver optimize the IRQ affinity for the virtqueues
+ * (each virtqueue's affinity is set to the CPU that "owns" the queue).
+ *
+ * An interesting effect of this policy is that only writes to req_vq need to
+ * take the tgt_lock. Reads can be done outside the lock because:
+ *
+ * - writes of req_vq only occur when atomic_inc_return(&tgt->reqs) returns 1.
+ * In that case, no other CPU is reading req_vq: even if they were in
+ * virtscsi_queuecommand_multi, they would be spinning on tgt_lock.
+ *
+ * - reads of req_vq only occur when the target is not idle (reqs != 0).
+ * A CPU that enters virtscsi_queuecommand_multi will not modify req_vq.
+ *
+ * Similarly, decrements of reqs are never concurrent with writes of req_vq.
+ * Thus they can happen outside the tgt_lock, provided of course we make reqs
+ * an atomic_t.
+ */
struct virtio_scsi_target_state {
- /* Protects sg. Lock hierarchy is tgt_lock -> vq_lock. */
+ /* This spinlock is never held at the same time as vq_lock. */
spinlock_t tgt_lock;
- /* For sglist construction when adding commands to the virtqueue. */
- struct scatterlist sg[];
+ /* Count of outstanding requests. */
+ atomic_t reqs;
+
+ /* Currently active virtqueue for requests sent to this target. */
+ struct virtio_scsi_vq *req_vq;
};
/* Driver instance state */
struct virtio_scsi {
struct virtio_device *vdev;
- struct virtio_scsi_vq ctrl_vq;
- struct virtio_scsi_vq event_vq;
- struct virtio_scsi_vq req_vq;
-
/* Get some buffers ready for event vq */
struct virtio_scsi_event_node event_list[VIRTIO_SCSI_EVENT_LEN];
- struct virtio_scsi_target_state *tgt[];
+ u32 num_queues;
+
+ /* If the affinity hint is set for virtqueues */
+ bool affinity_hint_set;
+
+ /* CPU hotplug notifier */
+ struct notifier_block nb;
+
+ struct virtio_scsi_vq ctrl_vq;
+ struct virtio_scsi_vq event_vq;
+ struct virtio_scsi_vq req_vqs[];
};
static struct kmem_cache *virtscsi_cmd_cache;
@@ -107,11 +145,13 @@ static void virtscsi_compute_resid(struct scsi_cmnd *sc, u32 resid)
*
* Called with vq_lock held.
*/
-static void virtscsi_complete_cmd(void *buf)
+static void virtscsi_complete_cmd(struct virtio_scsi *vscsi, void *buf)
{
struct virtio_scsi_cmd *cmd = buf;
struct scsi_cmnd *sc = cmd->sc;
struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd;
+ struct virtio_scsi_target_state *tgt =
+ scsi_target(sc->device)->hostdata;
dev_dbg(&sc->device->sdev_gendev,
"cmd %p response %u status %#02x sense_len %u\n",
@@ -166,32 +206,71 @@ static void virtscsi_complete_cmd(void *buf)
mempool_free(cmd, virtscsi_cmd_pool);
sc->scsi_done(sc);
+
+ atomic_dec(&tgt->reqs);
}
-static void virtscsi_vq_done(struct virtqueue *vq, void (*fn)(void *buf))
+static void virtscsi_vq_done(struct virtio_scsi *vscsi,
+ struct virtio_scsi_vq *virtscsi_vq,
+ void (*fn)(struct virtio_scsi *vscsi, void *buf))
{
void *buf;
unsigned int len;
+ unsigned long flags;
+ struct virtqueue *vq = virtscsi_vq->vq;
+ spin_lock_irqsave(&virtscsi_vq->vq_lock, flags);
do {
virtqueue_disable_cb(vq);
while ((buf = virtqueue_get_buf(vq, &len)) != NULL)
- fn(buf);
+ fn(vscsi, buf);
} while (!virtqueue_enable_cb(vq));
+ spin_unlock_irqrestore(&virtscsi_vq->vq_lock, flags);
}
static void virtscsi_req_done(struct virtqueue *vq)
{
struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
struct virtio_scsi *vscsi = shost_priv(sh);
- unsigned long flags;
+ int index = vq->index - VIRTIO_SCSI_VQ_BASE;
+ struct virtio_scsi_vq *req_vq = &vscsi->req_vqs[index];
- spin_lock_irqsave(&vscsi->req_vq.vq_lock, flags);
- virtscsi_vq_done(vq, virtscsi_complete_cmd);
- spin_unlock_irqrestore(&vscsi->req_vq.vq_lock, flags);
+ /*
+ * Read req_vq before decrementing the reqs field in
+ * virtscsi_complete_cmd.
+ *
+ * With barriers:
+ *
+ * CPU #0 virtscsi_queuecommand_multi (CPU #1)
+ * ------------------------------------------------------------
+ * lock vq_lock
+ * read req_vq
+ * read reqs (reqs = 1)
+ * write reqs (reqs = 0)
+ * increment reqs (reqs = 1)
+ * write req_vq
+ *
+ * Possible reordering without barriers:
+ *
+ * CPU #0 virtscsi_queuecommand_multi (CPU #1)
+ * ------------------------------------------------------------
+ * lock vq_lock
+ * read reqs (reqs = 1)
+ * write reqs (reqs = 0)
+ * increment reqs (reqs = 1)
+ * write req_vq
+ * read (wrong) req_vq
+ *
+ * We do not need a full smp_rmb, because req_vq is required to get
+ * to tgt->reqs: tgt is &vscsi->tgt[sc->device->id], where sc is stored
+ * in the virtqueue as the user token.
+ */
+ smp_read_barrier_depends();
+
+ virtscsi_vq_done(vscsi, req_vq, virtscsi_complete_cmd);
};
-static void virtscsi_complete_free(void *buf)
+static void virtscsi_complete_free(struct virtio_scsi *vscsi, void *buf)
{
struct virtio_scsi_cmd *cmd = buf;
@@ -205,11 +284,8 @@ static void virtscsi_ctrl_done(struct virtqueue *vq)
{
struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
struct virtio_scsi *vscsi = shost_priv(sh);
- unsigned long flags;
- spin_lock_irqsave(&vscsi->ctrl_vq.vq_lock, flags);
- virtscsi_vq_done(vq, virtscsi_complete_free);
- spin_unlock_irqrestore(&vscsi->ctrl_vq.vq_lock, flags);
+ virtscsi_vq_done(vscsi, &vscsi->ctrl_vq, virtscsi_complete_free);
};
static int virtscsi_kick_event(struct virtio_scsi *vscsi,
@@ -223,8 +299,8 @@ static int virtscsi_kick_event(struct virtio_scsi *vscsi,
spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags);
- err = virtqueue_add_buf(vscsi->event_vq.vq, &sg, 0, 1, event_node,
- GFP_ATOMIC);
+ err = virtqueue_add_inbuf(vscsi->event_vq.vq, &sg, 1, event_node,
+ GFP_ATOMIC);
if (!err)
virtqueue_kick(vscsi->event_vq.vq);
@@ -254,7 +330,7 @@ static void virtscsi_cancel_event_work(struct virtio_scsi *vscsi)
}
static void virtscsi_handle_transport_reset(struct virtio_scsi *vscsi,
- struct virtio_scsi_event *event)
+ struct virtio_scsi_event *event)
{
struct scsi_device *sdev;
struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev);
@@ -332,7 +408,7 @@ static void virtscsi_handle_event(struct work_struct *work)
virtscsi_kick_event(vscsi, event_node);
}
-static void virtscsi_complete_event(void *buf)
+static void virtscsi_complete_event(struct virtio_scsi *vscsi, void *buf)
{
struct virtio_scsi_event_node *event_node = buf;
@@ -344,82 +420,65 @@ static void virtscsi_event_done(struct virtqueue *vq)
{
struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
struct virtio_scsi *vscsi = shost_priv(sh);
- unsigned long flags;
- spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags);
- virtscsi_vq_done(vq, virtscsi_complete_event);
- spin_unlock_irqrestore(&vscsi->event_vq.vq_lock, flags);
+ virtscsi_vq_done(vscsi, &vscsi->event_vq, virtscsi_complete_event);
};
-static void virtscsi_map_sgl(struct scatterlist *sg, unsigned int *p_idx,
- struct scsi_data_buffer *sdb)
-{
- struct sg_table *table = &sdb->table;
- struct scatterlist *sg_elem;
- unsigned int idx = *p_idx;
- int i;
-
- for_each_sg(table->sgl, sg_elem, table->nents, i)
- sg[idx++] = *sg_elem;
-
- *p_idx = idx;
-}
-
/**
- * virtscsi_map_cmd - map a scsi_cmd to a virtqueue scatterlist
- * @vscsi : virtio_scsi state
+ * virtscsi_add_cmd - add a virtio_scsi_cmd to a virtqueue
+ * @vq : the struct virtqueue we're talking about
* @cmd : command structure
- * @out_num : number of read-only elements
- * @in_num : number of write-only elements
* @req_size : size of the request buffer
* @resp_size : size of the response buffer
- *
- * Called with tgt_lock held.
+ * @gfp : flags to use for memory allocations
*/
-static void virtscsi_map_cmd(struct virtio_scsi_target_state *tgt,
- struct virtio_scsi_cmd *cmd,
- unsigned *out_num, unsigned *in_num,
- size_t req_size, size_t resp_size)
+static int virtscsi_add_cmd(struct virtqueue *vq,
+ struct virtio_scsi_cmd *cmd,
+ size_t req_size, size_t resp_size, gfp_t gfp)
{
struct scsi_cmnd *sc = cmd->sc;
- struct scatterlist *sg = tgt->sg;
- unsigned int idx = 0;
+ struct scatterlist *sgs[4], req, resp;
+ struct sg_table *out, *in;
+ unsigned out_num = 0, in_num = 0;
+
+ out = in = NULL;
+
+ if (sc && sc->sc_data_direction != DMA_NONE) {
+ if (sc->sc_data_direction != DMA_FROM_DEVICE)
+ out = &scsi_out(sc)->table;
+ if (sc->sc_data_direction != DMA_TO_DEVICE)
+ in = &scsi_in(sc)->table;
+ }
/* Request header. */
- sg_set_buf(&sg[idx++], &cmd->req, req_size);
+ sg_init_one(&req, &cmd->req, req_size);
+ sgs[out_num++] = &req;
/* Data-out buffer. */
- if (sc && sc->sc_data_direction != DMA_FROM_DEVICE)
- virtscsi_map_sgl(sg, &idx, scsi_out(sc));
-
- *out_num = idx;
+ if (out)
+ sgs[out_num++] = out->sgl;
/* Response header. */
- sg_set_buf(&sg[idx++], &cmd->resp, resp_size);
+ sg_init_one(&resp, &cmd->resp, resp_size);
+ sgs[out_num + in_num++] = &resp;
/* Data-in buffer */
- if (sc && sc->sc_data_direction != DMA_TO_DEVICE)
- virtscsi_map_sgl(sg, &idx, scsi_in(sc));
+ if (in)
+ sgs[out_num + in_num++] = in->sgl;
- *in_num = idx - *out_num;
+ return virtqueue_add_sgs(vq, sgs, out_num, in_num, cmd, gfp);
}
-static int virtscsi_kick_cmd(struct virtio_scsi_target_state *tgt,
- struct virtio_scsi_vq *vq,
+static int virtscsi_kick_cmd(struct virtio_scsi_vq *vq,
struct virtio_scsi_cmd *cmd,
size_t req_size, size_t resp_size, gfp_t gfp)
{
- unsigned int out_num, in_num;
unsigned long flags;
int err;
bool needs_kick = false;
- spin_lock_irqsave(&tgt->tgt_lock, flags);
- virtscsi_map_cmd(tgt, cmd, &out_num, &in_num, req_size, resp_size);
-
- spin_lock(&vq->vq_lock);
- err = virtqueue_add_buf(vq->vq, tgt->sg, out_num, in_num, cmd, gfp);
- spin_unlock(&tgt->tgt_lock);
+ spin_lock_irqsave(&vq->vq_lock, flags);
+ err = virtscsi_add_cmd(vq->vq, cmd, req_size, resp_size, gfp);
if (!err)
needs_kick = virtqueue_kick_prepare(vq->vq);
@@ -430,10 +489,10 @@ static int virtscsi_kick_cmd(struct virtio_scsi_target_state *tgt,
return err;
}
-static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc)
+static int virtscsi_queuecommand(struct virtio_scsi *vscsi,
+ struct virtio_scsi_vq *req_vq,
+ struct scsi_cmnd *sc)
{
- struct virtio_scsi *vscsi = shost_priv(sh);
- struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id];
struct virtio_scsi_cmd *cmd;
int ret;
@@ -467,7 +526,7 @@ static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc)
BUG_ON(sc->cmd_len > VIRTIO_SCSI_CDB_SIZE);
memcpy(cmd->req.cmd.cdb, sc->cmnd, sc->cmd_len);
- if (virtscsi_kick_cmd(tgt, &vscsi->req_vq, cmd,
+ if (virtscsi_kick_cmd(req_vq, cmd,
sizeof cmd->req.cmd, sizeof cmd->resp.cmd,
GFP_ATOMIC) == 0)
ret = 0;
@@ -478,14 +537,62 @@ out:
return ret;
}
+static int virtscsi_queuecommand_single(struct Scsi_Host *sh,
+ struct scsi_cmnd *sc)
+{
+ struct virtio_scsi *vscsi = shost_priv(sh);
+ struct virtio_scsi_target_state *tgt =
+ scsi_target(sc->device)->hostdata;
+
+ atomic_inc(&tgt->reqs);
+ return virtscsi_queuecommand(vscsi, &vscsi->req_vqs[0], sc);
+}
+
+static struct virtio_scsi_vq *virtscsi_pick_vq(struct virtio_scsi *vscsi,
+ struct virtio_scsi_target_state *tgt)
+{
+ struct virtio_scsi_vq *vq;
+ unsigned long flags;
+ u32 queue_num;
+
+ spin_lock_irqsave(&tgt->tgt_lock, flags);
+
+ /*
+ * The memory barrier after atomic_inc_return matches
+ * the smp_read_barrier_depends() in virtscsi_req_done.
+ */
+ if (atomic_inc_return(&tgt->reqs) > 1)
+ vq = ACCESS_ONCE(tgt->req_vq);
+ else {
+ queue_num = smp_processor_id();
+ while (unlikely(queue_num >= vscsi->num_queues))
+ queue_num -= vscsi->num_queues;
+
+ tgt->req_vq = vq = &vscsi->req_vqs[queue_num];
+ }
+
+ spin_unlock_irqrestore(&tgt->tgt_lock, flags);
+ return vq;
+}
+
+static int virtscsi_queuecommand_multi(struct Scsi_Host *sh,
+ struct scsi_cmnd *sc)
+{
+ struct virtio_scsi *vscsi = shost_priv(sh);
+ struct virtio_scsi_target_state *tgt =
+ scsi_target(sc->device)->hostdata;
+ struct virtio_scsi_vq *req_vq = virtscsi_pick_vq(vscsi, tgt);
+
+ return virtscsi_queuecommand(vscsi, req_vq, sc);
+}
+
static int virtscsi_tmf(struct virtio_scsi *vscsi, struct virtio_scsi_cmd *cmd)
{
DECLARE_COMPLETION_ONSTACK(comp);
- struct virtio_scsi_target_state *tgt = vscsi->tgt[cmd->sc->device->id];
int ret = FAILED;
cmd->comp = &comp;
- if (virtscsi_kick_cmd(tgt, &vscsi->ctrl_vq, cmd,
+ if (virtscsi_kick_cmd(&vscsi->ctrl_vq, cmd,
sizeof cmd->req.tmf, sizeof cmd->resp.tmf,
GFP_NOIO) < 0)
goto out;
@@ -547,18 +654,57 @@ static int virtscsi_abort(struct scsi_cmnd *sc)
return virtscsi_tmf(vscsi, cmd);
}
-static struct scsi_host_template virtscsi_host_template = {
+static int virtscsi_target_alloc(struct scsi_target *starget)
+{
+ struct virtio_scsi_target_state *tgt =
+ kmalloc(sizeof(*tgt), GFP_KERNEL);
+ if (!tgt)
+ return -ENOMEM;
+
+ spin_lock_init(&tgt->tgt_lock);
+ atomic_set(&tgt->reqs, 0);
+ tgt->req_vq = NULL;
+
+ starget->hostdata = tgt;
+ return 0;
+}
+
+static void virtscsi_target_destroy(struct scsi_target *starget)
+{
+ struct virtio_scsi_target_state *tgt = starget->hostdata;
+ kfree(tgt);
+}
+
+static struct scsi_host_template virtscsi_host_template_single = {
+ .module = THIS_MODULE,
+ .name = "Virtio SCSI HBA",
+ .proc_name = "virtio_scsi",
+ .this_id = -1,
+ .queuecommand = virtscsi_queuecommand_single,
+ .eh_abort_handler = virtscsi_abort,
+ .eh_device_reset_handler = virtscsi_device_reset,
+
+ .can_queue = 1024,
+ .dma_boundary = UINT_MAX,
+ .use_clustering = ENABLE_CLUSTERING,
+ .target_alloc = virtscsi_target_alloc,
+ .target_destroy = virtscsi_target_destroy,
+};
+
+static struct scsi_host_template virtscsi_host_template_multi = {
.module = THIS_MODULE,
.name = "Virtio SCSI HBA",
.proc_name = "virtio_scsi",
- .queuecommand = virtscsi_queuecommand,
.this_id = -1,
+ .queuecommand = virtscsi_queuecommand_multi,
.eh_abort_handler = virtscsi_abort,
.eh_device_reset_handler = virtscsi_device_reset,
.can_queue = 1024,
.dma_boundary = UINT_MAX,
.use_clustering = ENABLE_CLUSTERING,
+ .target_alloc = virtscsi_target_alloc,
+ .target_destroy = virtscsi_target_destroy,
};
#define virtscsi_config_get(vdev, fld) \
@@ -578,29 +724,69 @@ static struct scsi_host_template virtscsi_host_template = {
&__val, sizeof(__val)); \
})
-static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
- struct virtqueue *vq)
+static void __virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity)
{
- spin_lock_init(&virtscsi_vq->vq_lock);
- virtscsi_vq->vq = vq;
+ int i;
+ int cpu;
+
+ /* In multiqueue mode, when the number of CPUs is equal
+ * to the number of request queues, we let each queue
+ * be private to one CPU by setting the affinity hint,
+ * to eliminate contention.
+ */
+ if ((vscsi->num_queues == 1 ||
+ vscsi->num_queues != num_online_cpus()) && affinity) {
+ if (vscsi->affinity_hint_set)
+ affinity = false;
+ else
+ return;
+ }
+
+ if (affinity) {
+ i = 0;
+ for_each_online_cpu(cpu) {
+ virtqueue_set_affinity(vscsi->req_vqs[i].vq, cpu);
+ i++;
+ }
+
+ vscsi->affinity_hint_set = true;
+ } else {
+ for (i = 0; i < vscsi->num_queues - VIRTIO_SCSI_VQ_BASE; i++)
+ virtqueue_set_affinity(vscsi->req_vqs[i].vq, -1);
+
+ vscsi->affinity_hint_set = false;
+ }
}
-static struct virtio_scsi_target_state *virtscsi_alloc_tgt(
- struct virtio_device *vdev, int sg_elems)
+static void virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity)
{
- struct virtio_scsi_target_state *tgt;
- gfp_t gfp_mask = GFP_KERNEL;
-
- /* We need extra sg elements at head and tail. */
- tgt = kmalloc(sizeof(*tgt) + sizeof(tgt->sg[0]) * (sg_elems + 2),
- gfp_mask);
+ get_online_cpus();
+ __virtscsi_set_affinity(vscsi, affinity);
+ put_online_cpus();
+}
- if (!tgt)
- return NULL;
+static int virtscsi_cpu_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ struct virtio_scsi *vscsi = container_of(nfb, struct virtio_scsi, nb);
+ switch(action) {
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ __virtscsi_set_affinity(vscsi, true);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
- spin_lock_init(&tgt->tgt_lock);
- sg_init_table(tgt->sg, sg_elems + 2);
- return tgt;
+static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
+ struct virtqueue *vq)
+{
+ spin_lock_init(&virtscsi_vq->vq_lock);
+ virtscsi_vq->vq = vq;
}
static void virtscsi_scan(struct virtio_device *vdev)
@@ -614,46 +800,56 @@ static void virtscsi_remove_vqs(struct virtio_device *vdev)
{
struct Scsi_Host *sh = virtio_scsi_host(vdev);
struct virtio_scsi *vscsi = shost_priv(sh);
- u32 i, num_targets;
+
+ virtscsi_set_affinity(vscsi, false);
/* Stop all the virtqueues. */
vdev->config->reset(vdev);
- num_targets = sh->max_id;
- for (i = 0; i < num_targets; i++) {
- kfree(vscsi->tgt[i]);
- vscsi->tgt[i] = NULL;
- }
-
vdev->config->del_vqs(vdev);
}
static int virtscsi_init(struct virtio_device *vdev,
- struct virtio_scsi *vscsi, int num_targets)
+ struct virtio_scsi *vscsi)
{
int err;
- struct virtqueue *vqs[3];
- u32 i, sg_elems;
+ u32 i;
+ u32 num_vqs;
+ vq_callback_t **callbacks;
+ const char **names;
+ struct virtqueue **vqs;
+
+ num_vqs = vscsi->num_queues + VIRTIO_SCSI_VQ_BASE;
+ vqs = kmalloc(num_vqs * sizeof(struct virtqueue *), GFP_KERNEL);
+ callbacks = kmalloc(num_vqs * sizeof(vq_callback_t *), GFP_KERNEL);
+ names = kmalloc(num_vqs * sizeof(char *), GFP_KERNEL);
+
+ if (!callbacks || !vqs || !names) {
+ err = -ENOMEM;
+ goto out;
+ }
- vq_callback_t *callbacks[] = {
- virtscsi_ctrl_done,
- virtscsi_event_done,
- virtscsi_req_done
- };
- const char *names[] = {
- "control",
- "event",
- "request"
- };
+ callbacks[0] = virtscsi_ctrl_done;
+ callbacks[1] = virtscsi_event_done;
+ names[0] = "control";
+ names[1] = "event";
+ for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs; i++) {
+ callbacks[i] = virtscsi_req_done;
+ names[i] = "request";
+ }
/* Discover virtqueues and write information to configuration. */
- err = vdev->config->find_vqs(vdev, 3, vqs, callbacks, names);
+ err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names);
if (err)
- return err;
+ goto out;
virtscsi_init_vq(&vscsi->ctrl_vq, vqs[0]);
virtscsi_init_vq(&vscsi->event_vq, vqs[1]);
- virtscsi_init_vq(&vscsi->req_vq, vqs[2]);
+ for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs; i++)
+ virtscsi_init_vq(&vscsi->req_vqs[i - VIRTIO_SCSI_VQ_BASE],
+ vqs[i]);
+
+ virtscsi_set_affinity(vscsi, true);
virtscsi_config_set(vdev, cdb_size, VIRTIO_SCSI_CDB_SIZE);
virtscsi_config_set(vdev, sense_size, VIRTIO_SCSI_SENSE_SIZE);
@@ -661,19 +857,12 @@ static int virtscsi_init(struct virtio_device *vdev,
if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG))
virtscsi_kick_event_all(vscsi);
- /* We need to know how many segments before we allocate. */
- sg_elems = virtscsi_config_get(vdev, seg_max) ?: 1;
-
- for (i = 0; i < num_targets; i++) {
- vscsi->tgt[i] = virtscsi_alloc_tgt(vdev, sg_elems);
- if (!vscsi->tgt[i]) {
- err = -ENOMEM;
- goto out;
- }
- }
err = 0;
out:
+ kfree(names);
+ kfree(callbacks);
+ kfree(vqs);
if (err)
virtscsi_remove_vqs(vdev);
return err;
@@ -686,13 +875,21 @@ static int virtscsi_probe(struct virtio_device *vdev)
int err;
u32 sg_elems, num_targets;
u32 cmd_per_lun;
+ u32 num_queues;
+ struct scsi_host_template *hostt;
+
+ /* We need to know how many queues before we allocate. */
+ num_queues = virtscsi_config_get(vdev, num_queues) ? : 1;
- /* Allocate memory and link the structs together. */
num_targets = virtscsi_config_get(vdev, max_target) + 1;
- shost = scsi_host_alloc(&virtscsi_host_template,
- sizeof(*vscsi)
- + num_targets * sizeof(struct virtio_scsi_target_state));
+ if (num_queues == 1)
+ hostt = &virtscsi_host_template_single;
+ else
+ hostt = &virtscsi_host_template_multi;
+
+ shost = scsi_host_alloc(hostt,
+ sizeof(*vscsi) + sizeof(vscsi->req_vqs[0]) * num_queues);
if (!shost)
return -ENOMEM;
@@ -700,12 +897,20 @@ static int virtscsi_probe(struct virtio_device *vdev)
shost->sg_tablesize = sg_elems;
vscsi = shost_priv(shost);
vscsi->vdev = vdev;
+ vscsi->num_queues = num_queues;
vdev->priv = shost;
- err = virtscsi_init(vdev, vscsi, num_targets);
+ err = virtscsi_init(vdev, vscsi);
if (err)
goto virtscsi_init_failed;
+ vscsi->nb.notifier_call = &virtscsi_cpu_callback;
+ err = register_hotcpu_notifier(&vscsi->nb);
+ if (err) {
+ pr_err("registering cpu notifier failed\n");
+ goto scsi_add_host_failed;
+ }
+
cmd_per_lun = virtscsi_config_get(vdev, cmd_per_lun) ?: 1;
shost->cmd_per_lun = min_t(u32, cmd_per_lun, shost->can_queue);
shost->max_sectors = virtscsi_config_get(vdev, max_sectors) ?: 0xFFFF;
@@ -743,6 +948,8 @@ static void virtscsi_remove(struct virtio_device *vdev)
scsi_remove_host(shost);
+ unregister_hotcpu_notifier(&vscsi->nb);
+
virtscsi_remove_vqs(vdev);
scsi_host_put(shost);
}
@@ -759,7 +966,7 @@ static int virtscsi_restore(struct virtio_device *vdev)
struct Scsi_Host *sh = virtio_scsi_host(vdev);
struct virtio_scsi *vscsi = shost_priv(sh);
- return virtscsi_init(vdev, vscsi, sh->max_id);
+ return virtscsi_init(vdev, vscsi);
}
#endif
@@ -794,8 +1001,7 @@ static int __init init(void)
virtscsi_cmd_cache = KMEM_CACHE(virtio_scsi_cmd, 0);
if (!virtscsi_cmd_cache) {
- printk(KERN_ERR "kmem_cache_create() for "
- "virtscsi_cmd_cache failed\n");
+ pr_err("kmem_cache_create() for virtscsi_cmd_cache failed\n");
goto error;
}
@@ -804,8 +1010,7 @@ static int __init init(void)
mempool_create_slab_pool(VIRTIO_SCSI_MEMPOOL_SZ,
virtscsi_cmd_cache);
if (!virtscsi_cmd_pool) {
- printk(KERN_ERR "mempool_create() for"
- "virtscsi_cmd_pool failed\n");
+ pr_err("mempool_create() for virtscsi_cmd_pool failed\n");
goto error;
}
ret = register_virtio_driver(&virtio_scsi_driver);
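
The long comment above struct virtio_scsi_target_state spells out the steering policy: a target with requests in flight keeps using its current queue so FIFO order is preserved, while an idle target may be moved to the queue owned by the submitting CPU. A condensed, hedged sketch of that decision, with the barrier details from the comment left out and hypothetical demo_* names:

#include <linux/atomic.h>
#include <linux/smp.h>
#include <linux/spinlock.h>

struct demo_tgt {
	spinlock_t lock;	/* serializes writes of cur_q */
	atomic_t reqs;		/* requests in flight for this target */
	unsigned int cur_q;	/* queue the in-flight requests use */
};

/* Pick a request queue for the next command to this target. */
static unsigned int demo_pick_queue(struct demo_tgt *tgt, unsigned int nqueues)
{
	unsigned long flags;
	unsigned int q;

	spin_lock_irqsave(&tgt->lock, flags);
	if (atomic_inc_return(&tgt->reqs) > 1)
		q = tgt->cur_q;		/* busy target: keep FIFO order */
	else
		q = tgt->cur_q = smp_processor_id() % nqueues;
	spin_unlock_irqrestore(&tgt->lock, flags);

	return q;
}

/* Completion path: the target may become idle again. */
static void demo_complete(struct demo_tgt *tgt)
{
	atomic_dec(&tgt->reqs);
}

When the number of request queues matches the number of online CPUs, __virtscsi_set_affinity() pins each request virtqueue to one CPU, so the smp_processor_id() choice also keeps completions local to the submitting CPU.
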
diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 26a64e5b8a58..8b9226da3f54 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -1,6 +1,7 @@
config VHOST_NET
tristate "Host kernel accelerator for virtio net"
depends on NET && EVENTFD && (TUN || !TUN) && (MACVTAP || !MACVTAP)
+ select VHOST_RING
---help---
This kernel module can be loaded in host kernel to accelerate
guest networking with virtio_net. Not to be confused with virtio_net
@@ -12,7 +13,14 @@ config VHOST_NET
config VHOST_SCSI
tristate "VHOST_SCSI TCM fabric driver"
depends on TARGET_CORE && EVENTFD && m
+ select VHOST_RING
default n
---help---
Say M here to enable the vhost_scsi TCM fabric module
for use with virtio-scsi guests
+
+config VHOST_RING
+ tristate
+ ---help---
+ This option is selected by any driver which needs to access
+ the host side of a virtio ring.
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index ef21d5fdfa7d..654e9afb11f5 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -3,3 +3,5 @@ vhost_net-y := vhost.o net.o
obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o
vhost_scsi-y := scsi.o
+
+obj-$(CONFIG_VHOST_RING) += vringh.o
diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c
index be65414d5bb1..1ee45bc85f67 100644
--- a/drivers/vhost/test.c
+++ b/drivers/vhost/test.c
@@ -282,7 +282,9 @@ static long vhost_test_ioctl(struct file *f, unsigned int ioctl,
return vhost_test_reset_owner(n);
default:
mutex_lock(&n->dev.mutex);
- r = vhost_dev_ioctl(&n->dev, ioctl, arg);
+ r = vhost_dev_ioctl(&n->dev, ioctl, argp);
+ if (r == -ENOIOCTLCMD)
+ r = vhost_vring_ioctl(&n->dev, ioctl, argp);
vhost_test_flush(n);
mutex_unlock(&n->dev.mutex);
return r;
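
The new drivers/vhost/vringh.c below provides accessors for the host side of a virtio ring; caif_virtio's cfv_rx_poll() above is its first in-kernel user. A minimal, hedged sketch of the kernel-space getdesc/complete cycle, assuming a struct vringh already set up with vringh_init_kern(); the demo_* names are hypothetical:

#include <linux/uio.h>
#include <linux/vringh.h>

/* Pull up to buf_len bytes out of the next available descriptor chain. */
static ssize_t demo_consume_one(struct vringh *vrh, void *buf, size_t buf_len)
{
	struct vringh_kiov riov;
	struct kvec kvec[8];
	ssize_t len;
	u16 head;
	int err;

	vringh_kiov_init(&riov, kvec, ARRAY_SIZE(kvec));

	/* Fetch the device-readable half of the next descriptor chain. */
	err = vringh_getdesc_kern(vrh, &riov, NULL, &head, GFP_KERNEL);
	if (err <= 0)
		return err;	/* 0: ring empty, < 0: malformed ring */

	len = vringh_iov_pull_kern(&riov, buf, buf_len);

	/* Hand the chain back on the used ring; we wrote 0 bytes into it. */
	err = vringh_complete_kern(vrh, head, 0);

	/* Free the iovec if it had to grow beyond kvec[]. */
	vringh_kiov_cleanup(&riov);

	if (!err && len >= 0 && vringh_need_notify_kern(vrh) > 0)
		vringh_notify(vrh);

	return err ? err : len;
}

cfv_rx_poll() above follows the same cycle, but keeps its kiov across NAPI polls and maps the descriptor addresses with phys_to_virt() instead of copying through vringh_iov_pull_kern().
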
diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c
new file mode 100644
index 000000000000..bff0775e258c
--- /dev/null
+++ b/drivers/vhost/vringh.c
@@ -0,0 +1,1007 @@
+/*
+ * Helpers for the host side of a virtio ring.
+ *
+ * Since these may be in userspace, we use (inline) accessors.
+ */
+#include <linux/vringh.h>
+#include <linux/virtio_ring.h>
+#include <linux/kernel.h>
+#include <linux/ratelimit.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+
+static __printf(1,2) __cold void vringh_bad(const char *fmt, ...)
+{
+ static DEFINE_RATELIMIT_STATE(vringh_rs,
+ DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+ if (__ratelimit(&vringh_rs)) {
+ va_list ap;
+ va_start(ap, fmt);
+ printk(KERN_NOTICE "vringh:");
+ vprintk(fmt, ap);
+ va_end(ap);
+ }
+}
+
+/* Returns vring->num if empty, -ve on error. */
+static inline int __vringh_get_head(const struct vringh *vrh,
+ int (*getu16)(u16 *val, const u16 *p),
+ u16 *last_avail_idx)
+{
+ u16 avail_idx, i, head;
+ int err;
+
+ err = getu16(&avail_idx, &vrh->vring.avail->idx);
+ if (err) {
+ vringh_bad("Failed to access avail idx at %p",
+ &vrh->vring.avail->idx);
+ return err;
+ }
+
+ if (*last_avail_idx == avail_idx)
+ return vrh->vring.num;
+
+ /* Only get avail ring entries after they have been exposed by guest. */
+ virtio_rmb(vrh->weak_barriers);
+
+ i = *last_avail_idx & (vrh->vring.num - 1);
+
+ err = getu16(&head, &vrh->vring.avail->ring[i]);
+ if (err) {
+ vringh_bad("Failed to read head: idx %d address %p",
+ *last_avail_idx, &vrh->vring.avail->ring[i]);
+ return err;
+ }
+
+ if (head >= vrh->vring.num) {
+ vringh_bad("Guest says index %u > %u is available",
+ head, vrh->vring.num);
+ return -EINVAL;
+ }
+
+ (*last_avail_idx)++;
+ return head;
+}
+
+/* Copy some bytes to/from the iovec. Returns num copied. */
+static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov,
+ void *ptr, size_t len,
+ int (*xfer)(void *addr, void *ptr,
+ size_t len))
+{
+ int err, done = 0;
+
+ while (len && iov->i < iov->used) {
+ size_t partlen;
+
+ partlen = min(iov->iov[iov->i].iov_len, len);
+ err = xfer(iov->iov[iov->i].iov_base, ptr, partlen);
+ if (err)
+ return err;
+ done += partlen;
+ len -= partlen;
+ ptr += partlen;
+ iov->consumed += partlen;
+ iov->iov[iov->i].iov_len -= partlen;
+ iov->iov[iov->i].iov_base += partlen;
+
+ if (!iov->iov[iov->i].iov_len) {
+ /* Fix up old iov element then increment. */
+ iov->iov[iov->i].iov_len = iov->consumed;
+ iov->iov[iov->i].iov_base -= iov->consumed;
+
+ iov->consumed = 0;
+ iov->i++;
+ }
+ }
+ return done;
+}
+
+/* May reduce *len if range is shorter. */
+static inline bool range_check(struct vringh *vrh, u64 addr, size_t *len,
+ struct vringh_range *range,
+ bool (*getrange)(struct vringh *,
+ u64, struct vringh_range *))
+{
+ if (addr < range->start || addr > range->end_incl) {
+ if (!getrange(vrh, addr, range))
+ return false;
+ }
+ BUG_ON(addr < range->start || addr > range->end_incl);
+
+ /* To end of memory? */
+ if (unlikely(addr + *len == 0)) {
+ if (range->end_incl == -1ULL)
+ return true;
+ goto truncate;
+ }
+
+ /* Otherwise, don't wrap. */
+ if (addr + *len < addr) {
+ vringh_bad("Wrapping descriptor %zu@0x%llx",
+ *len, (unsigned long long)addr);
+ return false;
+ }
+
+ if (unlikely(addr + *len - 1 > range->end_incl))
+ goto truncate;
+ return true;
+
+truncate:
+ *len = range->end_incl + 1 - addr;
+ return true;
+}
+
+static inline bool no_range_check(struct vringh *vrh, u64 addr, size_t *len,
+ struct vringh_range *range,
+ bool (*getrange)(struct vringh *,
+ u64, struct vringh_range *))
+{
+ return true;
+}
+
+/* No reason for this code to be inline. */
+static int move_to_indirect(int *up_next, u16 *i, void *addr,
+ const struct vring_desc *desc,
+ struct vring_desc **descs, int *desc_max)
+{
+ /* Indirect tables can't have indirect. */
+ if (*up_next != -1) {
+ vringh_bad("Multilevel indirect %u->%u", *up_next, *i);
+ return -EINVAL;
+ }
+
+ if (unlikely(desc->len % sizeof(struct vring_desc))) {
+ vringh_bad("Strange indirect len %u", desc->len);
+ return -EINVAL;
+ }
+
+ /* We will check this when we follow it! */
+ if (desc->flags & VRING_DESC_F_NEXT)
+ *up_next = desc->next;
+ else
+ *up_next = -2;
+ *descs = addr;
+ *desc_max = desc->len / sizeof(struct vring_desc);
+
+ /* Now, start at the first indirect. */
+ *i = 0;
+ return 0;
+}
+
+static int resize_iovec(struct vringh_kiov *iov, gfp_t gfp)
+{
+ struct kvec *new;
+ unsigned int flag, new_num = (iov->max_num & ~VRINGH_IOV_ALLOCATED) * 2;
+
+ if (new_num < 8)
+ new_num = 8;
+
+ flag = (iov->max_num & VRINGH_IOV_ALLOCATED);
+ if (flag)
+ new = krealloc(iov->iov, new_num * sizeof(struct iovec), gfp);
+ else {
+ new = kmalloc(new_num * sizeof(struct iovec), gfp);
+ if (new) {
+ memcpy(new, iov->iov,
+ iov->max_num * sizeof(struct iovec));
+ flag = VRINGH_IOV_ALLOCATED;
+ }
+ }
+ if (!new)
+ return -ENOMEM;
+ iov->iov = new;
+ iov->max_num = (new_num | flag);
+ return 0;
+}
+
+static u16 __cold return_from_indirect(const struct vringh *vrh, int *up_next,
+ struct vring_desc **descs, int *desc_max)
+{
+ u16 i = *up_next;
+
+ *up_next = -1;
+ *descs = vrh->vring.desc;
+ *desc_max = vrh->vring.num;
+ return i;
+}
+
+static int slow_copy(struct vringh *vrh, void *dst, const void *src,
+ bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len,
+ struct vringh_range *range,
+ bool (*getrange)(struct vringh *vrh,
+ u64,
+ struct vringh_range *)),
+ bool (*getrange)(struct vringh *vrh,
+ u64 addr,
+ struct vringh_range *r),
+ struct vringh_range *range,
+ int (*copy)(void *dst, const void *src, size_t len))
+{
+ size_t part, len = sizeof(struct vring_desc);
+
+ do {
+ u64 addr;
+ int err;
+
+ part = len;
+ addr = (u64)(unsigned long)src - range->offset;
+
+ if (!rcheck(vrh, addr, &part, range, getrange))
+ return -EINVAL;
+
+ err = copy(dst, src, part);
+ if (err)
+ return err;
+
+ dst += part;
+ src += part;
+ len -= part;
+ } while (len);
+ return 0;
+}
+
+static inline int
+__vringh_iov(struct vringh *vrh, u16 i,
+ struct vringh_kiov *riov,
+ struct vringh_kiov *wiov,
+ bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len,
+ struct vringh_range *range,
+ bool (*getrange)(struct vringh *, u64,
+ struct vringh_range *)),
+ bool (*getrange)(struct vringh *, u64, struct vringh_range *),
+ gfp_t gfp,
+ int (*copy)(void *dst, const void *src, size_t len))
+{
+ int err, count = 0, up_next, desc_max;
+ struct vring_desc desc, *descs;
+ struct vringh_range range = { -1ULL, 0 }, slowrange;
+ bool slow = false;
+
+ /* We start traversing vring's descriptor table. */
+ descs = vrh->vring.desc;
+ desc_max = vrh->vring.num;
+ up_next = -1;
+
+ if (riov)
+ riov->i = riov->used = 0;
+ else if (wiov)
+ wiov->i = wiov->used = 0;
+ else
+ /* You must want something! */
+ BUG();
+
+ for (;;) {
+ void *addr;
+ struct vringh_kiov *iov;
+ size_t len;
+
+ if (unlikely(slow))
+ err = slow_copy(vrh, &desc, &descs[i], rcheck, getrange,
+ &slowrange, copy);
+ else
+ err = copy(&desc, &descs[i], sizeof(desc));
+ if (unlikely(err))
+ goto fail;
+
+ if (unlikely(desc.flags & VRING_DESC_F_INDIRECT)) {
+ /* Make sure it's OK, and get offset. */
+ len = desc.len;
+ if (!rcheck(vrh, desc.addr, &len, &range, getrange)) {
+ err = -EINVAL;
+ goto fail;
+ }
+
+ if (unlikely(len != desc.len)) {
+ slow = true;
+ /* We need to save this range to use offset */
+ slowrange = range;
+ }
+
+ addr = (void *)(long)(desc.addr + range.offset);
+ err = move_to_indirect(&up_next, &i, addr, &desc,
+ &descs, &desc_max);
+ if (err)
+ goto fail;
+ continue;
+ }
+
+ if (count++ == vrh->vring.num) {
+ vringh_bad("Descriptor loop in %p", descs);
+ err = -ELOOP;
+ goto fail;
+ }
+
+ if (desc.flags & VRING_DESC_F_WRITE)
+ iov = wiov;
+ else {
+ iov = riov;
+ if (unlikely(wiov && wiov->i)) {
+ vringh_bad("Readable desc %p after writable",
+ &descs[i]);
+ err = -EINVAL;
+ goto fail;
+ }
+ }
+
+ if (!iov) {
+ vringh_bad("Unexpected %s desc",
+ !wiov ? "writable" : "readable");
+ err = -EPROTO;
+ goto fail;
+ }
+
+ again:
+ /* Make sure it's OK, and get offset. */
+ len = desc.len;
+ if (!rcheck(vrh, desc.addr, &len, &range, getrange)) {
+ err = -EINVAL;
+ goto fail;
+ }
+ addr = (void *)(unsigned long)(desc.addr + range.offset);
+
+ if (unlikely(iov->used == (iov->max_num & ~VRINGH_IOV_ALLOCATED))) {
+ err = resize_iovec(iov, gfp);
+ if (err)
+ goto fail;
+ }
+
+ iov->iov[iov->used].iov_base = addr;
+ iov->iov[iov->used].iov_len = len;
+ iov->used++;
+
+ if (unlikely(len != desc.len)) {
+ desc.len -= len;
+ desc.addr += len;
+ goto again;
+ }
+
+ if (desc.flags & VRING_DESC_F_NEXT) {
+ i = desc.next;
+ } else {
+ /* Just in case we need to finish traversing above. */
+ if (unlikely(up_next > 0)) {
+ i = return_from_indirect(vrh, &up_next,
+ &descs, &desc_max);
+ slow = false;
+ } else
+ break;
+ }
+
+ if (i >= desc_max) {
+ vringh_bad("Chained index %u > %u", i, desc_max);
+ err = -EINVAL;
+ goto fail;
+ }
+ }
+
+ return 0;
+
+fail:
+ return err;
+}
+
+static inline int __vringh_complete(struct vringh *vrh,
+ const struct vring_used_elem *used,
+ unsigned int num_used,
+ int (*putu16)(u16 *p, u16 val),
+ int (*putused)(struct vring_used_elem *dst,
+ const struct vring_used_elem
+ *src, unsigned num))
+{
+ struct vring_used *used_ring;
+ int err;
+ u16 used_idx, off;
+
+ used_ring = vrh->vring.used;
+ used_idx = vrh->last_used_idx + vrh->completed;
+
+ off = used_idx % vrh->vring.num;
+
+ /* Compiler knows num_used == 1 sometimes, hence extra check */
+ if (num_used > 1 && unlikely(off + num_used >= vrh->vring.num)) {
+ u16 part = vrh->vring.num - off;
+ err = putused(&used_ring->ring[off], used, part);
+ if (!err)
+ err = putused(&used_ring->ring[0], used + part,
+ num_used - part);
+ } else
+ err = putused(&used_ring->ring[off], used, num_used);
+
+ if (err) {
+ vringh_bad("Failed to write %u used entries %u at %p",
+ num_used, off, &used_ring->ring[off]);
+ return err;
+ }
+
+ /* Make sure buffer is written before we update index. */
+ virtio_wmb(vrh->weak_barriers);
+
+ err = putu16(&vrh->vring.used->idx, used_idx + num_used);
+ if (err) {
+ vringh_bad("Failed to update used index at %p",
+ &vrh->vring.used->idx);
+ return err;
+ }
+
+ vrh->completed += num_used;
+ return 0;
+}
+
+
+static inline int __vringh_need_notify(struct vringh *vrh,
+ int (*getu16)(u16 *val, const u16 *p))
+{
+ bool notify;
+ u16 used_event;
+ int err;
+
+ /* Flush out used index update. This is paired with the
+ * barrier that the Guest executes when enabling
+ * interrupts. */
+ virtio_mb(vrh->weak_barriers);
+
+ /* Old-style, without event indices. */
+ if (!vrh->event_indices) {
+ u16 flags;
+ err = getu16(&flags, &vrh->vring.avail->flags);
+ if (err) {
+ vringh_bad("Failed to get flags at %p",
+ &vrh->vring.avail->flags);
+ return err;
+ }
+ return (!(flags & VRING_AVAIL_F_NO_INTERRUPT));
+ }
+
+ /* Modern: we know when other side wants to know. */
+ err = getu16(&used_event, &vring_used_event(&vrh->vring));
+ if (err) {
+ vringh_bad("Failed to get used event idx at %p",
+ &vring_used_event(&vrh->vring));
+ return err;
+ }
+
+ /* Just in case we added so many that we wrap. */
+ if (unlikely(vrh->completed > 0xffff))
+ notify = true;
+ else
+ notify = vring_need_event(used_event,
+ vrh->last_used_idx + vrh->completed,
+ vrh->last_used_idx);
+
+ vrh->last_used_idx += vrh->completed;
+ vrh->completed = 0;
+ return notify;
+}
+
+static inline bool __vringh_notify_enable(struct vringh *vrh,
+ int (*getu16)(u16 *val, const u16 *p),
+ int (*putu16)(u16 *p, u16 val))
+{
+ u16 avail;
+
+ if (!vrh->event_indices) {
+ /* Old-school; update flags. */
+ if (putu16(&vrh->vring.used->flags, 0) != 0) {
+ vringh_bad("Clearing used flags %p",
+ &vrh->vring.used->flags);
+ return true;
+ }
+ } else {
+ if (putu16(&vring_avail_event(&vrh->vring),
+ vrh->last_avail_idx) != 0) {
+ vringh_bad("Updating avail event index %p",
+ &vring_avail_event(&vrh->vring));
+ return true;
+ }
+ }
+
+ /* They could have slipped one in as we were doing that: make
+ * sure it's written, then check again. */
+ virtio_mb(vrh->weak_barriers);
+
+ if (getu16(&avail, &vrh->vring.avail->idx) != 0) {
+ vringh_bad("Failed to check avail idx at %p",
+ &vrh->vring.avail->idx);
+ return true;
+ }
+
+ /* This is unlikely, so we just leave notifications enabled
+ * (if we're using event_indices, we'll only get one
+ * notification anyway). */
+ return avail == vrh->last_avail_idx;
+}
+
+static inline void __vringh_notify_disable(struct vringh *vrh,
+ int (*putu16)(u16 *p, u16 val))
+{
+ if (!vrh->event_indices) {
+ /* Old-school; update flags. */
+ if (putu16(&vrh->vring.used->flags, VRING_USED_F_NO_NOTIFY)) {
+ vringh_bad("Setting used flags %p",
+ &vrh->vring.used->flags);
+ }
+ }
+}
+
+/* Userspace access helpers: in this case, addresses are really userspace. */
+static inline int getu16_user(u16 *val, const u16 *p)
+{
+ return get_user(*val, (__force u16 __user *)p);
+}
+
+static inline int putu16_user(u16 *p, u16 val)
+{
+ return put_user(val, (__force u16 __user *)p);
+}
+
+static inline int copydesc_user(void *dst, const void *src, size_t len)
+{
+ return copy_from_user(dst, (__force void __user *)src, len) ?
+ -EFAULT : 0;
+}
+
+static inline int putused_user(struct vring_used_elem *dst,
+ const struct vring_used_elem *src,
+ unsigned int num)
+{
+ return copy_to_user((__force void __user *)dst, src,
+ sizeof(*dst) * num) ? -EFAULT : 0;
+}
+
+static inline int xfer_from_user(void *src, void *dst, size_t len)
+{
+ return copy_from_user(dst, (__force void __user *)src, len) ?
+ -EFAULT : 0;
+}
+
+static inline int xfer_to_user(void *dst, void *src, size_t len)
+{
+ return copy_to_user((__force void __user *)dst, src, len) ?
+ -EFAULT : 0;
+}
+
+/**
+ * vringh_init_user - initialize a vringh for a userspace vring.
+ * @vrh: the vringh to initialize.
+ * @features: the feature bits for this ring.
+ * @num: the number of elements.
+ * @weak_barriers: true if we only need memory barriers, not I/O.
+ * @desc: the userspace descriptor pointer.
+ * @avail: the userspace avail pointer.
+ * @used: the userspace used pointer.
+ *
+ * Returns an error if num is invalid: you should check pointers
+ * yourself!
+ */
+int vringh_init_user(struct vringh *vrh, u32 features,
+ unsigned int num, bool weak_barriers,
+ struct vring_desc __user *desc,
+ struct vring_avail __user *avail,
+ struct vring_used __user *used)
+{
+ /* Sane power of 2 please! */
+ if (!num || num > 0xffff || (num & (num - 1))) {
+ vringh_bad("Bad ring size %u", num);
+ return -EINVAL;
+ }
+
+ vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX));
+ vrh->weak_barriers = weak_barriers;
+ vrh->completed = 0;
+ vrh->last_avail_idx = 0;
+ vrh->last_used_idx = 0;
+ vrh->vring.num = num;
+ /* vring expects kernel addresses, but only used via accessors. */
+ vrh->vring.desc = (__force struct vring_desc *)desc;
+ vrh->vring.avail = (__force struct vring_avail *)avail;
+ vrh->vring.used = (__force struct vring_used *)used;
+ return 0;
+}
+EXPORT_SYMBOL(vringh_init_user);
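
As a rough illustration of the intended use, a host-side driver that receives the ring layout from userspace (say, through an ioctl) might set up its vringh like this; the `layout` structure and the feature word are hypothetical, and the three pointers still need to be validated by the caller:

	struct vringh vrh;
	u32 features = 1 << VIRTIO_RING_F_EVENT_IDX;	/* hypothetical feature set */
	int err;

	err = vringh_init_user(&vrh, features, layout.num, true,
			       (struct vring_desc __user *)layout.desc,
			       (struct vring_avail __user *)layout.avail,
			       (struct vring_used __user *)layout.used);
	if (err)
		return err;	/* only checks that num is a sane power of 2 */
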
+
+/**
+ * vringh_getdesc_user - get next available descriptor from userspace ring.
+ * @vrh: the userspace vring.
+ * @riov: where to put the readable descriptors (or NULL)
+ * @wiov: where to put the writable descriptors (or NULL)
+ * @getrange: function to call to check ranges.
+ * @head: head index we received, for passing to vringh_complete_user().
+ *
+ * Returns 0 if there was no descriptor, 1 if there was, or -errno.
+ *
+ * Note that on error return, you can tell the difference between an
+ * invalid ring and a single invalid descriptor: in the former case,
+ * *head will be vrh->vring.num. You may be able to ignore an invalid
+ * descriptor, but there's not much you can do with an invalid ring.
+ *
+ * Note that you may need to clean up riov and wiov, even on error!
+ */
+int vringh_getdesc_user(struct vringh *vrh,
+ struct vringh_iov *riov,
+ struct vringh_iov *wiov,
+ bool (*getrange)(struct vringh *vrh,
+ u64 addr, struct vringh_range *r),
+ u16 *head)
+{
+ int err;
+
+ *head = vrh->vring.num;
+ err = __vringh_get_head(vrh, getu16_user, &vrh->last_avail_idx);
+ if (err < 0)
+ return err;
+
+ /* Empty... */
+ if (err == vrh->vring.num)
+ return 0;
+
+ /* We need the layouts to be identical for this to work */
+ BUILD_BUG_ON(sizeof(struct vringh_kiov) != sizeof(struct vringh_iov));
+ BUILD_BUG_ON(offsetof(struct vringh_kiov, iov) !=
+ offsetof(struct vringh_iov, iov));
+ BUILD_BUG_ON(offsetof(struct vringh_kiov, i) !=
+ offsetof(struct vringh_iov, i));
+ BUILD_BUG_ON(offsetof(struct vringh_kiov, used) !=
+ offsetof(struct vringh_iov, used));
+ BUILD_BUG_ON(offsetof(struct vringh_kiov, max_num) !=
+ offsetof(struct vringh_iov, max_num));
+ BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
+ BUILD_BUG_ON(offsetof(struct iovec, iov_base) !=
+ offsetof(struct kvec, iov_base));
+ BUILD_BUG_ON(offsetof(struct iovec, iov_len) !=
+ offsetof(struct kvec, iov_len));
+ BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_base)
+ != sizeof(((struct kvec *)NULL)->iov_base));
+ BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_len)
+ != sizeof(((struct kvec *)NULL)->iov_len));
+
+ *head = err;
+ err = __vringh_iov(vrh, *head, (struct vringh_kiov *)riov,
+ (struct vringh_kiov *)wiov,
+ range_check, getrange, GFP_KERNEL, copydesc_user);
+ if (err)
+ return err;
+
+ return 1;
+}
+EXPORT_SYMBOL(vringh_getdesc_user);
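
A minimal sketch of fetching one buffer, assuming struct vringh_iov carries a struct iovec array as the BUILD_BUG_ONs above imply; my_getrange() is a caller-supplied callback that validates a guest address range and fills in the matching vringh_range (including the translation offset used above):

	/* Caller-defined: check addr is within an allowed region, fill *r. */
	static bool my_getrange(struct vringh *vrh, u64 addr,
				struct vringh_range *r);

	struct iovec r_stack[8], w_stack[8];	/* arbitrary sizes */
	struct vringh_iov riov = { .iov = r_stack, .max_num = 8 };
	struct vringh_iov wiov = { .iov = w_stack, .max_num = 8 };
	u16 head;
	int err;

	err = vringh_getdesc_user(&vrh, &riov, &wiov, my_getrange, &head);
	if (err < 0)
		return err;	/* bad ring or bad descriptor */
	if (err == 0)
		return 0;	/* nothing available */
	/* riov/wiov now describe the readable/writable parts of the buffer */
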
+
+/**
+ * vringh_iov_pull_user - copy bytes from vring_iov.
+ * @riov: the riov as passed to vringh_getdesc_user() (updated as we consume)
+ * @dst: the place to copy.
+ * @len: the maximum length to copy.
+ *
+ * Returns the bytes copied <= len or a negative errno.
+ */
+ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len)
+{
+ return vringh_iov_xfer((struct vringh_kiov *)riov,
+ dst, len, xfer_from_user);
+}
+EXPORT_SYMBOL(vringh_iov_pull_user);
+
+/**
+ * vringh_iov_push_user - copy bytes into vring_iov.
+ * @wiov: the wiov as passed to vringh_getdesc_user() (updated as we consume)
+ * @dst: the place to copy.
+ * @len: the maximum length to copy.
+ *
+ * Returns the bytes copied <= len or a negative errno.
+ */
+ssize_t vringh_iov_push_user(struct vringh_iov *wiov,
+ const void *src, size_t len)
+{
+ return vringh_iov_xfer((struct vringh_kiov *)wiov,
+ (void *)src, len, xfer_to_user);
+}
+EXPORT_SYMBOL(vringh_iov_push_user);
+
+/**
+ * vringh_abandon_user - we've decided not to handle the descriptor(s).
+ * @vrh: the vring.
+ * @num: the number of descriptors to put back (i.e. the number of
+ * vringh_getdesc_user() calls to undo).
+ *
+ * The next vringh_getdesc_user() will return the old descriptor(s) again.
+ */
+void vringh_abandon_user(struct vringh *vrh, unsigned int num)
+{
+ /* We only update vring_avail_event(vr) when we want to be notified,
+ * so we haven't changed that yet. */
+ vrh->last_avail_idx -= num;
+}
+EXPORT_SYMBOL(vringh_abandon_user);
+
+/**
+ * vringh_complete_user - we've finished with descriptor, publish it.
+ * @vrh: the vring.
+ * @head: the head as filled in by vringh_getdesc_user.
+ * @len: the length of data we have written.
+ *
+ * You should check vringh_need_notify_user() after one or more calls
+ * to this function.
+ */
+int vringh_complete_user(struct vringh *vrh, u16 head, u32 len)
+{
+ struct vring_used_elem used;
+
+ used.id = head;
+ used.len = len;
+ return __vringh_complete(vrh, &used, 1, putu16_user, putused_user);
+}
+EXPORT_SYMBOL(vringh_complete_user);
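
Continuing that sketch: once the readable part has been consumed and the reply written, the buffer is returned to the guest and a notification is sent if needed (req, resp and notify_guest() are hypothetical):

	ssize_t got, put;

	got = vringh_iov_pull_user(&riov, &req, sizeof(req));
	if (got < 0)
		return got;
	put = vringh_iov_push_user(&wiov, &resp, sizeof(resp));
	if (put < 0)
		return put;

	err = vringh_complete_user(&vrh, head, put);
	if (err)
		return err;
	if (vringh_need_notify_user(&vrh) > 0)
		notify_guest();		/* e.g. signal an eventfd */
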
+
+/**
+ * vringh_complete_multi_user - we've finished with many descriptors.
+ * @vrh: the vring.
+ * @used: the head, length pairs.
+ * @num_used: the number of used elements.
+ *
+ * You should check vringh_need_notify_user() after one or more calls
+ * to this function.
+ */
+int vringh_complete_multi_user(struct vringh *vrh,
+ const struct vring_used_elem used[],
+ unsigned num_used)
+{
+ return __vringh_complete(vrh, used, num_used,
+ putu16_user, putused_user);
+}
+EXPORT_SYMBOL(vringh_complete_multi_user);
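
Batched completion looks the same, with (head, len) pairs collected from earlier vringh_getdesc_user() calls (head1/head2 and the lengths here are hypothetical):

	struct vring_used_elem batch[] = {
		{ .id = head1, .len = len1 },
		{ .id = head2, .len = len2 },
	};

	err = vringh_complete_multi_user(&vrh, batch, ARRAY_SIZE(batch));
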
+
+/**
+ * vringh_notify_enable_user - we want to know if something changes.
+ * @vrh: the vring.
+ *
+ * This always enables notifications, but returns false if there are
+ * now more buffers available in the vring.
+ */
+bool vringh_notify_enable_user(struct vringh *vrh)
+{
+ return __vringh_notify_enable(vrh, getu16_user, putu16_user);
+}
+EXPORT_SYMBOL(vringh_notify_enable_user);
+
+/**
+ * vringh_notify_disable_user - don't tell us if something changes.
+ * @vrh: the vring.
+ *
+ * This is our normal running state: we disable and then only enable when
+ * we're going to sleep.
+ */
+void vringh_notify_disable_user(struct vringh *vrh)
+{
+ __vringh_notify_disable(vrh, putu16_user);
+}
+EXPORT_SYMBOL(vringh_notify_disable_user);
+
+/**
+ * vringh_need_notify_user - must we tell the other side about used buffers?
+ * @vrh: the vring we've called vringh_complete_user() on.
+ *
+ * Returns -errno or 0 if we don't need to tell the other side, 1 if we do.
+ */
+int vringh_need_notify_user(struct vringh *vrh)
+{
+ return __vringh_need_notify(vrh, getu16_user);
+}
+EXPORT_SYMBOL(vringh_need_notify_user);
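
Put together, a service loop typically disables notifications while it has work, then re-enables them and sleeps; process_one() and wait_for_kick() stand in for driver-specific code in this sketch:

	for (;;) {
		vringh_notify_disable_user(&vrh);
		while (process_one(&vrh))
			;
		if (!vringh_notify_enable_user(&vrh))
			continue;	/* more buffers slipped in: keep going */
		wait_for_kick();	/* sleep until the guest notifies us */
	}
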
+
+/* Kernelspace access helpers. */
+static inline int getu16_kern(u16 *val, const u16 *p)
+{
+ *val = ACCESS_ONCE(*p);
+ return 0;
+}
+
+static inline int putu16_kern(u16 *p, u16 val)
+{
+ ACCESS_ONCE(*p) = val;
+ return 0;
+}
+
+static inline int copydesc_kern(void *dst, const void *src, size_t len)
+{
+ memcpy(dst, src, len);
+ return 0;
+}
+
+static inline int putused_kern(struct vring_used_elem *dst,
+ const struct vring_used_elem *src,
+ unsigned int num)
+{
+ memcpy(dst, src, num * sizeof(*dst));
+ return 0;
+}
+
+static inline int xfer_kern(void *src, void *dst, size_t len)
+{
+ memcpy(dst, src, len);
+ return 0;
+}
+
+/**
+ * vringh_init_kern - initialize a vringh for a kernelspace vring.
+ * @vrh: the vringh to initialize.
+ * @features: the feature bits for this ring.
+ * @num: the number of elements.
+ * @weak_barriers: true if we only need memory barriers, not I/O.
+ * @desc: the kernelspace descriptor pointer.
+ * @avail: the kernelspace avail pointer.
+ * @used: the kernelspace used pointer.
+ *
+ * Returns an error if num is invalid.
+ */
+int vringh_init_kern(struct vringh *vrh, u32 features,
+ unsigned int num, bool weak_barriers,
+ struct vring_desc *desc,
+ struct vring_avail *avail,
+ struct vring_used *used)
+{
+ /* Sane power of 2 please! */
+ if (!num || num > 0xffff || (num & (num - 1))) {
+ vringh_bad("Bad ring size %u", num);
+ return -EINVAL;
+ }
+
+ vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX));
+ vrh->weak_barriers = weak_barriers;
+ vrh->completed = 0;
+ vrh->last_avail_idx = 0;
+ vrh->last_used_idx = 0;
+ vrh->vring.num = num;
+ vrh->vring.desc = desc;
+ vrh->vring.avail = avail;
+ vrh->vring.used = used;
+ return 0;
+}
+EXPORT_SYMBOL(vringh_init_kern);
+
+/**
+ * vringh_getdesc_kern - get next available descriptor from kernelspace ring.
+ * @vrh: the kernelspace vring.
+ * @riov: where to put the readable descriptors (or NULL)
+ * @wiov: where to put the writable descriptors (or NULL)
+ * @head: head index we received, for passing to vringh_complete_kern().
+ * @gfp: flags for allocating larger riov/wiov.
+ *
+ * Returns 0 if there was no descriptor, 1 if there was, or -errno.
+ *
+ * Note that on error return, you can tell the difference between an
+ * invalid ring and a single invalid descriptor: in the former case,
+ * *head will be vrh->vring.num. You may be able to ignore an invalid
+ * descriptor, but there's not much you can do with an invalid ring.
+ *
+ * Note that you may need to clean up riov and wiov, even on error!
+ */
+int vringh_getdesc_kern(struct vringh *vrh,
+ struct vringh_kiov *riov,
+ struct vringh_kiov *wiov,
+ u16 *head,
+ gfp_t gfp)
+{
+ int err;
+
+ err = __vringh_get_head(vrh, getu16_kern, &vrh->last_avail_idx);
+ if (err < 0)
+ return err;
+
+ /* Empty... */
+ if (err == vrh->vring.num)
+ return 0;
+
+ *head = err;
+ err = __vringh_iov(vrh, *head, riov, wiov, no_range_check, NULL,
+ gfp, copydesc_kern);
+ if (err)
+ return err;
+
+ return 1;
+}
+EXPORT_SYMBOL(vringh_getdesc_kern);
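
The kernel-side flavour mirrors the userspace one, but with kvecs and direct memcpy; buf and kick_other_side() are hypothetical:

	struct kvec r_kvec[4], w_kvec[4];	/* arbitrary sizes */
	struct vringh_kiov riov = { .iov = r_kvec, .max_num = 4 };
	struct vringh_kiov wiov = { .iov = w_kvec, .max_num = 4 };
	u16 head;
	int err;

	err = vringh_getdesc_kern(&vrh, &riov, &wiov, &head, GFP_KERNEL);
	if (err == 1) {
		vringh_iov_pull_kern(&riov, buf, sizeof(buf));
		err = vringh_complete_kern(&vrh, head, 0);
		if (!err && vringh_need_notify_kern(&vrh) > 0)
			kick_other_side();
	}
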
+
+/**
+ * vringh_iov_pull_kern - copy bytes from vring_iov.
+ * @riov: the riov as passed to vringh_getdesc_kern() (updated as we consume)
+ * @dst: the place to copy.
+ * @len: the maximum length to copy.
+ *
+ * Returns the bytes copied <= len or a negative errno.
+ */
+ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len)
+{
+ return vringh_iov_xfer(riov, dst, len, xfer_kern);
+}
+EXPORT_SYMBOL(vringh_iov_pull_kern);
+
+/**
+ * vringh_iov_push_kern - copy bytes into vring_iov.
+ * @wiov: the wiov as passed to vringh_getdesc_kern() (updated as we consume)
+ * @src: the place to copy from.
+ * @len: the maximum length to copy.
+ *
+ * Returns the bytes copied <= len or a negative errno.
+ */
+ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov,
+ const void *src, size_t len)
+{
+ return vringh_iov_xfer(wiov, (void *)src, len, xfer_kern);
+}
+EXPORT_SYMBOL(vringh_iov_push_kern);
+
+/**
+ * vringh_abandon_kern - we've decided not to handle the descriptor(s).
+ * @vrh: the vring.
+ * @num: the number of descriptors to put back (i.e. the number of
+ * vringh_getdesc_kern() calls to undo).
+ *
+ * The next vringh_getdesc_kern() will return the old descriptor(s) again.
+ */
+void vringh_abandon_kern(struct vringh *vrh, unsigned int num)
+{
+ /* We only update vring_avail_event(vr) when we want to be notified,
+ * so we haven't changed that yet. */
+ vrh->last_avail_idx -= num;
+}
+EXPORT_SYMBOL(vringh_abandon_kern);
+
+/**
+ * vringh_complete_kern - we've finished with descriptor, publish it.
+ * @vrh: the vring.
+ * @head: the head as filled in by vringh_getdesc_kern.
+ * @len: the length of data we have written.
+ *
+ * You should check vringh_need_notify_kern() after one or more calls
+ * to this function.
+ */
+int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len)
+{
+ struct vring_used_elem used;
+
+ used.id = head;
+ used.len = len;
+
+ return __vringh_complete(vrh, &used, 1, putu16_kern, putused_kern);
+}
+EXPORT_SYMBOL(vringh_complete_kern);
+
+/**
+ * vringh_notify_enable_kern - we want to know if something changes.
+ * @vrh: the vring.
+ *
+ * This always enables notifications, but returns false if there are
+ * now more buffers available in the vring.
+ */
+bool vringh_notify_enable_kern(struct vringh *vrh)
+{
+ return __vringh_notify_enable(vrh, getu16_kern, putu16_kern);
+}
+EXPORT_SYMBOL(vringh_notify_enable_kern);
+
+/**
+ * vringh_notify_disable_kern - don't tell us if something changes.
+ * @vrh: the vring.
+ *
+ * This is our normal running state: we disable and then only enable when
+ * we're going to sleep.
+ */
+void vringh_notify_disable_kern(struct vringh *vrh)
+{
+ __vringh_notify_disable(vrh, putu16_kern);
+}
+EXPORT_SYMBOL(vringh_notify_disable_kern);
+
+/**
+ * vringh_need_notify_kern - must we tell the other side about used buffers?
+ * @vrh: the vring we've called vringh_complete_kern() on.
+ *
+ * Returns -errno or 0 if we don't need to tell the other side, 1 if we do.
+ */
+int vringh_need_notify_kern(struct vringh *vrh)
+{
+ return __vringh_need_notify(vrh, getu16_kern);
+}
+EXPORT_SYMBOL(vringh_need_notify_kern);
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 8dab163c5ef0..bd3ae324a1a2 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -108,7 +108,7 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
/* We should always be able to add one buffer to an empty queue. */
- if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0)
+ if (virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL) < 0)
BUG();
virtqueue_kick(vq);
@@ -256,7 +256,7 @@ static void stats_handle_request(struct virtio_balloon *vb)
if (!virtqueue_get_buf(vq, &len))
return;
sg_init_one(&sg, vb->stats, sizeof(vb->stats));
- if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0)
+ if (virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL) < 0)
BUG();
virtqueue_kick(vq);
}
@@ -341,7 +341,7 @@ static int init_vqs(struct virtio_balloon *vb)
* use it to signal us later.
*/
sg_init_one(&sg, vb->stats, sizeof vb->stats);
- if (virtqueue_add_buf(vb->stats_vq, &sg, 1, 0, vb, GFP_KERNEL)
+ if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL)
< 0)
BUG();
virtqueue_kick(vb->stats_vq);
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index ffd7e7da5d3b..5217baf5528c 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -24,27 +24,6 @@
#include <linux/module.h>
#include <linux/hrtimer.h>
-/* virtio guest is communicating with a virtual "device" that actually runs on
- * a host processor. Memory barriers are used to control SMP effects. */
-#ifdef CONFIG_SMP
-/* Where possible, use SMP barriers which are more lightweight than mandatory
- * barriers, because mandatory barriers control MMIO effects on accesses
- * through relaxed memory I/O windows (which virtio-pci does not use). */
-#define virtio_mb(vq) \
- do { if ((vq)->weak_barriers) smp_mb(); else mb(); } while(0)
-#define virtio_rmb(vq) \
- do { if ((vq)->weak_barriers) smp_rmb(); else rmb(); } while(0)
-#define virtio_wmb(vq) \
- do { if ((vq)->weak_barriers) smp_wmb(); else wmb(); } while(0)
-#else
-/* We must force memory ordering even if guest is UP since host could be
- * running on another CPU, but SMP barriers are defined to barrier() in that
- * configuration. So fall back to mandatory barriers instead. */
-#define virtio_mb(vq) mb()
-#define virtio_rmb(vq) rmb()
-#define virtio_wmb(vq) wmb()
-#endif
-
#ifdef DEBUG
/* For development, we want to crash whenever the ring is screwed. */
#define BAD_RING(_vq, fmt, args...) \
@@ -119,16 +98,36 @@ struct vring_virtqueue
#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
+static inline struct scatterlist *sg_next_chained(struct scatterlist *sg,
+ unsigned int *count)
+{
+ return sg_next(sg);
+}
+
+static inline struct scatterlist *sg_next_arr(struct scatterlist *sg,
+ unsigned int *count)
+{
+ if (--(*count) == 0)
+ return NULL;
+ return sg + 1;
+}
+
/* Set up an indirect table of descriptors and add it to the queue. */
-static int vring_add_indirect(struct vring_virtqueue *vq,
- struct scatterlist sg[],
- unsigned int out,
- unsigned int in,
- gfp_t gfp)
+static inline int vring_add_indirect(struct vring_virtqueue *vq,
+ struct scatterlist *sgs[],
+ struct scatterlist *(*next)
+ (struct scatterlist *, unsigned int *),
+ unsigned int total_sg,
+ unsigned int total_out,
+ unsigned int total_in,
+ unsigned int out_sgs,
+ unsigned int in_sgs,
+ gfp_t gfp)
{
struct vring_desc *desc;
unsigned head;
- int i;
+ struct scatterlist *sg;
+ int i, n;
/*
* We require lowmem mappings for the descriptors because
@@ -137,25 +136,31 @@ static int vring_add_indirect(struct vring_virtqueue *vq,
*/
gfp &= ~(__GFP_HIGHMEM | __GFP_HIGH);
- desc = kmalloc((out + in) * sizeof(struct vring_desc), gfp);
+ desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp);
if (!desc)
return -ENOMEM;
- /* Transfer entries from the sg list into the indirect page */
- for (i = 0; i < out; i++) {
- desc[i].flags = VRING_DESC_F_NEXT;
- desc[i].addr = sg_phys(sg);
- desc[i].len = sg->length;
- desc[i].next = i+1;
- sg++;
+ /* Transfer entries from the sg lists into the indirect page */
+ i = 0;
+ for (n = 0; n < out_sgs; n++) {
+ for (sg = sgs[n]; sg; sg = next(sg, &total_out)) {
+ desc[i].flags = VRING_DESC_F_NEXT;
+ desc[i].addr = sg_phys(sg);
+ desc[i].len = sg->length;
+ desc[i].next = i+1;
+ i++;
+ }
}
- for (; i < (out + in); i++) {
- desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
- desc[i].addr = sg_phys(sg);
- desc[i].len = sg->length;
- desc[i].next = i+1;
- sg++;
+ for (; n < (out_sgs + in_sgs); n++) {
+ for (sg = sgs[n]; sg; sg = next(sg, &total_in)) {
+ desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
+ desc[i].addr = sg_phys(sg);
+ desc[i].len = sg->length;
+ desc[i].next = i+1;
+ i++;
+ }
}
+ BUG_ON(i != total_sg);
/* Last one doesn't continue. */
desc[i-1].flags &= ~VRING_DESC_F_NEXT;
@@ -176,29 +181,20 @@ static int vring_add_indirect(struct vring_virtqueue *vq,
return head;
}
-/**
- * virtqueue_add_buf - expose buffer to other end
- * @vq: the struct virtqueue we're talking about.
- * @sg: the description of the buffer(s).
- * @out_num: the number of sg readable by other side
- * @in_num: the number of sg which are writable (after readable ones)
- * @data: the token identifying the buffer.
- * @gfp: how to do memory allocations (if necessary).
- *
- * Caller must ensure we don't call this with other virtqueue operations
- * at the same time (except where noted).
- *
- * Returns zero or a negative error (ie. ENOSPC, ENOMEM).
- */
-int virtqueue_add_buf(struct virtqueue *_vq,
- struct scatterlist sg[],
- unsigned int out,
- unsigned int in,
- void *data,
- gfp_t gfp)
+static inline int virtqueue_add(struct virtqueue *_vq,
+ struct scatterlist *sgs[],
+ struct scatterlist *(*next)
+ (struct scatterlist *, unsigned int *),
+ unsigned int total_out,
+ unsigned int total_in,
+ unsigned int out_sgs,
+ unsigned int in_sgs,
+ void *data,
+ gfp_t gfp)
{
struct vring_virtqueue *vq = to_vvq(_vq);
- unsigned int i, avail, uninitialized_var(prev);
+ struct scatterlist *sg;
+ unsigned int i, n, avail, uninitialized_var(prev), total_sg;
int head;
START_USE(vq);
@@ -218,46 +214,54 @@ int virtqueue_add_buf(struct virtqueue *_vq,
}
#endif
+ total_sg = total_in + total_out;
+
/* If the host supports indirect descriptor tables, and we have multiple
* buffers, then go indirect. FIXME: tune this threshold */
- if (vq->indirect && (out + in) > 1 && vq->vq.num_free) {
- head = vring_add_indirect(vq, sg, out, in, gfp);
+ if (vq->indirect && total_sg > 1 && vq->vq.num_free) {
+ head = vring_add_indirect(vq, sgs, next, total_sg, total_out,
+ total_in,
+ out_sgs, in_sgs, gfp);
if (likely(head >= 0))
goto add_head;
}
- BUG_ON(out + in > vq->vring.num);
- BUG_ON(out + in == 0);
+ BUG_ON(total_sg > vq->vring.num);
+ BUG_ON(total_sg == 0);
- if (vq->vq.num_free < out + in) {
+ if (vq->vq.num_free < total_sg) {
pr_debug("Can't add buf len %i - avail = %i\n",
- out + in, vq->vq.num_free);
+ total_sg, vq->vq.num_free);
/* FIXME: for historical reasons, we force a notify here if
* there are outgoing parts to the buffer. Presumably the
* host should service the ring ASAP. */
- if (out)
+ if (out_sgs)
vq->notify(&vq->vq);
END_USE(vq);
return -ENOSPC;
}
/* We're about to use some buffers from the free list. */
- vq->vq.num_free -= out + in;
-
- head = vq->free_head;
- for (i = vq->free_head; out; i = vq->vring.desc[i].next, out--) {
- vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
- vq->vring.desc[i].addr = sg_phys(sg);
- vq->vring.desc[i].len = sg->length;
- prev = i;
- sg++;
+ vq->vq.num_free -= total_sg;
+
+ head = i = vq->free_head;
+ for (n = 0; n < out_sgs; n++) {
+ for (sg = sgs[n]; sg; sg = next(sg, &total_out)) {
+ vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
+ vq->vring.desc[i].addr = sg_phys(sg);
+ vq->vring.desc[i].len = sg->length;
+ prev = i;
+ i = vq->vring.desc[i].next;
+ }
}
- for (; in; i = vq->vring.desc[i].next, in--) {
- vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
- vq->vring.desc[i].addr = sg_phys(sg);
- vq->vring.desc[i].len = sg->length;
- prev = i;
- sg++;
+ for (; n < (out_sgs + in_sgs); n++) {
+ for (sg = sgs[n]; sg; sg = next(sg, &total_in)) {
+ vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
+ vq->vring.desc[i].addr = sg_phys(sg);
+ vq->vring.desc[i].len = sg->length;
+ prev = i;
+ i = vq->vring.desc[i].next;
+ }
}
/* Last one doesn't continue. */
vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT;
@@ -276,7 +280,7 @@ add_head:
/* Descriptors and available array need to be set before we expose the
* new available array entries. */
- virtio_wmb(vq);
+ virtio_wmb(vq->weak_barriers);
vq->vring.avail->idx++;
vq->num_added++;
@@ -290,9 +294,122 @@ add_head:
return 0;
}
+
+/**
+ * virtqueue_add_buf - expose buffer to other end
+ * @vq: the struct virtqueue we're talking about.
+ * @sg: the description of the buffer(s).
+ * @out: the number of sg readable by other side
+ * @in: the number of sg which are writable (after readable ones)
+ * @data: the token identifying the buffer.
+ * @gfp: how to do memory allocations (if necessary).
+ *
+ * Caller must ensure we don't call this with other virtqueue operations
+ * at the same time (except where noted).
+ *
+ * Returns zero or a negative error (ie. ENOSPC, ENOMEM).
+ */
+int virtqueue_add_buf(struct virtqueue *_vq,
+ struct scatterlist sg[],
+ unsigned int out,
+ unsigned int in,
+ void *data,
+ gfp_t gfp)
+{
+ struct scatterlist *sgs[2];
+
+ sgs[0] = sg;
+ sgs[1] = sg + out;
+
+ return virtqueue_add(_vq, sgs, sg_next_arr,
+ out, in, out ? 1 : 0, in ? 1 : 0, data, gfp);
+}
EXPORT_SYMBOL_GPL(virtqueue_add_buf);
/**
+ * virtqueue_add_sgs - expose buffers to other end
+ * @vq: the struct virtqueue we're talking about.
+ * @sgs: array of terminated scatterlists.
+ * @out_sgs: the number of scatterlists readable by other side
+ * @in_sgs: the number of scatterlists which are writable (after readable ones)
+ * @data: the token identifying the buffer.
+ * @gfp: how to do memory allocations (if necessary).
+ *
+ * Caller must ensure we don't call this with other virtqueue operations
+ * at the same time (except where noted).
+ *
+ * Returns zero or a negative error (ie. ENOSPC, ENOMEM).
+ */
+int virtqueue_add_sgs(struct virtqueue *_vq,
+ struct scatterlist *sgs[],
+ unsigned int out_sgs,
+ unsigned int in_sgs,
+ void *data,
+ gfp_t gfp)
+{
+ unsigned int i, total_out, total_in;
+
+ /* Count them first. */
+ for (i = total_out = total_in = 0; i < out_sgs; i++) {
+ struct scatterlist *sg;
+ for (sg = sgs[i]; sg; sg = sg_next(sg))
+ total_out++;
+ }
+ for (; i < out_sgs + in_sgs; i++) {
+ struct scatterlist *sg;
+ for (sg = sgs[i]; sg; sg = sg_next(sg))
+ total_in++;
+ }
+ return virtqueue_add(_vq, sgs, sg_next_chained,
+ total_out, total_in, out_sgs, in_sgs, data, gfp);
+}
+EXPORT_SYMBOL_GPL(virtqueue_add_sgs);
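
For instance, a request made of a readable header, a readable payload and a writable status byte goes out as three scatterlists; the fields of req are hypothetical:

	struct scatterlist hdr, data, status, *sgs[3];
	int err;

	sg_init_one(&hdr, &req->hdr, sizeof(req->hdr));
	sg_init_one(&data, req->buf, req->buf_len);
	sg_init_one(&status, &req->status, sizeof(req->status));
	sgs[0] = &hdr;
	sgs[1] = &data;
	sgs[2] = &status;

	err = virtqueue_add_sgs(vq, sgs, 2, 1, req, GFP_ATOMIC);
	if (err)
		return err;	/* e.g. -ENOSPC when the ring is full */
	virtqueue_kick(vq);
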
+
+/**
+ * virtqueue_add_outbuf - expose output buffers to other end
+ * @vq: the struct virtqueue we're talking about.
+ * @sg: scatterlist (need not be terminated!)
+ * @num: the number of entries in @sg readable by other side
+ * @data: the token identifying the buffer.
+ * @gfp: how to do memory allocations (if necessary).
+ *
+ * Caller must ensure we don't call this with other virtqueue operations
+ * at the same time (except where noted).
+ *
+ * Returns zero or a negative error (ie. ENOSPC, ENOMEM).
+ */
+int virtqueue_add_outbuf(struct virtqueue *vq,
+ struct scatterlist sg[], unsigned int num,
+ void *data,
+ gfp_t gfp)
+{
+ return virtqueue_add(vq, &sg, sg_next_arr, num, 0, 1, 0, data, gfp);
+}
+EXPORT_SYMBOL_GPL(virtqueue_add_outbuf);
+
+/**
+ * virtqueue_add_inbuf - expose input buffers to other end
+ * @vq: the struct virtqueue we're talking about.
+ * @sg: scatterlist (need not be terminated!)
+ * @num: the number of entries in @sg writable by other side
+ * @data: the token identifying the buffer.
+ * @gfp: how to do memory allocations (if necessary).
+ *
+ * Caller must ensure we don't call this with other virtqueue operations
+ * at the same time (except where noted).
+ *
+ * Returns zero or a negative error (ie. ENOSPC, ENOMEM).
+ */
+int virtqueue_add_inbuf(struct virtqueue *vq,
+ struct scatterlist sg[], unsigned int num,
+ void *data,
+ gfp_t gfp)
+{
+ return virtqueue_add(vq, &sg, sg_next_arr, 0, num, 0, 1, data, gfp);
+}
+EXPORT_SYMBOL_GPL(virtqueue_add_inbuf);
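
The single-direction helpers cover the common cases, e.g. refilling a receive queue or queueing a device-readable message; buf/msg and their lengths are hypothetical:

	struct scatterlist sg;
	int err;

	/* Buffer the device will write into (e.g. an rx refill). */
	sg_init_one(&sg, buf, buf_len);
	err = virtqueue_add_inbuf(vq, &sg, 1, buf, GFP_ATOMIC);

	/* Buffer the device will only read. */
	sg_init_one(&sg, msg, msg_len);
	err = virtqueue_add_outbuf(vq, &sg, 1, msg, GFP_ATOMIC);
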
+
+/**
* virtqueue_kick_prepare - first half of split virtqueue_kick call.
* @vq: the struct virtqueue
*
@@ -312,7 +429,7 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq)
START_USE(vq);
/* We need to expose available array entries before checking avail
* event. */
- virtio_mb(vq);
+ virtio_mb(vq->weak_barriers);
old = vq->vring.avail->idx - vq->num_added;
new = vq->vring.avail->idx;
@@ -436,7 +553,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
}
/* Only get used array entries after they have been exposed by host. */
- virtio_rmb(vq);
+ virtio_rmb(vq->weak_barriers);
last_used = (vq->last_used_idx & (vq->vring.num - 1));
i = vq->vring.used->ring[last_used].id;
@@ -460,7 +577,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
* the read in the next get_buf call. */
if (!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
vring_used_event(&vq->vring) = vq->last_used_idx;
- virtio_mb(vq);
+ virtio_mb(vq->weak_barriers);
}
#ifdef DEBUG
@@ -513,7 +630,7 @@ bool virtqueue_enable_cb(struct virtqueue *_vq)
* entry. Always do both to keep code simple. */
vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
vring_used_event(&vq->vring) = vq->last_used_idx;
- virtio_mb(vq);
+ virtio_mb(vq->weak_barriers);
if (unlikely(more_used(vq))) {
END_USE(vq);
return false;
@@ -553,7 +670,7 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
/* TODO: tune this threshold */
bufs = (u16)(vq->vring.avail->idx - vq->last_used_idx) * 3 / 4;
vring_used_event(&vq->vring) = vq->last_used_idx + bufs;
- virtio_mb(vq);
+ virtio_mb(vq->weak_barriers);
if (unlikely((u16)(vq->vring.used->idx - vq->last_used_idx) > bufs)) {
END_USE(vq);
return false;