summaryrefslogtreecommitdiffstats
path: root/drivers/vhost
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-12-13 03:07:07 +0100
committerLinus Torvalds <torvalds@linux-foundation.org>2012-12-13 03:07:07 +0100
commit6be35c700f742e911ecedd07fcc43d4439922334 (patch)
treeca9f37214d204465fcc2d79c82efd291e357c53c /drivers/vhost
parentMerge tag 'for-linus-20121212' of git://git.kernel.org/pub/scm/linux/kernel/g... (diff)
parentnet/mlx4_en: Add support for destination MAC in steering rules (diff)
downloadlinux-6be35c700f742e911ecedd07fcc43d4439922334.tar.xz
linux-6be35c700f742e911ecedd07fcc43d4439922334.zip
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking changes from David Miller: 1) Allow to dump, monitor, and change the bridge multicast database using netlink. From Cong Wang. 2) RFC 5961 TCP blind data injection attack mitigation, from Eric Dumazet. 3) Networking user namespace support from Eric W. Biederman. 4) tuntap/virtio-net multiqueue support by Jason Wang. 5) Support for checksum offload of encapsulated packets (basically, tunneled traffic can still be checksummed by HW). From Joseph Gasparakis. 6) Allow BPF filter access to VLAN tags, from Eric Dumazet and Daniel Borkmann. 7) Bridge port parameters over netlink and BPDU blocking support from Stephen Hemminger. 8) Improve data access patterns during inet socket demux by rearranging socket layout, from Eric Dumazet. 9) TIPC protocol updates and cleanups from Ying Xue, Paul Gortmaker, and Jon Maloy. 10) Update TCP socket hash sizing to be more in line with current day realities. The existing heurstics were choosen a decade ago. From Eric Dumazet. 11) Fix races, queue bloat, and excessive wakeups in ATM and associated drivers, from Krzysztof Mazur and David Woodhouse. 12) Support DOVE (Distributed Overlay Virtual Ethernet) extensions in VXLAN driver, from David Stevens. 13) Add "oops_only" mode to netconsole, from Amerigo Wang. 14) Support set and query of VEB/VEPA bridge mode via PF_BRIDGE, also allow DCB netlink to work on namespaces other than the initial namespace. From John Fastabend. 15) Support PTP in the Tigon3 driver, from Matt Carlson. 16) tun/vhost zero copy fixes and improvements, plus turn it on by default, from Michael S. Tsirkin. 17) Support per-association statistics in SCTP, from Michele Baldessari. And many, many, driver updates, cleanups, and improvements. Too numerous to mention individually. * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1722 commits) net/mlx4_en: Add support for destination MAC in steering rules net/mlx4_en: Use generic etherdevice.h functions. net: ethtool: Add destination MAC address to flow steering API bridge: add support of adding and deleting mdb entries bridge: notify mdb changes via netlink ndisc: Unexport ndisc_{build,send}_skb(). uapi: add missing netconf.h to export list pkt_sched: avoid requeues if possible solos-pci: fix double-free of TX skb in DMA mode bnx2: Fix accidental reversions. bna: Driver Version Updated to 3.1.2.1 bna: Firmware update bna: Add RX State bna: Rx Page Based Allocation bna: TX Intr Coalescing Fix bna: Tx and Rx Optimizations bna: Code Cleanup and Enhancements ath9k: check pdata variable before dereferencing it ath5k: RX timestamp is reported at end of frame ath9k_htc: RX timestamp is reported at end of frame ...
Diffstat (limited to 'drivers/vhost')
-rw-r--r--drivers/vhost/net.c147
-rw-r--r--drivers/vhost/tcm_vhost.c9
-rw-r--r--drivers/vhost/vhost.c59
-rw-r--r--drivers/vhost/vhost.h14
4 files changed, 154 insertions, 75 deletions
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 7f93f34b7f91..ebd08b21b234 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -30,9 +30,10 @@
#include "vhost.h"
-static int experimental_zcopytx;
+static int experimental_zcopytx = 1;
module_param(experimental_zcopytx, int, 0444);
-MODULE_PARM_DESC(experimental_zcopytx, "Enable Experimental Zero Copy TX");
+MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
+ " 1 -Enable; 0 - Disable");
/* Max number of bytes transferred before requeueing the job.
* Using this limit prevents one virtqueue from starving others. */
@@ -42,6 +43,21 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Experimental Zero Copy TX");
#define VHOST_MAX_PEND 128
#define VHOST_GOODCOPY_LEN 256
+/*
+ * For transmit, used buffer len is unused; we override it to track buffer
+ * status internally; used for zerocopy tx only.
+ */
+/* Lower device DMA failed */
+#define VHOST_DMA_FAILED_LEN 3
+/* Lower device DMA done */
+#define VHOST_DMA_DONE_LEN 2
+/* Lower device DMA in progress */
+#define VHOST_DMA_IN_PROGRESS 1
+/* Buffer unused */
+#define VHOST_DMA_CLEAR_LEN 0
+
+#define VHOST_DMA_IS_DONE(len) ((len) >= VHOST_DMA_DONE_LEN)
+
enum {
VHOST_NET_VQ_RX = 0,
VHOST_NET_VQ_TX = 1,
@@ -62,8 +78,39 @@ struct vhost_net {
* We only do this when socket buffer fills up.
* Protected by tx vq lock. */
enum vhost_net_poll_state tx_poll_state;
+ /* Number of TX recently submitted.
+ * Protected by tx vq lock. */
+ unsigned tx_packets;
+ /* Number of times zerocopy TX recently failed.
+ * Protected by tx vq lock. */
+ unsigned tx_zcopy_err;
+ /* Flush in progress. Protected by tx vq lock. */
+ bool tx_flush;
};
+static void vhost_net_tx_packet(struct vhost_net *net)
+{
+ ++net->tx_packets;
+ if (net->tx_packets < 1024)
+ return;
+ net->tx_packets = 0;
+ net->tx_zcopy_err = 0;
+}
+
+static void vhost_net_tx_err(struct vhost_net *net)
+{
+ ++net->tx_zcopy_err;
+}
+
+static bool vhost_net_tx_select_zcopy(struct vhost_net *net)
+{
+ /* TX flush waits for outstanding DMAs to be done.
+ * Don't start new DMAs.
+ */
+ return !net->tx_flush &&
+ net->tx_packets / 64 >= net->tx_zcopy_err;
+}
+
static bool vhost_sock_zcopy(struct socket *sock)
{
return unlikely(experimental_zcopytx) &&
@@ -126,6 +173,55 @@ static void tx_poll_start(struct vhost_net *net, struct socket *sock)
net->tx_poll_state = VHOST_NET_POLL_STARTED;
}
+/* In case of DMA done not in order in lower device driver for some reason.
+ * upend_idx is used to track end of used idx, done_idx is used to track head
+ * of used idx. Once lower device DMA done contiguously, we will signal KVM
+ * guest used idx.
+ */
+static int vhost_zerocopy_signal_used(struct vhost_net *net,
+ struct vhost_virtqueue *vq)
+{
+ int i;
+ int j = 0;
+
+ for (i = vq->done_idx; i != vq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
+ if (vq->heads[i].len == VHOST_DMA_FAILED_LEN)
+ vhost_net_tx_err(net);
+ if (VHOST_DMA_IS_DONE(vq->heads[i].len)) {
+ vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
+ vhost_add_used_and_signal(vq->dev, vq,
+ vq->heads[i].id, 0);
+ ++j;
+ } else
+ break;
+ }
+ if (j)
+ vq->done_idx = i;
+ return j;
+}
+
+static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
+{
+ struct vhost_ubuf_ref *ubufs = ubuf->ctx;
+ struct vhost_virtqueue *vq = ubufs->vq;
+ int cnt = atomic_read(&ubufs->kref.refcount);
+
+ /*
+ * Trigger polling thread if guest stopped submitting new buffers:
+ * in this case, the refcount after decrement will eventually reach 1
+ * so here it is 2.
+ * We also trigger polling periodically after each 16 packets
+ * (the value 16 here is more or less arbitrary, it's tuned to trigger
+ * less than 10% of times).
+ */
+ if (cnt <= 2 || !(cnt % 16))
+ vhost_poll_queue(&vq->poll);
+ /* set len to mark this desc buffers done DMA */
+ vq->heads[ubuf->desc].len = success ?
+ VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
+ vhost_ubuf_put(ubufs);
+}
+
/* Expects to be always run from workqueue - which acts as
* read-size critical section for our kind of RCU. */
static void handle_tx(struct vhost_net *net)
@@ -146,7 +242,7 @@ static void handle_tx(struct vhost_net *net)
size_t hdr_size;
struct socket *sock;
struct vhost_ubuf_ref *uninitialized_var(ubufs);
- bool zcopy;
+ bool zcopy, zcopy_used;
/* TODO: check that we are running from vhost_worker? */
sock = rcu_dereference_check(vq->private_data, 1);
@@ -172,7 +268,7 @@ static void handle_tx(struct vhost_net *net)
for (;;) {
/* Release DMAs done buffers first */
if (zcopy)
- vhost_zerocopy_signal_used(vq);
+ vhost_zerocopy_signal_used(net, vq);
head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
ARRAY_SIZE(vq->iov),
@@ -224,10 +320,14 @@ static void handle_tx(struct vhost_net *net)
iov_length(vq->hdr, s), hdr_size);
break;
}
+ zcopy_used = zcopy && (len >= VHOST_GOODCOPY_LEN ||
+ vq->upend_idx != vq->done_idx);
+
/* use msg_control to pass vhost zerocopy ubuf info to skb */
- if (zcopy) {
+ if (zcopy_used) {
vq->heads[vq->upend_idx].id = head;
- if (len < VHOST_GOODCOPY_LEN) {
+ if (!vhost_net_tx_select_zcopy(net) ||
+ len < VHOST_GOODCOPY_LEN) {
/* copy don't need to wait for DMA done */
vq->heads[vq->upend_idx].len =
VHOST_DMA_DONE_LEN;
@@ -237,7 +337,8 @@ static void handle_tx(struct vhost_net *net)
} else {
struct ubuf_info *ubuf = &vq->ubuf_info[head];
- vq->heads[vq->upend_idx].len = len;
+ vq->heads[vq->upend_idx].len =
+ VHOST_DMA_IN_PROGRESS;
ubuf->callback = vhost_zerocopy_callback;
ubuf->ctx = vq->ubufs;
ubuf->desc = vq->upend_idx;
@@ -251,7 +352,7 @@ static void handle_tx(struct vhost_net *net)
/* TODO: Check specific error and bomb out unless ENOBUFS? */
err = sock->ops->sendmsg(NULL, sock, &msg, len);
if (unlikely(err < 0)) {
- if (zcopy) {
+ if (zcopy_used) {
if (ubufs)
vhost_ubuf_put(ubufs);
vq->upend_idx = ((unsigned)vq->upend_idx - 1) %
@@ -265,11 +366,12 @@ static void handle_tx(struct vhost_net *net)
if (err != len)
pr_debug("Truncated TX packet: "
" len %d != %zd\n", err, len);
- if (!zcopy)
+ if (!zcopy_used)
vhost_add_used_and_signal(&net->dev, vq, head, 0);
else
- vhost_zerocopy_signal_used(vq);
+ vhost_zerocopy_signal_used(net, vq);
total_len += len;
+ vhost_net_tx_packet(net);
if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
vhost_poll_queue(&vq->poll);
break;
@@ -587,6 +689,17 @@ static void vhost_net_flush(struct vhost_net *n)
{
vhost_net_flush_vq(n, VHOST_NET_VQ_TX);
vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
+ if (n->dev.vqs[VHOST_NET_VQ_TX].ubufs) {
+ mutex_lock(&n->dev.vqs[VHOST_NET_VQ_TX].mutex);
+ n->tx_flush = true;
+ mutex_unlock(&n->dev.vqs[VHOST_NET_VQ_TX].mutex);
+ /* Wait for all lower device DMAs done. */
+ vhost_ubuf_put_and_wait(n->dev.vqs[VHOST_NET_VQ_TX].ubufs);
+ mutex_lock(&n->dev.vqs[VHOST_NET_VQ_TX].mutex);
+ n->tx_flush = false;
+ kref_init(&n->dev.vqs[VHOST_NET_VQ_TX].ubufs->kref);
+ mutex_unlock(&n->dev.vqs[VHOST_NET_VQ_TX].mutex);
+ }
}
static int vhost_net_release(struct inode *inode, struct file *f)
@@ -597,6 +710,7 @@ static int vhost_net_release(struct inode *inode, struct file *f)
vhost_net_stop(n, &tx_sock, &rx_sock);
vhost_net_flush(n);
+ vhost_dev_stop(&n->dev);
vhost_dev_cleanup(&n->dev, false);
if (tx_sock)
fput(tx_sock->file);
@@ -722,6 +836,10 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
r = vhost_init_used(vq);
if (r)
goto err_vq;
+
+ n->tx_packets = 0;
+ n->tx_zcopy_err = 0;
+ n->tx_flush = false;
}
mutex_unlock(&vq->mutex);
@@ -729,7 +847,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
if (oldubufs) {
vhost_ubuf_put_and_wait(oldubufs);
mutex_lock(&vq->mutex);
- vhost_zerocopy_signal_used(vq);
+ vhost_zerocopy_signal_used(n, vq);
mutex_unlock(&vq->mutex);
}
@@ -838,8 +956,11 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
return vhost_net_reset_owner(n);
default:
mutex_lock(&n->dev.mutex);
- r = vhost_dev_ioctl(&n->dev, ioctl, arg);
- vhost_net_flush(n);
+ r = vhost_dev_ioctl(&n->dev, ioctl, argp);
+ if (r == -ENOIOCTLCMD)
+ r = vhost_vring_ioctl(&n->dev, ioctl, argp);
+ else
+ vhost_net_flush(n);
mutex_unlock(&n->dev.mutex);
return r;
}
diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c
index aa31692064dd..79e7e4d45eb2 100644
--- a/drivers/vhost/tcm_vhost.c
+++ b/drivers/vhost/tcm_vhost.c
@@ -34,7 +34,6 @@
#include <linux/ctype.h>
#include <linux/compat.h>
#include <linux/eventfd.h>
-#include <linux/vhost.h>
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <asm/unaligned.h>
@@ -415,14 +414,12 @@ static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd(
{
struct tcm_vhost_cmd *tv_cmd;
struct tcm_vhost_nexus *tv_nexus;
- struct se_session *se_sess;
tv_nexus = tv_tpg->tpg_nexus;
if (!tv_nexus) {
pr_err("Unable to locate active struct tcm_vhost_nexus\n");
return ERR_PTR(-EIO);
}
- se_sess = tv_nexus->tvn_se_sess;
tv_cmd = kzalloc(sizeof(struct tcm_vhost_cmd), GFP_ATOMIC);
if (!tv_cmd) {
@@ -895,6 +892,7 @@ static int vhost_scsi_release(struct inode *inode, struct file *f)
vhost_scsi_clear_endpoint(s, &backend);
}
+ vhost_dev_stop(&s->dev);
vhost_dev_cleanup(&s->dev, false);
kfree(s);
return 0;
@@ -970,7 +968,10 @@ static long vhost_scsi_ioctl(struct file *f, unsigned int ioctl,
return vhost_scsi_set_features(vs, features);
default:
mutex_lock(&vs->dev.mutex);
- r = vhost_dev_ioctl(&vs->dev, ioctl, arg);
+ r = vhost_dev_ioctl(&vs->dev, ioctl, argp);
+ /* TODO: flush backend after dev ioctl. */
+ if (r == -ENOIOCTLCMD)
+ r = vhost_vring_ioctl(&vs->dev, ioctl, argp);
mutex_unlock(&vs->dev.mutex);
return r;
}
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index dedaf81d8f36..34389f75fe65 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -26,10 +26,6 @@
#include <linux/kthread.h>
#include <linux/cgroup.h>
-#include <linux/net.h>
-#include <linux/if_packet.h>
-#include <linux/if_arp.h>
-
#include "vhost.h"
enum {
@@ -414,28 +410,16 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
return 0;
}
-/* In case of DMA done not in order in lower device driver for some reason.
- * upend_idx is used to track end of used idx, done_idx is used to track head
- * of used idx. Once lower device DMA done contiguously, we will signal KVM
- * guest used idx.
- */
-int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq)
+void vhost_dev_stop(struct vhost_dev *dev)
{
int i;
- int j = 0;
-
- for (i = vq->done_idx; i != vq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
- if ((vq->heads[i].len == VHOST_DMA_DONE_LEN)) {
- vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
- vhost_add_used_and_signal(vq->dev, vq,
- vq->heads[i].id, 0);
- ++j;
- } else
- break;
+
+ for (i = 0; i < dev->nvqs; ++i) {
+ if (dev->vqs[i].kick && dev->vqs[i].handle_kick) {
+ vhost_poll_stop(&dev->vqs[i].poll);
+ vhost_poll_flush(&dev->vqs[i].poll);
+ }
}
- if (j)
- vq->done_idx = i;
- return j;
}
/* Caller should have device mutex if and only if locked is set */
@@ -444,17 +428,6 @@ void vhost_dev_cleanup(struct vhost_dev *dev, bool locked)
int i;
for (i = 0; i < dev->nvqs; ++i) {
- if (dev->vqs[i].kick && dev->vqs[i].handle_kick) {
- vhost_poll_stop(&dev->vqs[i].poll);
- vhost_poll_flush(&dev->vqs[i].poll);
- }
- /* Wait for all lower device DMAs done. */
- if (dev->vqs[i].ubufs)
- vhost_ubuf_put_and_wait(dev->vqs[i].ubufs);
-
- /* Signal guest as appropriate. */
- vhost_zerocopy_signal_used(&dev->vqs[i]);
-
if (dev->vqs[i].error_ctx)
eventfd_ctx_put(dev->vqs[i].error_ctx);
if (dev->vqs[i].error)
@@ -634,7 +607,7 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
return 0;
}
-static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
+long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp)
{
struct file *eventfp, *filep = NULL;
bool pollstart = false, pollstop = false;
@@ -829,9 +802,8 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
}
/* Caller must have device mutex */
-long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg)
+long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
{
- void __user *argp = (void __user *)arg;
struct file *eventfp, *filep = NULL;
struct eventfd_ctx *ctx = NULL;
u64 p;
@@ -902,7 +874,7 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg)
fput(filep);
break;
default:
- r = vhost_set_vring(d, ioctl, argp);
+ r = -ENOIOCTLCMD;
break;
}
done:
@@ -1599,14 +1571,3 @@ void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *ubufs)
wait_event(ubufs->wait, !atomic_read(&ubufs->kref.refcount));
kfree(ubufs);
}
-
-void vhost_zerocopy_callback(struct ubuf_info *ubuf)
-{
- struct vhost_ubuf_ref *ubufs = ubuf->ctx;
- struct vhost_virtqueue *vq = ubufs->vq;
-
- vhost_poll_queue(&vq->poll);
- /* set len = 1 to mark this desc buffers done DMA */
- vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
- kref_put(&ubufs->kref, vhost_zerocopy_done_signal);
-}
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 1125af3d27d1..2639c58b23ab 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -7,17 +7,11 @@
#include <linux/mutex.h>
#include <linux/poll.h>
#include <linux/file.h>
-#include <linux/skbuff.h>
#include <linux/uio.h>
#include <linux/virtio_config.h>
#include <linux/virtio_ring.h>
#include <linux/atomic.h>
-/* This is for zerocopy, used buffer len is set to 1 when lower device DMA
- * done */
-#define VHOST_DMA_DONE_LEN 1
-#define VHOST_DMA_CLEAR_LEN 0
-
struct vhost_device;
struct vhost_work;
@@ -70,6 +64,8 @@ struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *, bool zcopy);
void vhost_ubuf_put(struct vhost_ubuf_ref *);
void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *);
+struct ubuf_info;
+
/* The virtqueue structure describes a queue attached to a device. */
struct vhost_virtqueue {
struct vhost_dev *dev;
@@ -167,7 +163,9 @@ long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs);
long vhost_dev_check_owner(struct vhost_dev *);
long vhost_dev_reset_owner(struct vhost_dev *);
void vhost_dev_cleanup(struct vhost_dev *, bool locked);
-long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, unsigned long arg);
+void vhost_dev_stop(struct vhost_dev *);
+long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp);
+long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp);
int vhost_vq_access_ok(struct vhost_virtqueue *vq);
int vhost_log_access_ok(struct vhost_dev *);
@@ -191,8 +189,6 @@ bool vhost_enable_notify(struct vhost_dev *, struct vhost_virtqueue *);
int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
unsigned int log_num, u64 len);
-void vhost_zerocopy_callback(struct ubuf_info *);
-int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq);
#define vq_err(vq, fmt, ...) do { \
pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \