summaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2016-07-01 11:32:28 +0200
committerDavid S. Miller <davem@davemloft.net>2016-07-01 11:32:28 +0200
commitbeb528d01f887bbeeaceb601b6d346d07a338d07 (patch)
tree20949c1b4243a0e746570b3f08483b40c183ae57 /drivers
parentMerge branch 'sch_hfsc-fixes-cleanups' (diff)
parenttun: switch to use skb array for tx (diff)
downloadlinux-beb528d01f887bbeeaceb601b6d346d07a338d07.tar.xz
linux-beb528d01f887bbeeaceb601b6d346d07a338d07.zip
Merge branch 'tun-skb_array'
Jason Wang says: ==================== switch to use tx skb array in tun This series tries to switch to use skb array in tun. This is used to eliminate the spinlock contention between producer and consumer. The conversion was straightforward: just introdce a tx skb array and use it instead of sk_receive_queue. A minor issue is to keep the tx_queue_len behaviour, since tun used to use it for the length of sk_receive_queue. This is done through: - add the ability to resize multiple rings at once to avoid handling partial resize failure for mutiple rings. - add the support for zero length ring. - introduce a notifier which was triggered when tx_queue_len was changed for a netdev. - resize all queues during the tx_queue_len changing. Tests shows about 15% improvement on guest rx pps: Before: ~1300000pps After : ~1500000pps Changes from V3: - fix kbuild warnings - call NETDEV_CHANGE_TX_QUEUE_LEN on IFLA_TXQLEN Changes from V2: - add multiple rings resizing support for ptr_ring/skb_array - add zero length ring support - introdce a NETDEV_CHANGE_TX_QUEUE_LEN - drop new flags Changes from V1: - switch to use skb array instead of a customized circular buffer - add non-blocking support - rename .peek to .peek_len - drop lockless peeking since test show very minor improvement ==================== Acked-by: Michael S. Tsirkin <mst@redhat.com> Acked-from-altitude: 34697 feet. Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers')
-rw-r--r--drivers/net/tun.c138
-rw-r--r--drivers/vhost/net.c16
2 files changed, 145 insertions, 9 deletions
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 4884802e0af1..74752159ec34 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -71,6 +71,7 @@
#include <net/sock.h>
#include <linux/seq_file.h>
#include <linux/uio.h>
+#include <linux/skb_array.h>
#include <asm/uaccess.h>
@@ -167,6 +168,7 @@ struct tun_file {
};
struct list_head next;
struct tun_struct *detached;
+ struct skb_array tx_array;
};
struct tun_flow_entry {
@@ -515,7 +517,11 @@ static struct tun_struct *tun_enable_queue(struct tun_file *tfile)
static void tun_queue_purge(struct tun_file *tfile)
{
- skb_queue_purge(&tfile->sk.sk_receive_queue);
+ struct sk_buff *skb;
+
+ while ((skb = skb_array_consume(&tfile->tx_array)) != NULL)
+ kfree_skb(skb);
+
skb_queue_purge(&tfile->sk.sk_error_queue);
}
@@ -560,6 +566,8 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
tun->dev->reg_state == NETREG_REGISTERED)
unregister_netdevice(tun->dev);
}
+ if (tun)
+ skb_array_cleanup(&tfile->tx_array);
sock_put(&tfile->sk);
}
}
@@ -613,6 +621,7 @@ static void tun_detach_all(struct net_device *dev)
static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter)
{
struct tun_file *tfile = file->private_data;
+ struct net_device *dev = tun->dev;
int err;
err = security_tun_dev_attach(tfile->socket.sk, tun->security);
@@ -642,6 +651,13 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte
if (!err)
goto out;
}
+
+ if (!tfile->detached &&
+ skb_array_init(&tfile->tx_array, dev->tx_queue_len, GFP_KERNEL)) {
+ err = -ENOMEM;
+ goto out;
+ }
+
tfile->queue_index = tun->numqueues;
tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
rcu_assign_pointer(tfile->tun, tun);
@@ -891,8 +907,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
nf_reset(skb);
- /* Enqueue packet */
- skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb);
+ if (skb_array_produce(&tfile->tx_array, skb))
+ goto drop;
/* Notify and wake up reader process */
if (tfile->flags & TUN_FASYNC)
@@ -1107,7 +1123,7 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
poll_wait(file, sk_sleep(sk), wait);
- if (!skb_queue_empty(&sk->sk_receive_queue))
+ if (!skb_array_empty(&tfile->tx_array))
mask |= POLLIN | POLLRDNORM;
if (sock_writeable(sk) ||
@@ -1426,22 +1442,61 @@ done:
return total;
}
+static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock,
+ int *err)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ struct sk_buff *skb = NULL;
+
+ skb = skb_array_consume(&tfile->tx_array);
+ if (skb)
+ goto out;
+ if (noblock) {
+ *err = -EAGAIN;
+ goto out;
+ }
+
+ add_wait_queue(&tfile->wq.wait, &wait);
+ current->state = TASK_INTERRUPTIBLE;
+
+ while (1) {
+ skb = skb_array_consume(&tfile->tx_array);
+ if (skb)
+ break;
+ if (signal_pending(current)) {
+ *err = -ERESTARTSYS;
+ break;
+ }
+ if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) {
+ *err = -EFAULT;
+ break;
+ }
+
+ schedule();
+ }
+
+ current->state = TASK_RUNNING;
+ remove_wait_queue(&tfile->wq.wait, &wait);
+
+out:
+ return skb;
+}
+
static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
struct iov_iter *to,
int noblock)
{
struct sk_buff *skb;
ssize_t ret;
- int peeked, err, off = 0;
+ int err;
tun_debug(KERN_INFO, tun, "tun_do_read\n");
if (!iov_iter_count(to))
return 0;
- /* Read frames from queue */
- skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0,
- &peeked, &off, &err);
+ /* Read frames from ring */
+ skb = tun_ring_recv(tfile, noblock, &err);
if (!skb)
return err;
@@ -1574,8 +1629,25 @@ out:
return ret;
}
+static int tun_peek_len(struct socket *sock)
+{
+ struct tun_file *tfile = container_of(sock, struct tun_file, socket);
+ struct tun_struct *tun;
+ int ret = 0;
+
+ tun = __tun_get(tfile);
+ if (!tun)
+ return 0;
+
+ ret = skb_array_peek_len(&tfile->tx_array);
+ tun_put(tun);
+
+ return ret;
+}
+
/* Ops structure to mimic raw sockets with tun */
static const struct proto_ops tun_socket_ops = {
+ .peek_len = tun_peek_len,
.sendmsg = tun_sendmsg,
.recvmsg = tun_recvmsg,
};
@@ -2397,6 +2469,53 @@ static const struct ethtool_ops tun_ethtool_ops = {
.get_ts_info = ethtool_op_get_ts_info,
};
+static int tun_queue_resize(struct tun_struct *tun)
+{
+ struct net_device *dev = tun->dev;
+ struct tun_file *tfile;
+ struct skb_array **arrays;
+ int n = tun->numqueues + tun->numdisabled;
+ int ret, i;
+
+ arrays = kmalloc(sizeof *arrays * n, GFP_KERNEL);
+ if (!arrays)
+ return -ENOMEM;
+
+ for (i = 0; i < tun->numqueues; i++) {
+ tfile = rtnl_dereference(tun->tfiles[i]);
+ arrays[i] = &tfile->tx_array;
+ }
+ list_for_each_entry(tfile, &tun->disabled, next)
+ arrays[i++] = &tfile->tx_array;
+
+ ret = skb_array_resize_multiple(arrays, n,
+ dev->tx_queue_len, GFP_KERNEL);
+
+ kfree(arrays);
+ return ret;
+}
+
+static int tun_device_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct tun_struct *tun = netdev_priv(dev);
+
+ switch (event) {
+ case NETDEV_CHANGE_TX_QUEUE_LEN:
+ if (tun_queue_resize(tun))
+ return NOTIFY_BAD;
+ break;
+ default:
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block tun_notifier_block __read_mostly = {
+ .notifier_call = tun_device_event,
+};
static int __init tun_init(void)
{
@@ -2416,6 +2535,8 @@ static int __init tun_init(void)
pr_err("Can't register misc device %d\n", TUN_MINOR);
goto err_misc;
}
+
+ register_netdevice_notifier(&tun_notifier_block);
return 0;
err_misc:
rtnl_link_unregister(&tun_link_ops);
@@ -2427,6 +2548,7 @@ static void tun_cleanup(void)
{
misc_deregister(&tun_miscdev);
rtnl_link_unregister(&tun_link_ops);
+ unregister_netdevice_notifier(&tun_notifier_block);
}
/* Get an underlying socket object from tun file. Returns error unless file is
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 1d3e45f84549..e032ca397371 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -481,10 +481,14 @@ out:
static int peek_head_len(struct sock *sk)
{
+ struct socket *sock = sk->sk_socket;
struct sk_buff *head;
int len = 0;
unsigned long flags;
+ if (sock->ops->peek_len)
+ return sock->ops->peek_len(sock);
+
spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
head = skb_peek(&sk->sk_receive_queue);
if (likely(head)) {
@@ -497,6 +501,16 @@ static int peek_head_len(struct sock *sk)
return len;
}
+static int sk_has_rx_data(struct sock *sk)
+{
+ struct socket *sock = sk->sk_socket;
+
+ if (sock->ops->peek_len)
+ return sock->ops->peek_len(sock);
+
+ return skb_queue_empty(&sk->sk_receive_queue);
+}
+
static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
{
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
@@ -513,7 +527,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
endtime = busy_clock() + vq->busyloop_timeout;
while (vhost_can_busy_poll(&net->dev, endtime) &&
- skb_queue_empty(&sk->sk_receive_queue) &&
+ !sk_has_rx_data(sk) &&
vhost_vq_avail_empty(&net->dev, vq))
cpu_relax_lowlatency();