diff options
author | David S. Miller <davem@davemloft.net> | 2016-07-01 11:32:28 +0200 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2016-07-01 11:32:28 +0200 |
commit | beb528d01f887bbeeaceb601b6d346d07a338d07 (patch) | |
tree | 20949c1b4243a0e746570b3f08483b40c183ae57 /drivers | |
parent | Merge branch 'sch_hfsc-fixes-cleanups' (diff) | |
parent | tun: switch to use skb array for tx (diff) | |
download | linux-beb528d01f887bbeeaceb601b6d346d07a338d07.tar.xz linux-beb528d01f887bbeeaceb601b6d346d07a338d07.zip |
Merge branch 'tun-skb_array'
Jason Wang says:
====================
switch to use tx skb array in tun
This series tries to switch to use skb array in tun. This is used to
eliminate the spinlock contention between producer and consumer. The
conversion was straightforward: just introdce a tx skb array and use
it instead of sk_receive_queue.
A minor issue is to keep the tx_queue_len behaviour, since tun used to
use it for the length of sk_receive_queue. This is done through:
- add the ability to resize multiple rings at once to avoid handling
partial resize failure for mutiple rings.
- add the support for zero length ring.
- introduce a notifier which was triggered when tx_queue_len was
changed for a netdev.
- resize all queues during the tx_queue_len changing.
Tests shows about 15% improvement on guest rx pps:
Before: ~1300000pps
After : ~1500000pps
Changes from V3:
- fix kbuild warnings
- call NETDEV_CHANGE_TX_QUEUE_LEN on IFLA_TXQLEN
Changes from V2:
- add multiple rings resizing support for ptr_ring/skb_array
- add zero length ring support
- introdce a NETDEV_CHANGE_TX_QUEUE_LEN
- drop new flags
Changes from V1:
- switch to use skb array instead of a customized circular buffer
- add non-blocking support
- rename .peek to .peek_len
- drop lockless peeking since test show very minor improvement
====================
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Acked-from-altitude: 34697 feet.
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/net/tun.c | 138 | ||||
-rw-r--r-- | drivers/vhost/net.c | 16 |
2 files changed, 145 insertions, 9 deletions
diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 4884802e0af1..74752159ec34 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -71,6 +71,7 @@ #include <net/sock.h> #include <linux/seq_file.h> #include <linux/uio.h> +#include <linux/skb_array.h> #include <asm/uaccess.h> @@ -167,6 +168,7 @@ struct tun_file { }; struct list_head next; struct tun_struct *detached; + struct skb_array tx_array; }; struct tun_flow_entry { @@ -515,7 +517,11 @@ static struct tun_struct *tun_enable_queue(struct tun_file *tfile) static void tun_queue_purge(struct tun_file *tfile) { - skb_queue_purge(&tfile->sk.sk_receive_queue); + struct sk_buff *skb; + + while ((skb = skb_array_consume(&tfile->tx_array)) != NULL) + kfree_skb(skb); + skb_queue_purge(&tfile->sk.sk_error_queue); } @@ -560,6 +566,8 @@ static void __tun_detach(struct tun_file *tfile, bool clean) tun->dev->reg_state == NETREG_REGISTERED) unregister_netdevice(tun->dev); } + if (tun) + skb_array_cleanup(&tfile->tx_array); sock_put(&tfile->sk); } } @@ -613,6 +621,7 @@ static void tun_detach_all(struct net_device *dev) static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter) { struct tun_file *tfile = file->private_data; + struct net_device *dev = tun->dev; int err; err = security_tun_dev_attach(tfile->socket.sk, tun->security); @@ -642,6 +651,13 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte if (!err) goto out; } + + if (!tfile->detached && + skb_array_init(&tfile->tx_array, dev->tx_queue_len, GFP_KERNEL)) { + err = -ENOMEM; + goto out; + } + tfile->queue_index = tun->numqueues; tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN; rcu_assign_pointer(tfile->tun, tun); @@ -891,8 +907,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) nf_reset(skb); - /* Enqueue packet */ - skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb); + if (skb_array_produce(&tfile->tx_array, skb)) + goto drop; /* Notify and wake up reader process */ if (tfile->flags & TUN_FASYNC) @@ -1107,7 +1123,7 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait) poll_wait(file, sk_sleep(sk), wait); - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_array_empty(&tfile->tx_array)) mask |= POLLIN | POLLRDNORM; if (sock_writeable(sk) || @@ -1426,22 +1442,61 @@ done: return total; } +static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock, + int *err) +{ + DECLARE_WAITQUEUE(wait, current); + struct sk_buff *skb = NULL; + + skb = skb_array_consume(&tfile->tx_array); + if (skb) + goto out; + if (noblock) { + *err = -EAGAIN; + goto out; + } + + add_wait_queue(&tfile->wq.wait, &wait); + current->state = TASK_INTERRUPTIBLE; + + while (1) { + skb = skb_array_consume(&tfile->tx_array); + if (skb) + break; + if (signal_pending(current)) { + *err = -ERESTARTSYS; + break; + } + if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) { + *err = -EFAULT; + break; + } + + schedule(); + } + + current->state = TASK_RUNNING; + remove_wait_queue(&tfile->wq.wait, &wait); + +out: + return skb; +} + static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, struct iov_iter *to, int noblock) { struct sk_buff *skb; ssize_t ret; - int peeked, err, off = 0; + int err; tun_debug(KERN_INFO, tun, "tun_do_read\n"); if (!iov_iter_count(to)) return 0; - /* Read frames from queue */ - skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0, - &peeked, &off, &err); + /* Read frames from ring */ + skb = tun_ring_recv(tfile, noblock, &err); if (!skb) return err; @@ -1574,8 +1629,25 @@ out: return ret; } +static int tun_peek_len(struct socket *sock) +{ + struct tun_file *tfile = container_of(sock, struct tun_file, socket); + struct tun_struct *tun; + int ret = 0; + + tun = __tun_get(tfile); + if (!tun) + return 0; + + ret = skb_array_peek_len(&tfile->tx_array); + tun_put(tun); + + return ret; +} + /* Ops structure to mimic raw sockets with tun */ static const struct proto_ops tun_socket_ops = { + .peek_len = tun_peek_len, .sendmsg = tun_sendmsg, .recvmsg = tun_recvmsg, }; @@ -2397,6 +2469,53 @@ static const struct ethtool_ops tun_ethtool_ops = { .get_ts_info = ethtool_op_get_ts_info, }; +static int tun_queue_resize(struct tun_struct *tun) +{ + struct net_device *dev = tun->dev; + struct tun_file *tfile; + struct skb_array **arrays; + int n = tun->numqueues + tun->numdisabled; + int ret, i; + + arrays = kmalloc(sizeof *arrays * n, GFP_KERNEL); + if (!arrays) + return -ENOMEM; + + for (i = 0; i < tun->numqueues; i++) { + tfile = rtnl_dereference(tun->tfiles[i]); + arrays[i] = &tfile->tx_array; + } + list_for_each_entry(tfile, &tun->disabled, next) + arrays[i++] = &tfile->tx_array; + + ret = skb_array_resize_multiple(arrays, n, + dev->tx_queue_len, GFP_KERNEL); + + kfree(arrays); + return ret; +} + +static int tun_device_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct tun_struct *tun = netdev_priv(dev); + + switch (event) { + case NETDEV_CHANGE_TX_QUEUE_LEN: + if (tun_queue_resize(tun)) + return NOTIFY_BAD; + break; + default: + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block tun_notifier_block __read_mostly = { + .notifier_call = tun_device_event, +}; static int __init tun_init(void) { @@ -2416,6 +2535,8 @@ static int __init tun_init(void) pr_err("Can't register misc device %d\n", TUN_MINOR); goto err_misc; } + + register_netdevice_notifier(&tun_notifier_block); return 0; err_misc: rtnl_link_unregister(&tun_link_ops); @@ -2427,6 +2548,7 @@ static void tun_cleanup(void) { misc_deregister(&tun_miscdev); rtnl_link_unregister(&tun_link_ops); + unregister_netdevice_notifier(&tun_notifier_block); } /* Get an underlying socket object from tun file. Returns error unless file is diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 1d3e45f84549..e032ca397371 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -481,10 +481,14 @@ out: static int peek_head_len(struct sock *sk) { + struct socket *sock = sk->sk_socket; struct sk_buff *head; int len = 0; unsigned long flags; + if (sock->ops->peek_len) + return sock->ops->peek_len(sock); + spin_lock_irqsave(&sk->sk_receive_queue.lock, flags); head = skb_peek(&sk->sk_receive_queue); if (likely(head)) { @@ -497,6 +501,16 @@ static int peek_head_len(struct sock *sk) return len; } +static int sk_has_rx_data(struct sock *sk) +{ + struct socket *sock = sk->sk_socket; + + if (sock->ops->peek_len) + return sock->ops->peek_len(sock); + + return skb_queue_empty(&sk->sk_receive_queue); +} + static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; @@ -513,7 +527,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk) endtime = busy_clock() + vq->busyloop_timeout; while (vhost_can_busy_poll(&net->dev, endtime) && - skb_queue_empty(&sk->sk_receive_queue) && + !sk_has_rx_data(sk) && vhost_vq_avail_empty(&net->dev, vq)) cpu_relax_lowlatency(); |