author    Linus Torvalds <torvalds@linux-foundation.org>  2013-11-13 09:40:34 +0100
committer Linus Torvalds <torvalds@linux-foundation.org>  2013-11-13 09:40:34 +0100
commit    42a2d923cc349583ebf6fdd52a7d35e1c2f7e6bd
tree      2b2b0c03b5389c1301800119333967efafd994ca  /drivers/net/virtio_net.c
parent    Merge branch 'akpm' (patches from Andrew Morton)
parent    Merge branch 'prandom'
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:

1) The addition of nftables. No longer will we need protocol-aware firewall filtering modules; it can all live in userspace.

   At the core of nftables is a, for lack of a better term, virtual machine that executes bytecode to inspect packets or metadata (arriving interface index, etc.) and make verdict decisions. Besides support for loading packet contents and comparing them, the interpreter supports lookups in various data structures as fundamental operations. For example, sets are supported, so one could create a set of whitelisted IP address entries with ACCEPT verdicts attached to them and use the appropriate bytecode to perform such lookups.

   Since the interpreted code is composed in userspace, userspace can optimize it before handing it to the kernel. Another major improvement is the ability to atomically update portions of the ruleset; in the existing netfilter implementation, one has to replace the entire rule set to make any change, which is very expensive.

   Userspace tools exist to create nftables rules from existing netfilter rule sets, but both kernel implementations will need to coexist for quite some time as we transition from the old to the new. Kudos to Patrick McHardy, Pablo Neira Ayuso, and others who have worked so hard on this.

2) Daniel Borkmann and Hannes Frederic Sowa made several improvements to our pseudo-random number generator, used mostly for things like UDP port randomization and netfilter, amongst other things. In particular, the taus88 generator is upgraded to taus113, and test cases are added.

3) Support 64-bit rates in the HTB and TBF schedulers, from Eric Dumazet and Yang Yingliang.

4) Add support for new 577xx tigon3 chips to the tg3 driver, from Nithin Sujir.

5) Fix two fatal flaws in TCP dynamic right sizing, from Eric Dumazet, Neal Cardwell, and Yuchung Cheng.

6) Allow IP_TOS and IP_TTL to be specified in sendmsg() ancillary control message data, much like other socket option attributes. From Francesco Fusco. (A userspace sketch covering this, point 7, and point 17 follows the shortlog below.)

7) Allow applications to specify a cap on the rate computed automatically by the kernel for pacing flows, via a new SO_MAX_PACING_RATE socket option. From Eric Dumazet.

8) Make the initial autotuned send buffer sizing in TCP more closely reflect actual needs, from Eric Dumazet.

9) Currently early socket demux happens only for TCP sockets, but we can do it for connected UDP sockets too. Implementation from Shawn Bohrer.

10) Refactor inet socket demux with the goal of improving hash demux performance for listening sockets: the main goals are being able to use RCU lookups even on request sockets and eliminating contention on the listening lock. From Eric Dumazet.

11) The bonding layer has many demuxes in its fast path, and an RCU conversion was started back in 3.11; several changes here extend the RCU usage to even more locations. From Ding Tianhong and Wang Yufen, based upon suggestions by Nikolay Aleksandrov and Veaceslav Falico.

12) Allow segmentation offloads to be stacked, in particular to allow segmentation offloading over tunnels. From Eric Dumazet.

13) Significantly improve the handling of the secret keys we feed into the various hash functions in the inet hashtables, TCP fast open, and syncookies. From Hannes Frederic Sowa. The key fundamental operation is net_get_random_once(), which uses static keys. Hannes even extended this to ipv4/ipv6 fragmentation handling and our generic flow dissector.

14) The generic driver layer now takes care to set the driver data to NULL on device removal, so it is no longer necessary for drivers to do so explicitly. Many drivers have been cleaned up this way, from Jingoo Han.

15) Add a BPF-based packet scheduler classifier, from Daniel Borkmann.

16) Improve the CRC32 interfaces and generic SKB checksum iterators so that SCTP's checksumming can be handled more cleanly. Also from Daniel Borkmann.

17) Add a new PMTU discovery mode, IP_PMTUDISC_INTERFACE, which forces use of the interface MTU value. This helps avoid PMTU attacks, particularly on DNS servers. From Hannes Frederic Sowa.

18) Use generic XPS for transmit queue steering rather than an internal (re-)implementation in virtio-net. From Jason Wang.

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1622 commits)
  random32: add test cases for taus113 implementation
  random32: upgrade taus88 generator to taus113 from errata paper
  random32: move rnd_state to linux/random.h
  random32: add prandom_reseed_late() and call when nonblocking pool becomes initialized
  random32: add periodic reseeding
  random32: fix off-by-one in seeding requirement
  PHY: Add RTL8201CP phy_driver to realtek
  xtsonic: add missing platform_set_drvdata() in xtsonic_probe()
  macmace: add missing platform_set_drvdata() in mace_probe()
  ethernet/arc/arc_emac: add missing platform_set_drvdata() in arc_emac_probe()
  ipv6: protect for_each_sk_fl_rcu in mem_check with rcu_read_lock_bh
  vlan: Implement vlan_dev_get_egress_qos_mask as an inline.
  ixgbe: add warning when max_vfs is out of range.
  igb: Update link modes display in ethtool
  netfilter: push reasm skb through instead of original frag skbs
  ip6_output: fragment outgoing reassembled skb properly
  MAINTAINERS: mv643xx_eth: take over maintainership from Lennart
  net_sched: tbf: support of 64bit rates
  ixgbe: deleting dfwd stations out of order can cause null ptr deref
  ixgbe: fix build err, num_rx_queues is only available with CONFIG_RPS
  ...
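Points 6, 7, and 17 above are plain socket-API additions that can be exercised directly from userspace. The following is a minimal sketch, not part of this commit, that sets all three on a UDP socket; the #ifndef constant fallbacks and the 192.0.2.1 TEST-NET destination are illustrative assumptions, so check your uapi headers for the real values.

/* Hedged userspace sketch of the new 3.13 socket interfaces described
 * in the pull request above: SO_MAX_PACING_RATE (point 7), per-packet
 * IP_TOS/IP_TTL via sendmsg() ancillary data (point 6), and
 * IP_PMTUDISC_INTERFACE (point 17). Not taken from this commit.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/uio.h>

#ifndef SO_MAX_PACING_RATE
#define SO_MAX_PACING_RATE 47        /* assumed asm-generic/socket.h value */
#endif
#ifndef IP_PMTUDISC_INTERFACE
#define IP_PMTUDISC_INTERFACE 4      /* assumed linux/in.h value */
#endif

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0) { perror("socket"); return 1; }

	/* Point 7: cap the kernel-computed pacing rate, in bytes per second. */
	unsigned int rate = 1000000;    /* ~1 MB/s */
	if (setsockopt(fd, SOL_SOCKET, SO_MAX_PACING_RATE, &rate, sizeof(rate)) < 0)
		perror("SO_MAX_PACING_RATE");

	/* Point 17: always use the interface MTU, defeating PMTU attacks. */
	int pmtu = IP_PMTUDISC_INTERFACE;
	if (setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &pmtu, sizeof(pmtu)) < 0)
		perror("IP_MTU_DISCOVER");

	/* Point 6: per-packet TOS and TTL as sendmsg() ancillary data. */
	struct sockaddr_in dst = { .sin_family = AF_INET, .sin_port = htons(9) };
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);  /* TEST-NET, discard port */

	char payload[] = "hello";
	struct iovec iov = { .iov_base = payload, .iov_len = sizeof(payload) };
	int tos = 0x10, ttl = 16;        /* both passed as int */
	union {                          /* force cmsghdr alignment of the buffer */
		char buf[CMSG_SPACE(sizeof(int)) * 2];
		struct cmsghdr align;
	} u;
	memset(&u, 0, sizeof(u));

	struct msghdr msg = {
		.msg_name = &dst, .msg_namelen = sizeof(dst),
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = u.buf, .msg_controllen = sizeof(u.buf),
	};

	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
	cm->cmsg_level = IPPROTO_IP;
	cm->cmsg_type  = IP_TOS;
	cm->cmsg_len   = CMSG_LEN(sizeof(tos));
	memcpy(CMSG_DATA(cm), &tos, sizeof(tos));

	cm = CMSG_NXTHDR(&msg, cm);
	cm->cmsg_level = IPPROTO_IP;
	cm->cmsg_type  = IP_TTL;
	cm->cmsg_len   = CMSG_LEN(sizeof(ttl));
	memcpy(CMSG_DATA(cm), &ttl, sizeof(ttl));

	if (sendmsg(fd, &msg, 0) < 0)
		perror("sendmsg");

	close(fd);
	return 0;
}

Note that on kernels of this vintage SO_MAX_PACING_RATE only takes effect when the fq packet scheduler is attached to the egress device; on older kernels the setsockopt() calls simply fail with ENOPROTOOPT.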
Diffstat (limited to 'drivers/net/virtio_net.c')
-rw-r--r--  drivers/net/virtio_net.c | 219
1 file changed, 115 insertions(+), 104 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index bbc9cb84ec1f..01f4eb5c8b78 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -124,12 +124,14 @@ struct virtnet_info {
/* Lock for config space updates */
struct mutex config_lock;
+ /* Page_frag for GFP_KERNEL packet buffer allocation when we run
+ * low on memory.
+ */
+ struct page_frag alloc_frag;
+
/* Does the affinity hint is set for virtqueues? */
bool affinity_hint_set;
- /* Per-cpu variable to show the mapping from CPU to virtqueue */
- int __percpu *vq_index;
-
/* CPU hot plug notifier */
struct notifier_block nb;
};
@@ -217,33 +219,18 @@ static void skb_xmit_done(struct virtqueue *vq)
netif_wake_subqueue(vi->dev, vq2txq(vq));
}
-static void set_skb_frag(struct sk_buff *skb, struct page *page,
- unsigned int offset, unsigned int *len)
-{
- int size = min((unsigned)PAGE_SIZE - offset, *len);
- int i = skb_shinfo(skb)->nr_frags;
-
- __skb_fill_page_desc(skb, i, page, offset, size);
-
- skb->data_len += size;
- skb->len += size;
- skb->truesize += PAGE_SIZE;
- skb_shinfo(skb)->nr_frags++;
- skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
- *len -= size;
-}
-
/* Called from bottom half context */
static struct sk_buff *page_to_skb(struct receive_queue *rq,
- struct page *page, unsigned int len)
+ struct page *page, unsigned int offset,
+ unsigned int len, unsigned int truesize)
{
struct virtnet_info *vi = rq->vq->vdev->priv;
struct sk_buff *skb;
struct skb_vnet_hdr *hdr;
- unsigned int copy, hdr_len, offset;
+ unsigned int copy, hdr_len, hdr_padded_len;
char *p;
- p = page_address(page);
+ p = page_address(page) + offset;
/* copy small packet so we can reuse these pages for small data */
skb = netdev_alloc_skb_ip_align(vi->dev, GOOD_COPY_LEN);
@@ -254,16 +241,17 @@ static struct sk_buff *page_to_skb(struct receive_queue *rq,
if (vi->mergeable_rx_bufs) {
hdr_len = sizeof hdr->mhdr;
- offset = hdr_len;
+ hdr_padded_len = sizeof hdr->mhdr;
} else {
hdr_len = sizeof hdr->hdr;
- offset = sizeof(struct padded_vnet_hdr);
+ hdr_padded_len = sizeof(struct padded_vnet_hdr);
}
memcpy(hdr, p, hdr_len);
len -= hdr_len;
- p += offset;
+ offset += hdr_padded_len;
+ p += hdr_padded_len;
copy = len;
if (copy > skb_tailroom(skb))
@@ -273,6 +261,14 @@ static struct sk_buff *page_to_skb(struct receive_queue *rq,
len -= copy;
offset += copy;
+ if (vi->mergeable_rx_bufs) {
+ if (len)
+ skb_add_rx_frag(skb, 0, page, offset, len, truesize);
+ else
+ put_page(page);
+ return skb;
+ }
+
/*
* Verify that we can indeed put this data into a skb.
* This is here to handle cases when the device erroneously
@@ -284,9 +280,12 @@ static struct sk_buff *page_to_skb(struct receive_queue *rq,
dev_kfree_skb(skb);
return NULL;
}
-
+ BUG_ON(offset >= PAGE_SIZE);
while (len) {
- set_skb_frag(skb, page, offset, &len);
+ unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len);
+ skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
+ frag_size, truesize);
+ len -= frag_size;
page = (struct page *)page->private;
offset = 0;
}
@@ -297,33 +296,59 @@ static struct sk_buff *page_to_skb(struct receive_queue *rq,
return skb;
}
-static int receive_mergeable(struct receive_queue *rq, struct sk_buff *skb)
+static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
{
- struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
+ struct skb_vnet_hdr *hdr = skb_vnet_hdr(head_skb);
+ struct sk_buff *curr_skb = head_skb;
+ char *buf;
struct page *page;
- int num_buf, i, len;
+ int num_buf, len, offset;
num_buf = hdr->mhdr.num_buffers;
while (--num_buf) {
- i = skb_shinfo(skb)->nr_frags;
- if (i >= MAX_SKB_FRAGS) {
- pr_debug("%s: packet too long\n", skb->dev->name);
- skb->dev->stats.rx_length_errors++;
- return -EINVAL;
- }
- page = virtqueue_get_buf(rq->vq, &len);
- if (!page) {
+ int num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
+ buf = virtqueue_get_buf(rq->vq, &len);
+ if (unlikely(!buf)) {
pr_debug("%s: rx error: %d buffers missing\n",
- skb->dev->name, hdr->mhdr.num_buffers);
- skb->dev->stats.rx_length_errors++;
+ head_skb->dev->name, hdr->mhdr.num_buffers);
+ head_skb->dev->stats.rx_length_errors++;
return -EINVAL;
}
-
- if (len > PAGE_SIZE)
- len = PAGE_SIZE;
-
- set_skb_frag(skb, page, 0, &len);
-
+ if (unlikely(len > MAX_PACKET_LEN)) {
+ pr_debug("%s: rx error: merge buffer too long\n",
+ head_skb->dev->name);
+ len = MAX_PACKET_LEN;
+ }
+ if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
+ struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
+ if (unlikely(!nskb)) {
+ head_skb->dev->stats.rx_dropped++;
+ return -ENOMEM;
+ }
+ if (curr_skb == head_skb)
+ skb_shinfo(curr_skb)->frag_list = nskb;
+ else
+ curr_skb->next = nskb;
+ curr_skb = nskb;
+ head_skb->truesize += nskb->truesize;
+ num_skb_frags = 0;
+ }
+ if (curr_skb != head_skb) {
+ head_skb->data_len += len;
+ head_skb->len += len;
+ head_skb->truesize += MAX_PACKET_LEN;
+ }
+ page = virt_to_head_page(buf);
+ offset = buf - (char *)page_address(page);
+ if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
+ put_page(page);
+ skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
+ len, MAX_PACKET_LEN);
+ } else {
+ skb_add_rx_frag(curr_skb, num_skb_frags, page,
+ offset, len,
+ MAX_PACKET_LEN);
+ }
--rq->num;
}
return 0;
@@ -341,8 +366,10 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) {
pr_debug("%s: short packet %i\n", dev->name, len);
dev->stats.rx_length_errors++;
- if (vi->mergeable_rx_bufs || vi->big_packets)
+ if (vi->big_packets)
give_pages(rq, buf);
+ else if (vi->mergeable_rx_bufs)
+ put_page(virt_to_head_page(buf));
else
dev_kfree_skb(buf);
return;
@@ -352,19 +379,28 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
skb = buf;
len -= sizeof(struct virtio_net_hdr);
skb_trim(skb, len);
+ } else if (vi->mergeable_rx_bufs) {
+ struct page *page = virt_to_head_page(buf);
+ skb = page_to_skb(rq, page,
+ (char *)buf - (char *)page_address(page),
+ len, MAX_PACKET_LEN);
+ if (unlikely(!skb)) {
+ dev->stats.rx_dropped++;
+ put_page(page);
+ return;
+ }
+ if (receive_mergeable(rq, skb)) {
+ dev_kfree_skb(skb);
+ return;
+ }
} else {
page = buf;
- skb = page_to_skb(rq, page, len);
+ skb = page_to_skb(rq, page, 0, len, PAGE_SIZE);
if (unlikely(!skb)) {
dev->stats.rx_dropped++;
give_pages(rq, page);
return;
}
- if (vi->mergeable_rx_bufs)
- if (receive_mergeable(rq, skb)) {
- dev_kfree_skb(skb);
- return;
- }
}
hdr = skb_vnet_hdr(skb);
@@ -501,18 +537,28 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
{
- struct page *page;
+ struct virtnet_info *vi = rq->vq->vdev->priv;
+ char *buf = NULL;
int err;
- page = get_a_page(rq, gfp);
- if (!page)
+ if (gfp & __GFP_WAIT) {
+ if (skb_page_frag_refill(MAX_PACKET_LEN, &vi->alloc_frag,
+ gfp)) {
+ buf = (char *)page_address(vi->alloc_frag.page) +
+ vi->alloc_frag.offset;
+ get_page(vi->alloc_frag.page);
+ vi->alloc_frag.offset += MAX_PACKET_LEN;
+ }
+ } else {
+ buf = netdev_alloc_frag(MAX_PACKET_LEN);
+ }
+ if (!buf)
return -ENOMEM;
- sg_init_one(rq->sg, page_address(page), PAGE_SIZE);
-
- err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, page, gfp);
+ sg_init_one(rq->sg, buf, MAX_PACKET_LEN);
+ err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
if (err < 0)
- give_pages(rq, page);
+ put_page(virt_to_head_page(buf));
return err;
}
@@ -1065,7 +1111,6 @@ static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
static void virtnet_clean_affinity(struct virtnet_info *vi, long hcpu)
{
int i;
- int cpu;
if (vi->affinity_hint_set) {
for (i = 0; i < vi->max_queue_pairs; i++) {
@@ -1075,16 +1120,6 @@ static void virtnet_clean_affinity(struct virtnet_info *vi, long hcpu)
vi->affinity_hint_set = false;
}
-
- i = 0;
- for_each_online_cpu(cpu) {
- if (cpu == hcpu) {
- *per_cpu_ptr(vi->vq_index, cpu) = -1;
- } else {
- *per_cpu_ptr(vi->vq_index, cpu) =
- ++i % vi->curr_queue_pairs;
- }
- }
}
static void virtnet_set_affinity(struct virtnet_info *vi)
@@ -1106,7 +1141,7 @@ static void virtnet_set_affinity(struct virtnet_info *vi)
for_each_online_cpu(cpu) {
virtqueue_set_affinity(vi->rq[i].vq, cpu);
virtqueue_set_affinity(vi->sq[i].vq, cpu);
- *per_cpu_ptr(vi->vq_index, cpu) = i;
+ netif_set_xps_queue(vi->dev, cpumask_of(cpu), i);
i++;
}
@@ -1220,28 +1255,6 @@ static int virtnet_change_mtu(struct net_device *dev, int new_mtu)
return 0;
}
-/* To avoid contending a lock hold by a vcpu who would exit to host, select the
- * txq based on the processor id.
- */
-static u16 virtnet_select_queue(struct net_device *dev, struct sk_buff *skb)
-{
- int txq;
- struct virtnet_info *vi = netdev_priv(dev);
-
- if (skb_rx_queue_recorded(skb)) {
- txq = skb_get_rx_queue(skb);
- } else {
- txq = *__this_cpu_ptr(vi->vq_index);
- if (txq == -1)
- txq = 0;
- }
-
- while (unlikely(txq >= dev->real_num_tx_queues))
- txq -= dev->real_num_tx_queues;
-
- return txq;
-}
-
static const struct net_device_ops virtnet_netdev = {
.ndo_open = virtnet_open,
.ndo_stop = virtnet_close,
@@ -1253,7 +1266,6 @@ static const struct net_device_ops virtnet_netdev = {
.ndo_get_stats64 = virtnet_stats,
.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
- .ndo_select_queue = virtnet_select_queue,
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = virtnet_netpoll,
#endif
@@ -1336,8 +1348,10 @@ static void free_unused_bufs(struct virtnet_info *vi)
struct virtqueue *vq = vi->rq[i].vq;
while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
- if (vi->mergeable_rx_bufs || vi->big_packets)
+ if (vi->big_packets)
give_pages(&vi->rq[i], buf);
+ else if (vi->mergeable_rx_bufs)
+ put_page(virt_to_head_page(buf));
else
dev_kfree_skb(buf);
--vi->rq[i].num;
@@ -1562,10 +1576,6 @@ static int virtnet_probe(struct virtio_device *vdev)
if (vi->stats == NULL)
goto free;
- vi->vq_index = alloc_percpu(int);
- if (vi->vq_index == NULL)
- goto free_stats;
-
mutex_init(&vi->config_lock);
vi->config_enable = true;
INIT_WORK(&vi->config_work, virtnet_config_changed_work);
@@ -1592,7 +1602,7 @@ static int virtnet_probe(struct virtio_device *vdev)
/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
err = init_vqs(vi);
if (err)
- goto free_index;
+ goto free_stats;
netif_set_real_num_tx_queues(dev, 1);
netif_set_real_num_rx_queues(dev, 1);
@@ -1643,8 +1653,8 @@ free_recv_bufs:
free_vqs:
cancel_delayed_work_sync(&vi->refill);
virtnet_del_vqs(vi);
-free_index:
- free_percpu(vi->vq_index);
+ if (vi->alloc_frag.page)
+ put_page(vi->alloc_frag.page);
free_stats:
free_percpu(vi->stats);
free:
@@ -1678,10 +1688,11 @@ static void virtnet_remove(struct virtio_device *vdev)
unregister_netdev(vi->dev);
remove_vq_common(vi);
+ if (vi->alloc_frag.page)
+ put_page(vi->alloc_frag.page);
flush_work(&vi->config_work);
- free_percpu(vi->vq_index);
free_percpu(vi->stats);
free_netdev(vi->dev);
}