From 584ec4355355ffac43571b02a314d43eb2f7fcbf Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Tue, 16 Jul 2013 10:49:41 -0400 Subject: atl1e: unmap partially mapped skb on dma error and free skb Ben Hutchings pointed out that my recent update to atl1e in commit 352900b583b2852152a1e05ea0e8b579292e731e ("atl1e: fix dma mapping warnings") was missing a bit of code. Specifically it reset the hardware tx ring to its origional state when we hit a dma error, but didn't unmap any exiting mappings from the operation. This patch fixes that up. It also remembers to free the skb in the event that an error occurs, so we don't leak. Untested, as I don't have hardware. I think its pretty straightforward, but please review closely. Signed-off-by: Neil Horman CC: Ben Hutchings CC: Jay Cliburn CC: Chris Snook CC: "David S. Miller" Signed-off-by: David S. Miller --- drivers/net/ethernet/atheros/atl1e/atl1e_main.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c index 6d1a62a84c9d..1966444590f6 100644 --- a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c +++ b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c @@ -1678,6 +1678,7 @@ static int atl1e_tx_map(struct atl1e_adapter *adapter, u16 f; int segment; int ring_start = adapter->tx_ring.next_to_use; + int ring_end; nr_frags = skb_shinfo(skb)->nr_frags; segment = (tpd->word3 >> TPD_SEGMENT_EN_SHIFT) & TPD_SEGMENT_EN_MASK; @@ -1721,6 +1722,15 @@ static int atl1e_tx_map(struct atl1e_adapter *adapter, map_len, PCI_DMA_TODEVICE); if (dma_mapping_error(&adapter->pdev->dev, tx_buffer->dma)) { + /* We need to unwind the mappings we've done */ + ring_end = adapter->tx_ring.next_to_use; + adapter->tx_ring.next_to_use = ring_start; + while (adapter->tx_ring.next_to_use != ring_end) { + tpd = atl1e_get_tpd(adapter); + tx_buffer = atl1e_get_tx_buffer(adapter, tpd); + pci_unmap_single(adapter->pdev, tx_buffer->dma, + tx_buffer->length, PCI_DMA_TODEVICE); + } /* Reset the tx rings next pointer */ adapter->tx_ring.next_to_use = ring_start; return -ENOSPC; @@ -1763,6 +1773,16 @@ static int atl1e_tx_map(struct atl1e_adapter *adapter, DMA_TO_DEVICE); if (dma_mapping_error(&adapter->pdev->dev, tx_buffer->dma)) { + /* We need to unwind the mappings we've done */ + ring_end = adapter->tx_ring.next_to_use; + adapter->tx_ring.next_to_use = ring_start; + while (adapter->tx_ring.next_to_use != ring_end) { + tpd = atl1e_get_tpd(adapter); + tx_buffer = atl1e_get_tx_buffer(adapter, tpd); + dma_unmap_page(&adapter->pdev->dev, tx_buffer->dma, + tx_buffer->length, DMA_TO_DEVICE); + } + /* Reset the ring next to use pointer */ adapter->tx_ring.next_to_use = ring_start; return -ENOSPC; @@ -1853,8 +1873,10 @@ static netdev_tx_t atl1e_xmit_frame(struct sk_buff *skb, return NETDEV_TX_OK; } - if (atl1e_tx_map(adapter, skb, tpd)) + if (atl1e_tx_map(adapter, skb, tpd)) { + dev_kfree_skb_any(skb); goto out; + } atl1e_tx_queue(adapter, tpd_req, tpd); -- cgit v1.2.3 From 31bd29776b8575b8cf7c25bbf7496a460278dc83 Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Mon, 15 Jul 2013 13:26:27 +0200 Subject: bgmac: add dependency to phylib bgmac is using functions from phylib, add the dependency. Signed-off-by: Hauke Mehrtens Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/net/ethernet/broadcom/Kconfig b/drivers/net/ethernet/broadcom/Kconfig index 1d680baf43d6..52c96036dcc4 100644 --- a/drivers/net/ethernet/broadcom/Kconfig +++ b/drivers/net/ethernet/broadcom/Kconfig @@ -131,6 +131,7 @@ config BNX2X_SRIOV config BGMAC tristate "BCMA bus GBit core support" depends on BCMA_HOST_SOC && HAS_DMA + select PHYLIB ---help--- This driver supports GBit MAC and BCM4706 GBit MAC cores on BCMA bus. They can be found on BCM47xx SoCs and provide gigabit ethernet. -- cgit v1.2.3 From 9a0f06fee1f453370e87b6568dc1d39d676f53fc Mon Sep 17 00:00:00 2001 From: Tim Gardner Date: Mon, 15 Jul 2013 08:56:45 -0600 Subject: mlx5 core: Fix __udivdi3 when compiling for 32 bit arches Cc: Eli Cohen Signed-off-by: Tim Gardner Acked-by: Randy Dunlap Reported-by: Randy Dunlap Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c index 4273c06e2e96..9c7194b26ee2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c @@ -156,7 +156,7 @@ static ssize_t average_read(struct file *filp, char __user *buf, size_t count, stats = filp->private_data; spin_lock(&stats->lock); if (stats->n) - field = stats->sum / stats->n; + field = div64_u64(stats->sum, stats->n); spin_unlock(&stats->lock); ret = snprintf(tbuf, sizeof(tbuf), "%llu\n", field); if (ret > 0) { -- cgit v1.2.3 From 82a19eb8c02ab98bfe0bf6fa4915de370acb2858 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 16 Jul 2013 13:36:33 +0800 Subject: macvtap: fix the missing ret value of TUNSETQUEUE Commit 441ac0fcaadc76ad09771812382345001dd2b813 (macvtap: Convert to using rtnl lock) forget to return what macvtap_ioctl_set_queue() returns to its caller. This may break multiqueue API by always falling through to TUNGETFEATURES. Cc: Vlad Yasevich Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/macvtap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 876c72246ae9..0e5492ec753a 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -1107,6 +1107,7 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd, rtnl_lock(); ret = macvtap_ioctl_set_queue(file, u); rtnl_unlock(); + return ret; case TUNGETFEATURES: if (put_user(IFF_TAP | IFF_NO_PI | IFF_VNET_HDR | -- cgit v1.2.3 From 0fbe0d47b1ee2015c288abd4e7b32224f8bfa212 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 16 Jul 2013 13:36:34 +0800 Subject: macvtap: do not assume 802.1Q when send vlan packets The hard-coded 8021.q proto will break 802.1ad traffic. So switch to use vlan->proto. Cc: Basil Gor Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/macvtap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 0e5492ec753a..a7c5654a2e5c 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -873,7 +873,7 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q, __be16 h_vlan_proto; __be16 h_vlan_TCI; } veth; - veth.h_vlan_proto = htons(ETH_P_8021Q); + veth.h_vlan_proto = skb->vlan_proto; veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb)); vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); -- cgit v1.2.3 From 52fe29e4bb614367c108b717c6d7fe5953eb7af3 Mon Sep 17 00:00:00 2001 From: Sarveshwar Bandi Date: Tue, 16 Jul 2013 12:44:02 +0530 Subject: be2net: Fix to avoid hardware workaround when not needed Hardware workaround requesting hardware to skip vlan insertion is necessary only when umc or qnq is enabled. Enabling this workaround in other scenarios could cause controller to stall. Signed-off-by: Sarveshwar Bandi Signed-off-by: David S. Miller --- drivers/net/ethernet/emulex/benet/be_main.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 2df48bb0f1ca..181edb522450 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -782,16 +782,22 @@ static struct sk_buff *be_insert_vlan_in_pkt(struct be_adapter *adapter, if (vlan_tx_tag_present(skb)) vlan_tag = be_get_tx_vlan_tag(adapter, skb); - else if (qnq_async_evt_rcvd(adapter) && adapter->pvid) - vlan_tag = adapter->pvid; + + if (qnq_async_evt_rcvd(adapter) && adapter->pvid) { + if (!vlan_tag) + vlan_tag = adapter->pvid; + /* f/w workaround to set skip_hw_vlan = 1, informs the F/W to + * skip VLAN insertion + */ + if (skip_hw_vlan) + *skip_hw_vlan = true; + } if (vlan_tag) { skb = __vlan_put_tag(skb, htons(ETH_P_8021Q), vlan_tag); if (unlikely(!skb)) return skb; skb->vlan_tci = 0; - if (skip_hw_vlan) - *skip_hw_vlan = true; } /* Insert the outer VLAN, if any */ -- cgit v1.2.3 From f45708209dc445bac0844f6ce86e315a2ffe8a29 Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Tue, 16 Jul 2013 23:01:20 -0700 Subject: hyperv: Fix the NETIF_F_SG flag setting in netvsc SG mode is not currently supported by netvsc, so remove this flag for now. Otherwise, it will be unconditionally enabled by commit ec5f0615642 "Kill link between CSUM and SG features" Previously, the SG feature is disabled because CSUM is not set here. Signed-off-by: Haiyang Zhang Reviewed-by: K. Y. Srinivasan Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 4dccead586be..23a0fff0df52 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -431,8 +431,8 @@ static int netvsc_probe(struct hv_device *dev, net->netdev_ops = &device_ops; /* TODO: Add GSO and Checksum offload */ - net->hw_features = NETIF_F_SG; - net->features = NETIF_F_SG | NETIF_F_HW_VLAN_CTAG_TX; + net->hw_features = 0; + net->features = NETIF_F_HW_VLAN_CTAG_TX; SET_ETHTOOL_OPS(net, ðtool_ops); SET_NETDEV_DEV(net, &dev->device); -- cgit v1.2.3 From fe5c3561e6f0ac7c9546209f01351113c1b77ec8 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Sat, 13 Jul 2013 10:18:18 -0700 Subject: vxlan: add necessary locking on device removal The socket management is now done in workqueue (outside of RTNL) and protected by vn->sock_lock. There were two possible bugs, first the vxlan device was removed from the VNI hash table per socket without holding lock. And there was a race when device is created and the workqueue could run after deletion. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 0ba1e7edbb1b..a5ba8dd7e6be 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1767,9 +1767,15 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, static void vxlan_dellink(struct net_device *dev, struct list_head *head) { + struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); struct vxlan_dev *vxlan = netdev_priv(dev); + flush_workqueue(vxlan_wq); + + spin_lock(&vn->sock_lock); hlist_del_rcu(&vxlan->hlist); + spin_unlock(&vn->sock_lock); + list_del(&vxlan->next); unregister_netdevice_queue(dev, head); } -- cgit v1.2.3 From 093b9c71b6e450e375f4646ba86faed0195ec7df Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 17 Jul 2013 08:09:37 +0100 Subject: xen-netfront: pull on receive skb may need to happen earlier Due to commit 3683243b ("xen-netfront: use __pskb_pull_tail to ensure linear area is big enough on RX") xennet_fill_frags() may end up filling MAX_SKB_FRAGS + 1 fragments in a receive skb, and only reduce the fragment count subsequently via __pskb_pull_tail(). That's a result of xennet_get_responses() allowing a maximum of one more slot to be consumed (and intermediately transformed into a fragment) if the head slot has a size less than or equal to RX_COPY_THRESHOLD. Hence we need to adjust xennet_fill_frags() to pull earlier if we reached the maximum fragment count - due to the described behavior of xennet_get_responses() this guarantees that at least the first fragment will get completely consumed, and hence the fragment count reduced. In order to not needlessly call __pskb_pull_tail() twice, make the original call conditional upon the pull target not having been reached yet, and defer the newly added one as much as possible (an alternative would have been to always call the function right before the call to xennet_fill_frags(), but that would imply more frequent cases of needing to call it twice). Signed-off-by: Jan Beulich Acked-by: Wei Liu Cc: Ian Campbell Cc: stable@vger.kernel.org (3.6 onwards) Acked-by: Ian Campbell Signed-off-by: David S. Miller --- drivers/net/xen-netfront.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) (limited to 'drivers') diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index ff7f111fffee..36808bf25677 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -286,8 +286,7 @@ no_skb: break; } - __skb_fill_page_desc(skb, 0, page, 0, 0); - skb_shinfo(skb)->nr_frags = 1; + skb_add_rx_frag(skb, 0, page, 0, 0, PAGE_SIZE); __skb_queue_tail(&np->rx_batch, skb); } @@ -831,7 +830,6 @@ static RING_IDX xennet_fill_frags(struct netfront_info *np, struct sk_buff_head *list) { struct skb_shared_info *shinfo = skb_shinfo(skb); - int nr_frags = shinfo->nr_frags; RING_IDX cons = np->rx.rsp_cons; struct sk_buff *nskb; @@ -840,19 +838,21 @@ static RING_IDX xennet_fill_frags(struct netfront_info *np, RING_GET_RESPONSE(&np->rx, ++cons); skb_frag_t *nfrag = &skb_shinfo(nskb)->frags[0]; - __skb_fill_page_desc(skb, nr_frags, - skb_frag_page(nfrag), - rx->offset, rx->status); + if (shinfo->nr_frags == MAX_SKB_FRAGS) { + unsigned int pull_to = NETFRONT_SKB_CB(skb)->pull_to; - skb->data_len += rx->status; + BUG_ON(pull_to <= skb_headlen(skb)); + __pskb_pull_tail(skb, pull_to - skb_headlen(skb)); + } + BUG_ON(shinfo->nr_frags >= MAX_SKB_FRAGS); + + skb_add_rx_frag(skb, shinfo->nr_frags, skb_frag_page(nfrag), + rx->offset, rx->status, PAGE_SIZE); skb_shinfo(nskb)->nr_frags = 0; kfree_skb(nskb); - - nr_frags++; } - shinfo->nr_frags = nr_frags; return cons; } @@ -933,7 +933,8 @@ static int handle_incoming_queue(struct net_device *dev, while ((skb = __skb_dequeue(rxq)) != NULL) { int pull_to = NETFRONT_SKB_CB(skb)->pull_to; - __pskb_pull_tail(skb, pull_to - skb_headlen(skb)); + if (pull_to > skb_headlen(skb)) + __pskb_pull_tail(skb, pull_to - skb_headlen(skb)); /* Ethernet work: Delayed to here as it peeks the header. */ skb->protocol = eth_type_trans(skb, dev); @@ -1019,16 +1020,10 @@ err: skb_shinfo(skb)->frags[0].page_offset = rx->offset; skb_frag_size_set(&skb_shinfo(skb)->frags[0], rx->status); skb->data_len = rx->status; + skb->len += rx->status; i = xennet_fill_frags(np, skb, &tmpq); - /* - * Truesize is the actual allocation size, even if the - * allocation is only partially used. - */ - skb->truesize += PAGE_SIZE * skb_shinfo(skb)->nr_frags; - skb->len += skb->data_len; - if (rx->flags & XEN_NETRXF_csum_blank) skb->ip_summed = CHECKSUM_PARTIAL; else if (rx->flags & XEN_NETRXF_data_validated) -- cgit v1.2.3 From 885291761dba2bfe04df4c0f7bb75e4c920ab82e Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 18 Jul 2013 10:55:15 +0800 Subject: tuntap: do not zerocopy if iov needs more pages than MAX_SKB_FRAGS We try to linearize part of the skb when the number of iov is greater than MAX_SKB_FRAGS. This is not enough since each single vector may occupy more than one pages, so zerocopy_sg_fromiovec() may still fail and may break the guest network. Solve this problem by calculate the pages needed for iov before trying to do zerocopy and switch to use copy instead of zerocopy if it needs more than MAX_SKB_FRAGS. This is done through introducing a new helper to count the pages for iov, and call uarg->callback() manually when switching from zerocopy to copy to notify vhost. We can do further optimization on top. The bug were introduced from commit 0690899b4d4501b3505be069b9a687e68ccbe15b (tun: experimental zero copy tx support) Cc: Michael S. Tsirkin Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/tun.c | 62 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 24 deletions(-) (limited to 'drivers') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 5cdcf92eb310..db690a372260 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1035,6 +1035,29 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, return 0; } +static unsigned long iov_pages(const struct iovec *iv, int offset, + unsigned long nr_segs) +{ + unsigned long seg, base; + int pages = 0, len, size; + + while (nr_segs && (offset >= iv->iov_len)) { + offset -= iv->iov_len; + ++iv; + --nr_segs; + } + + for (seg = 0; seg < nr_segs; seg++) { + base = (unsigned long)iv[seg].iov_base + offset; + len = iv[seg].iov_len - offset; + size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; + pages += size; + offset = 0; + } + + return pages; +} + /* Get packet from user space buffer */ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, void *msg_control, const struct iovec *iv, @@ -1082,32 +1105,18 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, return -EINVAL; } - if (msg_control) - zerocopy = true; - - if (zerocopy) { - /* Userspace may produce vectors with count greater than - * MAX_SKB_FRAGS, so we need to linearize parts of the skb - * to let the rest of data to be fit in the frags. - */ - if (count > MAX_SKB_FRAGS) { - copylen = iov_length(iv, count - MAX_SKB_FRAGS); - if (copylen < offset) - copylen = 0; - else - copylen -= offset; - } else - copylen = 0; - /* There are 256 bytes to be copied in skb, so there is enough - * room for skb expand head in case it is used. + if (msg_control) { + /* There are 256 bytes to be copied in skb, so there is + * enough room for skb expand head in case it is used. * The rest of the buffer is mapped from userspace. */ - if (copylen < gso.hdr_len) - copylen = gso.hdr_len; - if (!copylen) - copylen = GOODCOPY_LEN; + copylen = gso.hdr_len ? gso.hdr_len : GOODCOPY_LEN; linear = copylen; - } else { + if (iov_pages(iv, offset + copylen, count) <= MAX_SKB_FRAGS) + zerocopy = true; + } + + if (!zerocopy) { copylen = len; linear = gso.hdr_len; } @@ -1121,8 +1130,13 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, if (zerocopy) err = zerocopy_sg_from_iovec(skb, iv, offset, count); - else + else { err = skb_copy_datagram_from_iovec(skb, 0, iv, offset, len); + if (!err && msg_control) { + struct ubuf_info *uarg = msg_control; + uarg->callback(uarg, false); + } + } if (err) { tun->dev->stats.rx_dropped++; -- cgit v1.2.3 From ece793fcfc417b3925844be88a6a6dc82ae8f7c6 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 18 Jul 2013 10:55:16 +0800 Subject: macvtap: do not zerocopy if iov needs more pages than MAX_SKB_FRAGS We try to linearize part of the skb when the number of iov is greater than MAX_SKB_FRAGS. This is not enough since each single vector may occupy more than one pages, so zerocopy_sg_fromiovec() may still fail and may break the guest network. Solve this problem by calculate the pages needed for iov before trying to do zerocopy and switch to use copy instead of zerocopy if it needs more than MAX_SKB_FRAGS. This is done through introducing a new helper to count the pages for iov, and call uarg->callback() manually when switching from zerocopy to copy to notify vhost. We can do further optimization on top. This bug were introduced from b92946e2919134ebe2a4083e4302236295ea2a73 (macvtap: zerocopy: validate vectors before building skb). Cc: Michael S. Tsirkin Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/macvtap.c | 62 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 25 deletions(-) (limited to 'drivers') diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index a7c5654a2e5c..a98fb0ed6aef 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -698,6 +698,28 @@ static int macvtap_skb_to_vnet_hdr(const struct sk_buff *skb, return 0; } +static unsigned long iov_pages(const struct iovec *iv, int offset, + unsigned long nr_segs) +{ + unsigned long seg, base; + int pages = 0, len, size; + + while (nr_segs && (offset >= iv->iov_len)) { + offset -= iv->iov_len; + ++iv; + --nr_segs; + } + + for (seg = 0; seg < nr_segs; seg++) { + base = (unsigned long)iv[seg].iov_base + offset; + len = iv[seg].iov_len - offset; + size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; + pages += size; + offset = 0; + } + + return pages; +} /* Get packet from user space buffer */ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, @@ -744,31 +766,15 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, if (unlikely(count > UIO_MAXIOV)) goto err; - if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) - zerocopy = true; - - if (zerocopy) { - /* Userspace may produce vectors with count greater than - * MAX_SKB_FRAGS, so we need to linearize parts of the skb - * to let the rest of data to be fit in the frags. - */ - if (count > MAX_SKB_FRAGS) { - copylen = iov_length(iv, count - MAX_SKB_FRAGS); - if (copylen < vnet_hdr_len) - copylen = 0; - else - copylen -= vnet_hdr_len; - } - /* There are 256 bytes to be copied in skb, so there is enough - * room for skb expand head in case it is used. - * The rest buffer is mapped from userspace. - */ - if (copylen < vnet_hdr.hdr_len) - copylen = vnet_hdr.hdr_len; - if (!copylen) - copylen = GOODCOPY_LEN; + if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) { + copylen = vnet_hdr.hdr_len ? vnet_hdr.hdr_len : GOODCOPY_LEN; linear = copylen; - } else { + if (iov_pages(iv, vnet_hdr_len + copylen, count) + <= MAX_SKB_FRAGS) + zerocopy = true; + } + + if (!zerocopy) { copylen = len; linear = vnet_hdr.hdr_len; } @@ -780,9 +786,15 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, if (zerocopy) err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count); - else + else { err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len, len); + if (!err && m && m->msg_control) { + struct ubuf_info *uarg = m->msg_control; + uarg->callback(uarg, false); + } + } + if (err) goto err_kfree; -- cgit v1.2.3