From fd6055a806edc4019be1b9fb7d25262599bca5b1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Aug 2017 09:39:28 -0700 Subject: udp: on peeking bad csum, drop packets even if not at head When peeking, if a bad csum is discovered, the skb is unlinked from the queue with __sk_queue_drop_skb and the peek operation restarted. __sk_queue_drop_skb only drops packets that match the queue head. This fails if the skb was found after the head, using SO_PEEK_OFF socket option. This causes an infinite loop. We MUST drop this problematic skb, and we can simply check if skb was already removed by another thread, by looking at skb->next : This pointer is set to NULL by the __skb_unlink() operation, that might have happened only under the spinlock protection. Many thanks to syzkaller team (and particularly Dmitry Vyukov who provided us nice C reproducers exhibiting the lockup) and Willem de Bruijn who provided first version for this patch and a test program. Fixes: 627d2d6b5500 ("udp: enable MSG_PEEK at non-zero offset") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: Willem de Bruijn Acked-by: Paolo Abeni Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/datagram.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index a21ca8dee5ea..8c2f4489ff8f 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -362,7 +362,7 @@ int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, if (flags & MSG_PEEK) { err = -ENOENT; spin_lock_bh(&sk_queue->lock); - if (skb == skb_peek(sk_queue)) { + if (skb->next) { __skb_unlink(skb, sk_queue); refcount_dec(&skb->users); if (destructor) -- cgit v1.2.3 From cd0a137acbb66208368353723f5f1480995cf1c4 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 22 Aug 2017 15:12:14 -0700 Subject: net: core: Specify skb_pad()/skb_put_padto() SKB freeing Rename skb_pad() into __skb_pad() and make it take a third argument: free_on_error which controls whether kfree_skb() should be called or not, skb_pad() directly makes use of it and passes true to preserve its existing behavior. Do exactly the same thing with __skb_put_padto() and skb_put_padto(). Suggested-by: David Miller Signed-off-by: Florian Fainelli Reviewed-by: Woojung Huh Signed-off-by: David S. Miller --- include/linux/skbuff.h | 41 +++++++++++++++++++++++++++++++++++++---- net/core/skbuff.c | 13 ++++++++----- 2 files changed, 45 insertions(+), 9 deletions(-) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index dbe29b6c9bd6..d67a8182e5eb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -973,7 +973,23 @@ int __must_check skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg int __must_check skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len); int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer); -int skb_pad(struct sk_buff *skb, int pad); +int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error); + +/** + * skb_pad - zero pad the tail of an skb + * @skb: buffer to pad + * @pad: space to pad + * + * Ensure that a buffer is followed by a padding area that is zero + * filled. Used by network drivers which may DMA or transfer data + * beyond the buffer end onto the wire. + * + * May return error in out of memory cases. The skb is freed on error. + */ +static inline int skb_pad(struct sk_buff *skb, int pad) +{ + return __skb_pad(skb, pad, true); +} #define dev_kfree_skb(a) consume_skb(a) int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, @@ -2825,25 +2841,42 @@ static inline int skb_padto(struct sk_buff *skb, unsigned int len) * skb_put_padto - increase size and pad an skbuff up to a minimal size * @skb: buffer to pad * @len: minimal length + * @free_on_error: free buffer on error * * Pads up a buffer to ensure the trailing bytes exist and are * blanked. If the buffer already contains sufficient data it * is untouched. Otherwise it is extended. Returns zero on - * success. The skb is freed on error. + * success. The skb is freed on error if @free_on_error is true. */ -static inline int skb_put_padto(struct sk_buff *skb, unsigned int len) +static inline int __skb_put_padto(struct sk_buff *skb, unsigned int len, + bool free_on_error) { unsigned int size = skb->len; if (unlikely(size < len)) { len -= size; - if (skb_pad(skb, len)) + if (__skb_pad(skb, len, free_on_error)) return -ENOMEM; __skb_put(skb, len); } return 0; } +/** + * skb_put_padto - increase size and pad an skbuff up to a minimal size + * @skb: buffer to pad + * @len: minimal length + * + * Pads up a buffer to ensure the trailing bytes exist and are + * blanked. If the buffer already contains sufficient data it + * is untouched. Otherwise it is extended. Returns zero on + * success. The skb is freed on error. + */ +static inline int skb_put_padto(struct sk_buff *skb, unsigned int len) +{ + return __skb_put_padto(skb, len, true); +} + static inline int skb_add_data(struct sk_buff *skb, struct iov_iter *from, int copy) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f990eb8b30a9..e07556606284 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1363,18 +1363,20 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, EXPORT_SYMBOL(skb_copy_expand); /** - * skb_pad - zero pad the tail of an skb + * __skb_pad - zero pad the tail of an skb * @skb: buffer to pad * @pad: space to pad + * @free_on_error: free buffer on error * * Ensure that a buffer is followed by a padding area that is zero * filled. Used by network drivers which may DMA or transfer data * beyond the buffer end onto the wire. * - * May return error in out of memory cases. The skb is freed on error. + * May return error in out of memory cases. The skb is freed on error + * if @free_on_error is true. */ -int skb_pad(struct sk_buff *skb, int pad) +int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error) { int err; int ntail; @@ -1403,10 +1405,11 @@ int skb_pad(struct sk_buff *skb, int pad) return 0; free_skb: - kfree_skb(skb); + if (free_on_error) + kfree_skb(skb); return err; } -EXPORT_SYMBOL(skb_pad); +EXPORT_SYMBOL(__skb_pad); /** * pskb_put - add data to the tail of a potentially fragmented buffer -- cgit v1.2.3 From 4e458debbb69af0cbde5bd6430d64519d5f59274 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 24 Aug 2017 15:48:21 -0700 Subject: bpf: fix bpf_setsockopts return value This patch fixes a bug causing any sock operations to always return EINVAL. Fixes: a5192c52377e ("bpf: fix to bpf_setsockops"). Reported-by: Neal Cardwell Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Acked-by: Craig Gallek Acked-by: Daniel Borkmann Acked-by: Lawrence Brakmo Signed-off-by: David S. Miller --- net/core/filter.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 6280a602604c..8eb81e5fae08 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2872,7 +2872,6 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, ret = -EINVAL; } } - ret = -EINVAL; #endif } else { ret = -EINVAL; -- cgit v1.2.3 From ebfa00c5745660fe7f0a91eea88d4dff658486c4 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 25 Aug 2017 13:10:12 +0200 Subject: tcp: fix refcnt leak with ebpf congestion control There are a few bugs around refcnt handling in the new BPF congestion control setsockopt: - The new ca is assigned to icsk->icsk_ca_ops even in the case where we cannot get a reference on it. This would lead to a use after free, since that ca is going away soon. - Changing the congestion control case doesn't release the refcnt on the previous ca. - In the reinit case, we first leak a reference on the old ca, then we call tcp_reinit_congestion_control on the ca that we have just assigned, leading to deinitializing the wrong ca (->release of the new ca on the old ca's data) and releasing the refcount on the ca that we actually want to use. This is visible by building (for example) BIC as a module and setting net.ipv4.tcp_congestion_control=bic, and using tcp_cong_kern.c from samples/bpf. This patch fixes the refcount issues, and moves reinit back into tcp core to avoid passing a ca pointer back to BPF. Fixes: 91b5b21c7c16 ("bpf: Add support for changing congestion control") Signed-off-by: Sabrina Dubroca Acked-by: Lawrence Brakmo Signed-off-by: David S. Miller --- include/net/tcp.h | 4 +--- net/core/filter.c | 7 ++----- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_cong.c | 19 ++++++++++++++----- 4 files changed, 18 insertions(+), 14 deletions(-) (limited to 'net/core') diff --git a/include/net/tcp.h b/include/net/tcp.h index ada65e767b28..f642a39f9eee 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1004,9 +1004,7 @@ void tcp_get_default_congestion_control(char *name); void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); -int tcp_set_congestion_control(struct sock *sk, const char *name, bool load); -void tcp_reinit_congestion_control(struct sock *sk, - const struct tcp_congestion_ops *ca); +int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit); u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); diff --git a/net/core/filter.c b/net/core/filter.c index 8eb81e5fae08..169974998c76 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2836,15 +2836,12 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, sk->sk_prot->setsockopt == tcp_setsockopt) { if (optname == TCP_CONGESTION) { char name[TCP_CA_NAME_MAX]; + bool reinit = bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN; strncpy(name, optval, min_t(long, optlen, TCP_CA_NAME_MAX-1)); name[TCP_CA_NAME_MAX-1] = 0; - ret = tcp_set_congestion_control(sk, name, false); - if (!ret && bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN) - /* replacing an existing ca */ - tcp_reinit_congestion_control(sk, - inet_csk(sk)->icsk_ca_ops); + ret = tcp_set_congestion_control(sk, name, false, reinit); } else { struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 71ce33decd97..a3e91b552edc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2481,7 +2481,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, name[val] = 0; lock_sock(sk); - err = tcp_set_congestion_control(sk, name, true); + err = tcp_set_congestion_control(sk, name, true, true); release_sock(sk); return err; } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index fde983f6376b..421ea1b918da 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -189,8 +189,8 @@ void tcp_init_congestion_control(struct sock *sk) INET_ECN_dontxmit(sk); } -void tcp_reinit_congestion_control(struct sock *sk, - const struct tcp_congestion_ops *ca) +static void tcp_reinit_congestion_control(struct sock *sk, + const struct tcp_congestion_ops *ca) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -338,7 +338,7 @@ out: * tcp_reinit_congestion_control (if the current congestion control was * already initialized. */ -int tcp_set_congestion_control(struct sock *sk, const char *name, bool load) +int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; @@ -360,9 +360,18 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load) if (!ca) { err = -ENOENT; } else if (!load) { - icsk->icsk_ca_ops = ca; - if (!try_module_get(ca->owner)) + const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops; + + if (try_module_get(ca->owner)) { + if (reinit) { + tcp_reinit_congestion_control(sk, ca); + } else { + icsk->icsk_ca_ops = ca; + module_put(old_ca->owner); + } + } else { err = -EBUSY; + } } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) { err = -EPERM; -- cgit v1.2.3 From 1e22391e8fbec9c3709bad82b997b108d1c6228b Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 25 Aug 2017 15:04:32 +0200 Subject: net: missing call of trace_napi_poll in busy_poll_stop Noticed that busy_poll_stop() also invoke the drivers napi->poll() function pointer, but didn't have an associated call to trace_napi_poll() like all other call sites. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index ce15a06d5558..818dfa6e7ab5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5289,6 +5289,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) * Ideally, a new ndo_busy_poll_stop() could avoid another round. */ rc = napi->poll(napi, BUSY_POLL_BUDGET); + trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); netpoll_poll_unlock(have_poll_lock); if (rc == BUSY_POLL_BUDGET) __napi_schedule(napi); -- cgit v1.2.3 From 25cc72a33835ed8a6f53180a822cadab855852ac Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 1 Sep 2017 10:52:31 +0200 Subject: mlxsw: spectrum: Forbid linking to devices that have uppers The mlxsw driver relies on NETDEV_CHANGEUPPER events to configure the device in case a port is enslaved to a master netdev such as bridge or bond. Since the driver ignores events unrelated to its ports and their uppers, it's possible to engineer situations in which the device's data path differs from the kernel's. One example to such a situation is when a port is enslaved to a bond that is already enslaved to a bridge. When the bond was enslaved the driver ignored the event - as the bond wasn't one of its uppers - and therefore a bridge port instance isn't created in the device. Until such configurations are supported forbid them by checking that the upper device doesn't have uppers of its own. Fixes: 0d65fc13042f ("mlxsw: spectrum: Implement LAG port join/leave") Signed-off-by: Ido Schimmel Reported-by: Nogah Frankel Tested-by: Nogah Frankel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 6 ++++++ include/linux/netdevice.h | 2 ++ net/core/dev.c | 3 ++- 3 files changed, 10 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 60bf8f27cc00..c6a3e61b53bd 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -4139,6 +4139,8 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *lower_dev, return -EINVAL; if (!info->linking) break; + if (netdev_has_any_upper_dev(upper_dev)) + return -EINVAL; if (netif_is_lag_master(upper_dev) && !mlxsw_sp_master_lag_check(mlxsw_sp, upper_dev, info->upper_info)) @@ -4258,6 +4260,10 @@ static int mlxsw_sp_netdevice_port_vlan_event(struct net_device *vlan_dev, upper_dev = info->upper_dev; if (!netif_is_bridge_master(upper_dev)) return -EINVAL; + if (!info->linking) + break; + if (netdev_has_any_upper_dev(upper_dev)) + return -EINVAL; break; case NETDEV_CHANGEUPPER: upper_dev = info->upper_dev; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 779b23595596..c99ba7914c0a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3866,6 +3866,8 @@ int netdev_walk_all_upper_dev_rcu(struct net_device *dev, bool netdev_has_upper_dev_all_rcu(struct net_device *dev, struct net_device *upper_dev); +bool netdev_has_any_upper_dev(struct net_device *dev); + void *netdev_lower_get_next_private(struct net_device *dev, struct list_head **iter); void *netdev_lower_get_next_private_rcu(struct net_device *dev, diff --git a/net/core/dev.c b/net/core/dev.c index 818dfa6e7ab5..86b4b0a79e7a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5668,12 +5668,13 @@ EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu); * Find out if a device is linked to an upper device and return true in case * it is. The caller must hold the RTNL lock. */ -static bool netdev_has_any_upper_dev(struct net_device *dev) +bool netdev_has_any_upper_dev(struct net_device *dev) { ASSERT_RTNL(); return !list_empty(&dev->adj_list.upper); } +EXPORT_SYMBOL(netdev_has_any_upper_dev); /** * netdev_master_upper_dev_get - Get master upper device -- cgit v1.2.3