From 71ffe9c77dd7a2b62207953091efa8dafec958dd Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 25 Jul 2013 10:37:49 +0200 Subject: netfilter: xt_TCPMSS: fix handling of malformed TCP header and options Make sure the packet has enough room for the TCP header and that it is not malformed. While at it, store tcph->doff*4 in a variable, as it is used several times. This patch also fixes a possible off by one in case of malformed TCP options. Reported-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_TCPMSS.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 7011c71646f0..6113cc7efffc 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -52,7 +52,8 @@ tcpmss_mangle_packet(struct sk_buff *skb, { const struct xt_tcpmss_info *info = par->targinfo; struct tcphdr *tcph; - unsigned int tcplen, i; + int len, tcp_hdrlen; + unsigned int i; __be16 oldval; u16 newmss; u8 *opt; @@ -64,11 +65,14 @@ tcpmss_mangle_packet(struct sk_buff *skb, if (!skb_make_writable(skb, skb->len)) return -1; - tcplen = skb->len - tcphoff; + len = skb->len - tcphoff; + if (len < (int)sizeof(struct tcphdr)) + return -1; + tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); + tcp_hdrlen = tcph->doff * 4; - /* Header cannot be larger than the packet */ - if (tcplen < tcph->doff*4) + if (len < tcp_hdrlen) return -1; if (info->mss == XT_TCPMSS_CLAMP_PMTU) { @@ -87,9 +91,8 @@ tcpmss_mangle_packet(struct sk_buff *skb, newmss = info->mss; opt = (u_int8_t *)tcph; - for (i = sizeof(struct tcphdr); i < tcph->doff*4; i += optlen(opt, i)) { - if (opt[i] == TCPOPT_MSS && tcph->doff*4 - i >= TCPOLEN_MSS && - opt[i+1] == TCPOLEN_MSS) { + for (i = sizeof(struct tcphdr); i <= tcp_hdrlen - TCPOLEN_MSS; i += optlen(opt, i)) { + if (opt[i] == TCPOPT_MSS && opt[i+1] == TCPOLEN_MSS) { u_int16_t oldmss; oldmss = (opt[i+2] << 8) | opt[i+3]; @@ -112,9 +115,10 @@ tcpmss_mangle_packet(struct sk_buff *skb, } /* There is data after the header so the option can't be added - without moving it, and doing so may make the SYN packet - itself too large. Accept the packet unmodified instead. */ - if (tcplen > tcph->doff*4) + * without moving it, and doing so may make the SYN packet + * itself too large. Accept the packet unmodified instead. + */ + if (len > tcp_hdrlen) return 0; /* @@ -143,10 +147,10 @@ tcpmss_mangle_packet(struct sk_buff *skb, newmss = min(newmss, (u16)1220); opt = (u_int8_t *)tcph + sizeof(struct tcphdr); - memmove(opt + TCPOLEN_MSS, opt, tcplen - sizeof(struct tcphdr)); + memmove(opt + TCPOLEN_MSS, opt, len - sizeof(struct tcphdr)); inet_proto_csum_replace2(&tcph->check, skb, - htons(tcplen), htons(tcplen + TCPOLEN_MSS), 1); + htons(len), htons(len + TCPOLEN_MSS), 1); opt[0] = TCPOPT_MSS; opt[1] = TCPOLEN_MSS; opt[2] = (newmss & 0xff00) >> 8; -- cgit v1.2.3 From a206bcb3b02025b23137f3228109d72e0f835c05 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 25 Jul 2013 10:46:46 +0200 Subject: netfilter: xt_TCPOPTSTRIP: fix possible off by one access Fix a possible off by one access since optlen() touches opt[offset+1] unsafely when i == tcp_hdrlen(skb) - 1. This patch replaces tcp_hdrlen() by the local variable tcp_hdrlen that stores the TCP header length, to save some cycles. Reported-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_TCPOPTSTRIP.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c index b68fa191710f..625fa1d636a0 100644 --- a/net/netfilter/xt_TCPOPTSTRIP.c +++ b/net/netfilter/xt_TCPOPTSTRIP.c @@ -38,7 +38,7 @@ tcpoptstrip_mangle_packet(struct sk_buff *skb, struct tcphdr *tcph; u_int16_t n, o; u_int8_t *opt; - int len; + int len, tcp_hdrlen; /* This is a fragment, no TCP header is available */ if (par->fragoff != 0) @@ -52,7 +52,9 @@ tcpoptstrip_mangle_packet(struct sk_buff *skb, return NF_DROP; tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); - if (tcph->doff * 4 > len) + tcp_hdrlen = tcph->doff * 4; + + if (len < tcp_hdrlen) return NF_DROP; opt = (u_int8_t *)tcph; @@ -61,10 +63,10 @@ tcpoptstrip_mangle_packet(struct sk_buff *skb, * Walk through all TCP options - if we find some option to remove, * set all octets to %TCPOPT_NOP and adjust checksum. */ - for (i = sizeof(struct tcphdr); i < tcp_hdrlen(skb); i += optl) { + for (i = sizeof(struct tcphdr); i < tcp_hdrlen - 1; i += optl) { optl = optlen(opt, i); - if (i + optl > tcp_hdrlen(skb)) + if (i + optl > tcp_hdrlen) break; if (!tcpoptstrip_test_bit(info->strip_bmap, opt[i])) -- cgit v1.2.3 From e4d091d7bf787cd303383725b8071d0bae76f981 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 1 Aug 2013 12:36:57 +0300 Subject: netfilter: nfnetlink_{log,queue}: fix information leaks in netlink message These structs have a "_pad" member. Also the "phw" structs have an 8 byte "hw_addr[]" array but sometimes only the first 6 bytes are initialized. Signed-off-by: Dan Carpenter Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_log.c | 6 +++++- net/netfilter/nfnetlink_queue_core.c | 5 ++++- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 962e9792e317..d92cc317bf8b 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -419,6 +419,7 @@ __build_packet_message(struct nfnl_log_net *log, nfmsg->version = NFNETLINK_V0; nfmsg->res_id = htons(inst->group_num); + memset(&pmsg, 0, sizeof(pmsg)); pmsg.hw_protocol = skb->protocol; pmsg.hook = hooknum; @@ -498,7 +499,10 @@ __build_packet_message(struct nfnl_log_net *log, if (indev && skb->dev && skb->mac_header != skb->network_header) { struct nfulnl_msg_packet_hw phw; - int len = dev_parse_header(skb, phw.hw_addr); + int len; + + memset(&phw, 0, sizeof(phw)); + len = dev_parse_header(skb, phw.hw_addr); if (len > 0) { phw.hw_addrlen = htons(len); if (nla_put(inst->skb, NFULA_HWADDR, sizeof(phw), &phw)) diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index 971ea145ab3e..8a703c3dd318 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -463,7 +463,10 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, if (indev && entskb->dev && entskb->mac_header != entskb->network_header) { struct nfqnl_msg_packet_hw phw; - int len = dev_parse_header(entskb, phw.hw_addr); + int len; + + memset(&phw, 0, sizeof(phw)); + len = dev_parse_header(entskb, phw.hw_addr); if (len) { phw.hw_addrlen = htons(len); if (nla_put(skb, NFQA_HWADDR, sizeof(phw), &phw)) -- cgit v1.2.3 From d9af2d67e490b48f0d36f448d34e7bab9425f142 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Mon, 5 Aug 2013 16:47:38 +0200 Subject: net/vmw_vsock/af_vsock.c: drop unneeded semicolon Drop the semicolon at the end of the list_for_each_entry loop header. Signed-off-by: Julia Lawall Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 593071dabd1c..4d9334683f84 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -347,7 +347,7 @@ void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)) for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) { struct vsock_sock *vsk; list_for_each_entry(vsk, &vsock_connected_table[i], - connected_table); + connected_table) fn(sk_vsock(vsk)); } -- cgit v1.2.3 From 0369722f024cd374f74eac6d261014403aa27ea2 Mon Sep 17 00:00:00 2001 From: "nikolay@redhat.com" Date: Sat, 3 Aug 2013 22:07:46 +0200 Subject: vlan: make vlan_dev_real_dev work over stacked vlans Sometimes we might have stacked vlans on top of each other, and we're interested in the first non-vlan real device on the path, so transform vlan_dev_real_dev to go over the stacked vlans and extract the first non-vlan device. Signed-off-by: Nikolay Aleksandrov Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/8021q/vlan_core.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 4a78c4de9f20..6ee48aac776f 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -91,7 +91,12 @@ EXPORT_SYMBOL(__vlan_find_dev_deep); struct net_device *vlan_dev_real_dev(const struct net_device *dev) { - return vlan_dev_priv(dev)->real_dev; + struct net_device *ret = vlan_dev_priv(dev)->real_dev; + + while (is_vlan_dev(ret)) + ret = vlan_dev_priv(ret)->real_dev; + + return ret; } EXPORT_SYMBOL(vlan_dev_real_dev); -- cgit v1.2.3 From 07ce76aa9bcf8bc106a53c67548c5602f1598595 Mon Sep 17 00:00:00 2001 From: "nikolay@redhat.com" Date: Sat, 3 Aug 2013 22:07:47 +0200 Subject: net_sched: make dev_trans_start return vlan's real dev trans_start Vlan devices are LLTX and don't update their own trans_start, so if dev_trans_start has to be called with a vlan device then 0 or a stale value will be returned. Currently the bonding is the only such user, and it's needed for proper arp monitoring when the slaves are vlans. Fix this by extracting the vlan's real device trans_start. Suggested-by: David Miller Signed-off-by: Nikolay Aleksandrov Acked-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 4626cef4b76e..eeb8276d7a89 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -207,15 +208,19 @@ void __qdisc_run(struct Qdisc *q) unsigned long dev_trans_start(struct net_device *dev) { - unsigned long val, res = dev->trans_start; + unsigned long val, res; unsigned int i; + if (is_vlan_dev(dev)) + dev = vlan_dev_real_dev(dev); + res = dev->trans_start; for (i = 0; i < dev->num_tx_queues; i++) { val = netdev_get_tx_queue(dev, i)->trans_start; if (val && time_after(val, res)) res = val; } dev->trans_start = res; + return res; } EXPORT_SYMBOL(dev_trans_start); -- cgit v1.2.3 From 7921895a5e852fc99de347bc0600659997de9298 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 5 Aug 2013 12:49:35 +0200 Subject: net: esp{4,6}: fix potential MTU calculation overflows Commit 91657eafb ("xfrm: take net hdr len into account for esp payload size calculation") introduced a possible interger overflow in esp{4,6}_get_mtu() handlers in case of x->props.mode equals XFRM_MODE_TUNNEL. Thus, the following expression will overflow unsigned int net_adj; ... net_adj = 0; ... return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) - net_adj) & ~(align - 1)) + (net_adj - 2); where (net_adj - 2) would be evaluated as + (0 - 2) in an unsigned context. Fix it by simply removing brackets as those operations here do not need to have special precedence. Signed-off-by: Daniel Borkmann Cc: Benjamin Poirier Cc: Steffen Klassert Acked-by: Benjamin Poirier Signed-off-by: David S. Miller --- net/ipv4/esp4.c | 2 +- net/ipv6/esp6.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index ab3d814bc80a..109ee89f123e 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -477,7 +477,7 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu) } return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) - - net_adj) & ~(align - 1)) + (net_adj - 2); + net_adj) & ~(align - 1)) + net_adj - 2; } static void esp4_err(struct sk_buff *skb, u32 info) diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 40ffd72243a4..aeac0dc3635d 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -425,7 +425,7 @@ static u32 esp6_get_mtu(struct xfrm_state *x, int mtu) net_adj = 0; return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) - - net_adj) & ~(align - 1)) + (net_adj - 2); + net_adj) & ~(align - 1)) + net_adj - 2; } static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, -- cgit v1.2.3 From fc7f8f5c53fdb82d4689e24df3da1a88bc3859f7 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Fri, 2 Aug 2013 19:07:38 +0200 Subject: neighbour: populate neigh_parms on alloc before calling ndo_neigh_setup dev->ndo_neigh_setup() might need some of the values of neigh_parms, so populate them before calling it. Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/core/neighbour.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 9232c68941ab..60533db8b72d 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1441,16 +1441,18 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev, atomic_set(&p->refcnt, 1); p->reachable_time = neigh_rand_reach_time(p->base_reachable_time); + dev_hold(dev); + p->dev = dev; + write_pnet(&p->net, hold_net(net)); + p->sysctl_table = NULL; if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) { + release_net(net); + dev_put(dev); kfree(p); return NULL; } - dev_hold(dev); - p->dev = dev; - write_pnet(&p->net, hold_net(net)); - p->sysctl_table = NULL; write_lock_bh(&tbl->lock); p->next = tbl->parms.next; tbl->parms.next = p; -- cgit v1.2.3 From aab515d7c32a34300312416c50314e755ea6f765 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 5 Aug 2013 11:18:49 -0700 Subject: fib_trie: remove potential out of bound access AddressSanitizer [1] dynamic checker pointed a potential out of bound access in leaf_walk_rcu() We could allocate one more slot in tnode_new() to leave the prefetch() in-place but it looks not worth the pain. Bug added in commit 82cfbb008572b ("[IPV4] fib_trie: iterator recode") [1] : https://code.google.com/p/address-sanitizer/wiki/AddressSanitizerForKernel Reported-by: Andrey Konovalov Signed-off-by: Eric Dumazet Cc: Dmitry Vyukov Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 108a1e9c9eac..3df6d3edb2a1 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -71,7 +71,6 @@ #include #include #include -#include #include #include #include @@ -1761,10 +1760,8 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c) if (!c) continue; - if (IS_LEAF(c)) { - prefetch(rcu_dereference_rtnl(p->child[idx])); + if (IS_LEAF(c)) return (struct leaf *) c; - } /* Rescan start scanning in new node */ p = (struct tnode *) c; -- cgit v1.2.3 From 248ba8ec05a2c3b118c2224e57eb10c128176ab1 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Tue, 6 Aug 2013 00:32:05 +0200 Subject: bridge: don't try to update timers in case of broken MLD queries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we are reading an uninitialized value for the max_delay variable when snooping an MLD query message of invalid length and would update our timers with that. Fixing this by simply ignoring such broken MLD queries (just like we do for IGMP already). This is a regression introduced by: "bridge: disable snooping if there is no querier" (b00589af3b04) Reported-by: Paul Bolle Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 61c5e819380e..08e576ada0b2 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1195,7 +1195,7 @@ static int br_ip6_multicast_query(struct net_bridge *br, max_delay = msecs_to_jiffies(ntohs(mld->mld_maxdelay)); if (max_delay) group = &mld->mld_mca; - } else if (skb->len >= sizeof(*mld2q)) { + } else { if (!pskb_may_pull(skb, sizeof(*mld2q))) { err = -EINVAL; goto out; -- cgit v1.2.3 From 00326ed6442c66021cd4b5e19e80f3e2027d5d42 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 5 Aug 2013 14:10:43 -0400 Subject: SUNRPC: Don't auto-disconnect from the local rpcbind socket There is no need for the kernel to time out the AF_LOCAL connection to the rpcbind socket, and doing so is problematic because when it is time to reconnect, our process may no longer be using the same mount namespace. Reported-by: Nix Signed-off-by: Trond Myklebust Cc: Jeff Layton Cc: stable@vger.kernel.org # 3.9.x --- net/sunrpc/rpcb_clnt.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 3df764dc330c..b0f723227157 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -238,6 +238,14 @@ static int rpcb_create_local_unix(struct net *net) .program = &rpcb_program, .version = RPCBVERS_2, .authflavor = RPC_AUTH_NULL, + /* + * We turn off the idle timeout to prevent the kernel + * from automatically disconnecting the socket. + * Otherwise, we'd have to cache the mount namespace + * of the caller and somehow pass that to the socket + * reconnect code. + */ + .flags = RPC_CLNT_CREATE_NO_IDLE_TIMEOUT, }; struct rpc_clnt *clnt, *clnt4; int result = 0; -- cgit v1.2.3 From 2ed0edf9090bf4afa2c6fc4f38575a85a80d4b20 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 5 Aug 2013 17:10:15 -0700 Subject: tcp: cubic: fix overflow error in bictcp_update() commit 17a6e9f1aa9 ("tcp_cubic: fix clock dependency") added an overflow error in bictcp_update() in following code : /* change the unit from HZ to bictcp_HZ */ t = ((tcp_time_stamp + msecs_to_jiffies(ca->delay_min>>3) - ca->epoch_start) << BICTCP_HZ) / HZ; Because msecs_to_jiffies() being unsigned long, compiler does implicit type promotion. We really want to constrain (tcp_time_stamp - ca->epoch_start) to a signed 32bit value, or else 't' has unexpected high values. This bugs triggers an increase of retransmit rates ~24 days after boot [1], as the high order bit of tcp_time_stamp flips. [1] for hosts with HZ=1000 Big thanks to Van Jacobson for spotting this problem. Diagnosed-by: Van Jacobson Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Yuchung Cheng Cc: Stephen Hemminger Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index a9077f441cb2..b6b591f0a788 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -206,8 +206,8 @@ static u32 cubic_root(u64 a) */ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) { - u64 offs; - u32 delta, t, bic_target, max_cnt; + u32 delta, bic_target, max_cnt; + u64 offs, t; ca->ack_cnt++; /* count the number of ACKs */ @@ -250,9 +250,11 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) * if the cwnd < 1 million packets !!! */ + t = (s32)(tcp_time_stamp - ca->epoch_start); + t += msecs_to_jiffies(ca->delay_min >> 3); /* change the unit from HZ to bictcp_HZ */ - t = ((tcp_time_stamp + msecs_to_jiffies(ca->delay_min>>3) - - ca->epoch_start) << BICTCP_HZ) / HZ; + t <<= BICTCP_HZ; + do_div(t, HZ); if (t < ca->bic_K) /* t - K */ offs = ca->bic_K - t; -- cgit v1.2.3 From 15401946f9b720efdd20bda3ae79725e9c586897 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Tue, 6 Aug 2013 08:44:46 +0800 Subject: bridge: correct the comment for file br_sysfs_br.c br_sysfs_if.c is for sysfs attributes of bridge ports, while br_sysfs_br.c is for sysfs attributes of bridge itself. Correct the comment here. Signed-off-by: Wang Sheng-Hui Signed-off-by: David S. Miller --- net/bridge/br_sysfs_br.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 394bb96b6087..3b9637fb7939 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -1,5 +1,5 @@ /* - * Sysfs attributes of bridge ports + * Sysfs attributes of bridge * Linux ethernet bridge * * Authors: -- cgit v1.2.3 From cd6b423afd3c08b27e1fed52db828ade0addbc6b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 5 Aug 2013 20:05:12 -0700 Subject: tcp: cubic: fix bug in bictcp_acked() While investigating about strange increase of retransmit rates on hosts ~24 days after boot, Van found hystart was disabled if ca->epoch_start was 0, as following condition is true when tcp_time_stamp high order bit is set. (s32)(tcp_time_stamp - ca->epoch_start) < HZ Quoting Van : At initialization & after every loss ca->epoch_start is set to zero so I believe that the above line will turn off hystart as soon as the 2^31 bit is set in tcp_time_stamp & hystart will stay off for 24 days. I think we've observed that cubic's restart is too aggressive without hystart so this might account for the higher drop rate we observe. Diagnosed-by: Van Jacobson Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index b6b591f0a788..b6ae92a51f58 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -416,7 +416,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) return; /* Discard delay samples right after fast recovery */ - if ((s32)(tcp_time_stamp - ca->epoch_start) < HZ) + if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ) return; delay = (rtt_us << 3) / USEC_PER_MSEC; -- cgit v1.2.3 From 786615bc1ce84150ded80daea6bd9f6297f48e73 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 5 Aug 2013 16:04:47 -0400 Subject: SUNRPC: If the rpcbind channel is disconnected, fail the call to unregister If rpcbind causes our connection to the AF_LOCAL socket to close after we've registered a service, then we want to be careful about reconnecting since the mount namespace may have changed. By simply refusing to reconnect the AF_LOCAL socket in the case of unregister, we avoid the need to somehow save the mount namespace. While this may lead to some services not unregistering properly, it should be safe. Signed-off-by: Trond Myklebust Cc: Nix Cc: Jeff Layton Cc: stable@vger.kernel.org # 3.9.x --- include/linux/sunrpc/sched.h | 1 + net/sunrpc/clnt.c | 4 ++++ net/sunrpc/netns.h | 1 + net/sunrpc/rpcb_clnt.c | 40 +++++++++++++++++++++++++++------------- 4 files changed, 33 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 6d870353674a..1821445708d6 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -121,6 +121,7 @@ struct rpc_task_setup { #define RPC_TASK_SOFTCONN 0x0400 /* Fail if can't connect */ #define RPC_TASK_SENT 0x0800 /* message was sent */ #define RPC_TASK_TIMEOUT 0x1000 /* fail with ETIMEDOUT on timeout */ +#define RPC_TASK_NOCONNECT 0x2000 /* return ENOTCONN if not connected */ #define RPC_IS_ASYNC(t) ((t)->tk_flags & RPC_TASK_ASYNC) #define RPC_IS_SWAPPER(t) ((t)->tk_flags & RPC_TASK_SWAPPER) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 74f6a704e374..ecbc4e3d83ad 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1660,6 +1660,10 @@ call_connect(struct rpc_task *task) task->tk_action = call_connect_status; if (task->tk_status < 0) return; + if (task->tk_flags & RPC_TASK_NOCONNECT) { + rpc_exit(task, -ENOTCONN); + return; + } xprt_connect(task); } } diff --git a/net/sunrpc/netns.h b/net/sunrpc/netns.h index 74d948f5d5a1..779742cfc1ff 100644 --- a/net/sunrpc/netns.h +++ b/net/sunrpc/netns.h @@ -23,6 +23,7 @@ struct sunrpc_net { struct rpc_clnt *rpcb_local_clnt4; spinlock_t rpcb_clnt_lock; unsigned int rpcb_users; + unsigned int rpcb_is_af_local : 1; struct mutex gssp_lock; wait_queue_head_t gssp_wq; diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index b0f723227157..1891a1022c17 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -204,13 +204,15 @@ void rpcb_put_local(struct net *net) } static void rpcb_set_local(struct net *net, struct rpc_clnt *clnt, - struct rpc_clnt *clnt4) + struct rpc_clnt *clnt4, + bool is_af_local) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); /* Protected by rpcb_create_local_mutex */ sn->rpcb_local_clnt = clnt; sn->rpcb_local_clnt4 = clnt4; + sn->rpcb_is_af_local = is_af_local ? 1 : 0; smp_wmb(); sn->rpcb_users = 1; dprintk("RPC: created new rpcb local clients (rpcb_local_clnt: " @@ -271,7 +273,7 @@ static int rpcb_create_local_unix(struct net *net) clnt4 = NULL; } - rpcb_set_local(net, clnt, clnt4); + rpcb_set_local(net, clnt, clnt4, true); out: return result; @@ -323,7 +325,7 @@ static int rpcb_create_local_net(struct net *net) clnt4 = NULL; } - rpcb_set_local(net, clnt, clnt4); + rpcb_set_local(net, clnt, clnt4, false); out: return result; @@ -384,13 +386,16 @@ static struct rpc_clnt *rpcb_create(struct net *net, const char *hostname, return rpc_create(&args); } -static int rpcb_register_call(struct rpc_clnt *clnt, struct rpc_message *msg) +static int rpcb_register_call(struct sunrpc_net *sn, struct rpc_clnt *clnt, struct rpc_message *msg, bool is_set) { - int result, error = 0; + int flags = RPC_TASK_NOCONNECT; + int error, result = 0; + if (is_set || !sn->rpcb_is_af_local) + flags = RPC_TASK_SOFTCONN; msg->rpc_resp = &result; - error = rpc_call_sync(clnt, msg, RPC_TASK_SOFTCONN); + error = rpc_call_sync(clnt, msg, flags); if (error < 0) { dprintk("RPC: failed to contact local rpcbind " "server (errno %d).\n", -error); @@ -447,16 +452,19 @@ int rpcb_register(struct net *net, u32 prog, u32 vers, int prot, unsigned short .rpc_argp = &map, }; struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + bool is_set = false; dprintk("RPC: %sregistering (%u, %u, %d, %u) with local " "rpcbind\n", (port ? "" : "un"), prog, vers, prot, port); msg.rpc_proc = &rpcb_procedures2[RPCBPROC_UNSET]; - if (port) + if (port != 0) { msg.rpc_proc = &rpcb_procedures2[RPCBPROC_SET]; + is_set = true; + } - return rpcb_register_call(sn->rpcb_local_clnt, &msg); + return rpcb_register_call(sn, sn->rpcb_local_clnt, &msg, is_set); } /* @@ -469,6 +477,7 @@ static int rpcb_register_inet4(struct sunrpc_net *sn, const struct sockaddr_in *sin = (const struct sockaddr_in *)sap; struct rpcbind_args *map = msg->rpc_argp; unsigned short port = ntohs(sin->sin_port); + bool is_set = false; int result; map->r_addr = rpc_sockaddr2uaddr(sap, GFP_KERNEL); @@ -479,10 +488,12 @@ static int rpcb_register_inet4(struct sunrpc_net *sn, map->r_addr, map->r_netid); msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET]; - if (port) + if (port != 0) { msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET]; + is_set = true; + } - result = rpcb_register_call(sn->rpcb_local_clnt4, msg); + result = rpcb_register_call(sn, sn->rpcb_local_clnt4, msg, is_set); kfree(map->r_addr); return result; } @@ -497,6 +508,7 @@ static int rpcb_register_inet6(struct sunrpc_net *sn, const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sap; struct rpcbind_args *map = msg->rpc_argp; unsigned short port = ntohs(sin6->sin6_port); + bool is_set = false; int result; map->r_addr = rpc_sockaddr2uaddr(sap, GFP_KERNEL); @@ -507,10 +519,12 @@ static int rpcb_register_inet6(struct sunrpc_net *sn, map->r_addr, map->r_netid); msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET]; - if (port) + if (port != 0) { msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET]; + is_set = true; + } - result = rpcb_register_call(sn->rpcb_local_clnt4, msg); + result = rpcb_register_call(sn, sn->rpcb_local_clnt4, msg, is_set); kfree(map->r_addr); return result; } @@ -527,7 +541,7 @@ static int rpcb_unregister_all_protofamilies(struct sunrpc_net *sn, map->r_addr = ""; msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET]; - return rpcb_register_call(sn->rpcb_local_clnt4, msg); + return rpcb_register_call(sn, sn->rpcb_local_clnt4, msg, false); } /** -- cgit v1.2.3 From 3e3be275851bc6fc90bfdcd732cd95563acd982b Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Wed, 7 Aug 2013 02:34:31 +0200 Subject: ipv6: don't stop backtracking in fib6_lookup_1 if subtree does not match In case a subtree did not match we currently stop backtracking and return NULL (root table from fib_lookup). This could yield in invalid routing table lookups when using subtrees. Instead continue to backtrack until a valid subtree or node is found and return this match. Also remove unneeded NULL check. Reported-by: Teco Boot Cc: YOSHIFUJI Hideaki Cc: David Lamparter Cc: Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index bff3d821c7eb..c4ff5bbb45c4 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -993,14 +993,22 @@ static struct fib6_node * fib6_lookup_1(struct fib6_node *root, if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { #ifdef CONFIG_IPV6_SUBTREES - if (fn->subtree) - fn = fib6_lookup_1(fn->subtree, args + 1); + if (fn->subtree) { + struct fib6_node *sfn; + sfn = fib6_lookup_1(fn->subtree, + args + 1); + if (!sfn) + goto backtrack; + fn = sfn; + } #endif - if (!fn || fn->fn_flags & RTN_RTINFO) + if (fn->fn_flags & RTN_RTINFO) return fn; } } - +#ifdef CONFIG_IPV6_SUBTREES +backtrack: +#endif if (fn->fn_flags & RTN_ROOT) break; -- cgit v1.2.3 From 77a482bdb2e68d13fae87541b341905ba70d572b Mon Sep 17 00:00:00 2001 From: Timo Teräs Date: Tue, 6 Aug 2013 13:45:43 +0300 Subject: ip_gre: fix ipgre_header to return correct offset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix ipgre_header() (header_ops->create) to return the correct amount of bytes pushed. Most callers of dev_hard_header() seem to care only if it was success, but af_packet.c uses it as offset to the skb to copy from userspace only once. In practice this fixes packet socket sendto()/sendmsg() to gre tunnels. Regression introduced in c54419321455631079c7d6e60bc732dd0c5914c5 ("GRE: Refactor GRE tunneling code.") Cc: Pravin B Shelar Signed-off-by: Timo Teräs Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 1f6eab66f7ce..8d6939eeb492 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -383,7 +383,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, if (daddr) memcpy(&iph->daddr, daddr, 4); if (iph->daddr) - return t->hlen; + return t->hlen + sizeof(*iph); return -(t->hlen + sizeof(*iph)); } -- cgit v1.2.3 From e11aada32b39a060e26fa4091cb968bd42e3bcbf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 6 Aug 2013 04:35:06 -0700 Subject: net: flow_dissector: add 802.1ad support Same behavior than 802.1q : finds the encapsulated protocol and skip 32bit header. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 00ee068efc1c..b84a1b155bc1 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -65,6 +65,7 @@ ipv6: nhoff += sizeof(struct ipv6hdr); break; } + case __constant_htons(ETH_P_8021AD): case __constant_htons(ETH_P_8021Q): { const struct vlan_hdr *vlan; struct vlan_hdr _vlan; -- cgit v1.2.3 From 288a9376371d425edeeea41a0310922c5fb2092d Mon Sep 17 00:00:00 2001 From: Eliezer Tamir Date: Wed, 7 Aug 2013 11:33:25 +0300 Subject: net: rename busy poll MIB counter Rename mib counter from "low latency" to "busy poll" v1 also moved the counter to the ip MIB (suggested by Shawn Bohrer) Eric Dumazet suggested that the current location is better. So v2 just renames the counter to fit the new naming convention. Signed-off-by: Eliezer Tamir Signed-off-by: David S. Miller --- include/net/busy_poll.h | 2 +- include/uapi/linux/snmp.h | 2 +- net/ipv4/proc.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 8e2dfc106aed..8a358a2c97e6 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -122,7 +122,7 @@ static inline bool sk_busy_loop(struct sock *sk, int nonblock) if (rc > 0) /* local bh are disabled so it is ok to use _BH */ NET_ADD_STATS_BH(sock_net(sk), - LINUX_MIB_LOWLATENCYRXPACKETS, rc); + LINUX_MIB_BUSYPOLLRXPACKETS, rc); } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && !need_resched() && !busy_loop_timeout(end_time)); diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index af0a674cc677..a1356d3b54df 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -253,7 +253,7 @@ enum LINUX_MIB_TCPFASTOPENLISTENOVERFLOW, /* TCPFastOpenListenOverflow */ LINUX_MIB_TCPFASTOPENCOOKIEREQD, /* TCPFastOpenCookieReqd */ LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES, /* TCPSpuriousRtxHostQueues */ - LINUX_MIB_LOWLATENCYRXPACKETS, /* LowLatencyRxPackets */ + LINUX_MIB_BUSYPOLLRXPACKETS, /* BusyPollRxPackets */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 6577a1149a47..463bd1273346 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -273,7 +273,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), - SNMP_MIB_ITEM("LowLatencyRxPackets", LINUX_MIB_LOWLATENCYRXPACKETS), + SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS), SNMP_MIB_SENTINEL }; -- cgit v1.2.3 From 645359930231d5e78fd3296a38b98c1a658a7ade Mon Sep 17 00:00:00 2001 From: Sridhar Samudrala Date: Thu, 8 Aug 2013 15:19:48 -0700 Subject: rtnetlink: Fix inverted check in ndo_dflt_fdb_del() Fix inverted check when deleting an fdb entry. Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 3de740834d1f..82d968527121 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2156,7 +2156,7 @@ int ndo_dflt_fdb_del(struct ndmsg *ndm, /* If aging addresses are supported device will need to * implement its own handler for this. */ - if (ndm->ndm_state & NUD_PERMANENT) { + if (!(ndm->ndm_state & NUD_PERMANENT)) { pr_info("%s: FDB only supports static addresses\n", dev->name); return -EINVAL; } -- cgit v1.2.3 From 356d7d88e088687b6578ca64601b0a2c9d145296 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Fri, 9 Aug 2013 17:21:27 -0700 Subject: netfilter: nf_conntrack: fix tcp_in_window for Fast Open Currently the conntrack checks if the ending sequence of a packet falls within the observed receive window. However it does so even if it has not observe any packet from the remote yet and uses an uninitialized receive window (td_maxwin). If a connection uses Fast Open to send a SYN-data packet which is dropped afterward in the network. The subsequent SYNs retransmits will all fail this check and be discarded, leading to a connection timeout. This is because the SYN retransmit does not contain data payload so end == initial sequence number (isn) + 1 sender->td_end == isn + syn_data_len receiver->td_maxwin == 0 The fix is to only apply this check after td_maxwin is initialized. Reported-by: Michael Chan Signed-off-by: Yuchung Cheng Acked-by: Eric Dumazet Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_tcp.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 7dcc376eea5f..2f8010707d01 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -526,7 +526,7 @@ static bool tcp_in_window(const struct nf_conn *ct, const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple; __u32 seq, ack, sack, end, win, swin; s16 receiver_offset; - bool res; + bool res, in_recv_win; /* * Get the required data from the packet. @@ -649,14 +649,18 @@ static bool tcp_in_window(const struct nf_conn *ct, receiver->td_end, receiver->td_maxend, receiver->td_maxwin, receiver->td_scale); + /* Is the ending sequence in the receive window (if available)? */ + in_recv_win = !receiver->td_maxwin || + after(end, sender->td_end - receiver->td_maxwin - 1); + pr_debug("tcp_in_window: I=%i II=%i III=%i IV=%i\n", before(seq, sender->td_maxend + 1), - after(end, sender->td_end - receiver->td_maxwin - 1), + (in_recv_win ? 1 : 0), before(sack, receiver->td_end + 1), after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)); if (before(seq, sender->td_maxend + 1) && - after(end, sender->td_end - receiver->td_maxwin - 1) && + in_recv_win && before(sack, receiver->td_end + 1) && after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) { /* @@ -725,7 +729,7 @@ static bool tcp_in_window(const struct nf_conn *ct, nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: %s ", before(seq, sender->td_maxend + 1) ? - after(end, sender->td_end - receiver->td_maxwin - 1) ? + in_recv_win ? before(sack, receiver->td_end + 1) ? after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG" : "ACK is under the lower bound (possible overly delayed ACK)" -- cgit v1.2.3 From 9d2c9488cedb666bc8206fbdcdc1575e0fbc5929 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Tue, 6 Aug 2013 20:21:15 +0200 Subject: batman-adv: fix potential kernel paging errors for unicast transmissions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are several functions which might reallocate skb data. Currently some places keep reusing their old ethhdr pointer regardless of whether they became invalid after such a reallocation or not. This potentially leads to kernel paging errors. This patch fixes these by refetching the ethdr pointer after the potential reallocations. Signed-off-by: Linus Lüssing Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/bridge_loop_avoidance.c | 2 ++ net/batman-adv/gateway_client.c | 13 ++++++++++++- net/batman-adv/gateway_client.h | 3 +-- net/batman-adv/soft-interface.c | 9 ++++++++- net/batman-adv/unicast.c | 13 ++++++++++--- 5 files changed, 33 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index e14531f1ce1c..264de88db320 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1529,6 +1529,8 @@ out: * in these cases, the skb is further handled by this function and * returns 1, otherwise it returns 0 and the caller shall further * process the skb. + * + * This call might reallocate skb data. */ int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index f105219f4a4b..7614af31daff 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -508,6 +508,7 @@ out: return 0; } +/* this call might reallocate skb data */ static bool batadv_is_type_dhcprequest(struct sk_buff *skb, int header_len) { int ret = false; @@ -568,6 +569,7 @@ out: return ret; } +/* this call might reallocate skb data */ bool batadv_gw_is_dhcp_target(struct sk_buff *skb, unsigned int *header_len) { struct ethhdr *ethhdr; @@ -619,6 +621,12 @@ bool batadv_gw_is_dhcp_target(struct sk_buff *skb, unsigned int *header_len) if (!pskb_may_pull(skb, *header_len + sizeof(*udphdr))) return false; + + /* skb->data might have been reallocated by pskb_may_pull() */ + ethhdr = (struct ethhdr *)skb->data; + if (ntohs(ethhdr->h_proto) == ETH_P_8021Q) + ethhdr = (struct ethhdr *)(skb->data + VLAN_HLEN); + udphdr = (struct udphdr *)(skb->data + *header_len); *header_len += sizeof(*udphdr); @@ -634,12 +642,14 @@ bool batadv_gw_is_dhcp_target(struct sk_buff *skb, unsigned int *header_len) return true; } +/* this call might reallocate skb data */ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, - struct sk_buff *skb, struct ethhdr *ethhdr) + struct sk_buff *skb) { struct batadv_neigh_node *neigh_curr = NULL, *neigh_old = NULL; struct batadv_orig_node *orig_dst_node = NULL; struct batadv_gw_node *curr_gw = NULL; + struct ethhdr *ethhdr; bool ret, out_of_range = false; unsigned int header_len = 0; uint8_t curr_tq_avg; @@ -648,6 +658,7 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, if (!ret) goto out; + ethhdr = (struct ethhdr *)skb->data; orig_dst_node = batadv_transtable_search(bat_priv, ethhdr->h_source, ethhdr->h_dest); if (!orig_dst_node) diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index 039902dca4a6..1037d75da51f 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -34,7 +34,6 @@ void batadv_gw_node_delete(struct batadv_priv *bat_priv, void batadv_gw_node_purge(struct batadv_priv *bat_priv); int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset); bool batadv_gw_is_dhcp_target(struct sk_buff *skb, unsigned int *header_len); -bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, - struct sk_buff *skb, struct ethhdr *ethhdr); +bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb); #endif /* _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ */ diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 700d0b49742d..0f04e1c302b4 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -180,6 +180,9 @@ static int batadv_interface_tx(struct sk_buff *skb, if (batadv_bla_tx(bat_priv, skb, vid)) goto dropped; + /* skb->data might have been reallocated by batadv_bla_tx() */ + ethhdr = (struct ethhdr *)skb->data; + /* Register the client MAC in the transtable */ if (!is_multicast_ether_addr(ethhdr->h_source)) batadv_tt_local_add(soft_iface, ethhdr->h_source, skb->skb_iif); @@ -220,6 +223,10 @@ static int batadv_interface_tx(struct sk_buff *skb, default: break; } + + /* reminder: ethhdr might have become unusable from here on + * (batadv_gw_is_dhcp_target() might have reallocated skb data) + */ } /* ethernet packet should be broadcasted */ @@ -266,7 +273,7 @@ static int batadv_interface_tx(struct sk_buff *skb, /* unicast packet */ } else { if (atomic_read(&bat_priv->gw_mode) != BATADV_GW_MODE_OFF) { - ret = batadv_gw_out_of_range(bat_priv, skb, ethhdr); + ret = batadv_gw_out_of_range(bat_priv, skb); if (ret) goto dropped; } diff --git a/net/batman-adv/unicast.c b/net/batman-adv/unicast.c index dc8b5d4dd636..688a0419756b 100644 --- a/net/batman-adv/unicast.c +++ b/net/batman-adv/unicast.c @@ -326,7 +326,9 @@ static bool batadv_unicast_push_and_fill_skb(struct sk_buff *skb, int hdr_size, * @skb: the skb containing the payload to encapsulate * @orig_node: the destination node * - * Returns false if the payload could not be encapsulated or true otherwise + * Returns false if the payload could not be encapsulated or true otherwise. + * + * This call might reallocate skb data. */ static bool batadv_unicast_prepare_skb(struct sk_buff *skb, struct batadv_orig_node *orig_node) @@ -343,7 +345,9 @@ static bool batadv_unicast_prepare_skb(struct sk_buff *skb, * @orig_node: the destination node * @packet_subtype: the batman 4addr packet subtype to use * - * Returns false if the payload could not be encapsulated or true otherwise + * Returns false if the payload could not be encapsulated or true otherwise. + * + * This call might reallocate skb data. */ bool batadv_unicast_4addr_prepare_skb(struct batadv_priv *bat_priv, struct sk_buff *skb, @@ -401,7 +405,7 @@ int batadv_unicast_generic_send_skb(struct batadv_priv *bat_priv, struct batadv_neigh_node *neigh_node; int data_len = skb->len; int ret = NET_RX_DROP; - unsigned int dev_mtu; + unsigned int dev_mtu, header_len; /* get routing information */ if (is_multicast_ether_addr(ethhdr->h_dest)) { @@ -429,10 +433,12 @@ find_router: switch (packet_type) { case BATADV_UNICAST: batadv_unicast_prepare_skb(skb, orig_node); + header_len = sizeof(struct batadv_unicast_packet); break; case BATADV_UNICAST_4ADDR: batadv_unicast_4addr_prepare_skb(bat_priv, skb, orig_node, packet_subtype); + header_len = sizeof(struct batadv_unicast_4addr_packet); break; default: /* this function supports UNICAST and UNICAST_4ADDR only. It @@ -441,6 +447,7 @@ find_router: goto out; } + ethhdr = (struct ethhdr *)(skb->data + header_len); unicast_packet = (struct batadv_unicast_packet *)skb->data; /* inform the destination node that we are still missing a correct route -- cgit v1.2.3 From d4cca39d90fca21c04315095de5d0e734e839a8b Mon Sep 17 00:00:00 2001 From: dingtianhong Date: Fri, 9 Aug 2013 17:12:58 +0800 Subject: tipc: avoid possible deadlock while enable and disable bearer We met lockdep warning when enable and disable the bearer for commands such as: tipc-config -netid=1234 -addr=1.1.3 -be=eth:eth0 tipc-config -netid=1234 -addr=1.1.3 -bd=eth:eth0 --------------------------------------------------- [ 327.693595] ====================================================== [ 327.693994] [ INFO: possible circular locking dependency detected ] [ 327.694519] 3.11.0-rc3-wwd-default #4 Tainted: G O [ 327.694882] ------------------------------------------------------- [ 327.695385] tipc-config/5825 is trying to acquire lock: [ 327.695754] (((timer))#2){+.-...}, at: [] del_timer_sync+0x0/0xd0 [ 327.696018] [ 327.696018] but task is already holding lock: [ 327.696018] (&(&b_ptr->lock)->rlock){+.-...}, at: [] bearer_disable+ 0xdd/0x120 [tipc] [ 327.696018] [ 327.696018] which lock already depends on the new lock. [ 327.696018] [ 327.696018] [ 327.696018] the existing dependency chain (in reverse order) is: [ 327.696018] [ 327.696018] -> #1 (&(&b_ptr->lock)->rlock){+.-...}: [ 327.696018] [] validate_chain+0x6dd/0x870 [ 327.696018] [] __lock_acquire+0x3db/0x670 [ 327.696018] [] lock_acquire+0x103/0x130 [ 327.696018] [] _raw_spin_lock_bh+0x41/0x80 [ 327.696018] [] disc_timeout+0x18/0xd0 [tipc] [ 327.696018] [] call_timer_fn+0xda/0x1e0 [ 327.696018] [] run_timer_softirq+0x2a7/0x2d0 [ 327.696018] [] __do_softirq+0x16a/0x2e0 [ 327.696018] [] irq_exit+0xd5/0xe0 [ 327.696018] [] smp_apic_timer_interrupt+0x45/0x60 [ 327.696018] [] apic_timer_interrupt+0x6f/0x80 [ 327.696018] [] arch_cpu_idle+0x1e/0x30 [ 327.696018] [] cpu_idle_loop+0x1fd/0x280 [ 327.696018] [] cpu_startup_entry+0x1e/0x20 [ 327.696018] [] start_secondary+0x89/0x90 [ 327.696018] [ 327.696018] -> #0 (((timer))#2){+.-...}: [ 327.696018] [] check_prev_add+0x43e/0x4b0 [ 327.696018] [] validate_chain+0x6dd/0x870 [ 327.696018] [] __lock_acquire+0x3db/0x670 [ 327.696018] [] lock_acquire+0x103/0x130 [ 327.696018] [] del_timer_sync+0x3d/0xd0 [ 327.696018] [] tipc_disc_delete+0x15/0x30 [tipc] [ 327.696018] [] bearer_disable+0xef/0x120 [tipc] [ 327.696018] [] tipc_disable_bearer+0x2f/0x60 [tipc] [ 327.696018] [] tipc_cfg_do_cmd+0x2e2/0x550 [tipc] [ 327.696018] [] handle_cmd+0x49/0xe0 [tipc] [ 327.696018] [] genl_family_rcv_msg+0x268/0x340 [ 327.696018] [] genl_rcv_msg+0x70/0xd0 [ 327.696018] [] netlink_rcv_skb+0x89/0xb0 [ 327.696018] [] genl_rcv+0x27/0x40 [ 327.696018] [] netlink_unicast+0x15e/0x1b0 [ 327.696018] [] netlink_sendmsg+0x22f/0x400 [ 327.696018] [] __sock_sendmsg+0x66/0x80 [ 327.696018] [] sock_aio_write+0x107/0x120 [ 327.696018] [] do_sync_write+0x7d/0xc0 [ 327.696018] [] vfs_write+0x186/0x190 [ 327.696018] [] SyS_write+0x60/0xb0 [ 327.696018] [] system_call_fastpath+0x16/0x1b [ 327.696018] [ 327.696018] other info that might help us debug this: [ 327.696018] [ 327.696018] Possible unsafe locking scenario: [ 327.696018] [ 327.696018] CPU0 CPU1 [ 327.696018] ---- ---- [ 327.696018] lock(&(&b_ptr->lock)->rlock); [ 327.696018] lock(((timer))#2); [ 327.696018] lock(&(&b_ptr->lock)->rlock); [ 327.696018] lock(((timer))#2); [ 327.696018] [ 327.696018] *** DEADLOCK *** [ 327.696018] [ 327.696018] 5 locks held by tipc-config/5825: [ 327.696018] #0: (cb_lock){++++++}, at: [] genl_rcv+0x18/0x40 [ 327.696018] #1: (genl_mutex){+.+.+.}, at: [] genl_rcv_msg+0xa6/0xd0 [ 327.696018] #2: (config_mutex){+.+.+.}, at: [] tipc_cfg_do_cmd+0x39/ 0x550 [tipc] [ 327.696018] #3: (tipc_net_lock){++.-..}, at: [] tipc_disable_bearer+ 0x18/0x60 [tipc] [ 327.696018] #4: (&(&b_ptr->lock)->rlock){+.-...}, at: [] bearer_disable+0xdd/0x120 [tipc] [ 327.696018] [ 327.696018] stack backtrace: [ 327.696018] CPU: 2 PID: 5825 Comm: tipc-config Tainted: G O 3.11.0-rc3-wwd- default #4 [ 327.696018] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007 [ 327.696018] 00000000ffffffff ffff880037fa77a8 ffffffff814d03dd 0000000000000000 [ 327.696018] ffff880037fa7808 ffff880037fa77e8 ffffffff810b1c4f 0000000037fa77e8 [ 327.696018] ffff880037fa7808 ffff880037e4db40 0000000000000000 ffff880037e4e318 [ 327.696018] Call Trace: [ 327.696018] [] dump_stack+0x4d/0xa0 [ 327.696018] [] print_circular_bug+0x10f/0x120 [ 327.696018] [] check_prev_add+0x43e/0x4b0 [ 327.696018] [] validate_chain+0x6dd/0x870 [ 327.696018] [] ? sched_clock_cpu+0xd8/0x110 [ 327.696018] [] __lock_acquire+0x3db/0x670 [ 327.696018] [] lock_acquire+0x103/0x130 [ 327.696018] [] ? try_to_del_timer_sync+0x70/0x70 [ 327.696018] [] del_timer_sync+0x3d/0xd0 [ 327.696018] [] ? try_to_del_timer_sync+0x70/0x70 [ 327.696018] [] tipc_disc_delete+0x15/0x30 [tipc] [ 327.696018] [] bearer_disable+0xef/0x120 [tipc] [ 327.696018] [] tipc_disable_bearer+0x2f/0x60 [tipc] [ 327.696018] [] tipc_cfg_do_cmd+0x2e2/0x550 [tipc] [ 327.696018] [] ? security_capable+0x13/0x20 [ 327.696018] [] handle_cmd+0x49/0xe0 [tipc] [ 327.696018] [] genl_family_rcv_msg+0x268/0x340 [ 327.696018] [] genl_rcv_msg+0x70/0xd0 [ 327.696018] [] ? genl_lock+0x20/0x20 [ 327.696018] [] netlink_rcv_skb+0x89/0xb0 [ 327.696018] [] ? genl_rcv+0x18/0x40 [ 327.696018] [] genl_rcv+0x27/0x40 [ 327.696018] [] netlink_unicast+0x15e/0x1b0 [ 327.696018] [] ? memcpy_fromiovec+0x6c/0x90 [ 327.696018] [] netlink_sendmsg+0x22f/0x400 [ 327.696018] [] __sock_sendmsg+0x66/0x80 [ 327.696018] [] sock_aio_write+0x107/0x120 [ 327.696018] [] ? release_sock+0x8c/0xa0 [ 327.696018] [] do_sync_write+0x7d/0xc0 [ 327.696018] [] ? rw_verify_area+0x54/0x100 [ 327.696018] [] vfs_write+0x186/0x190 [ 327.696018] [] SyS_write+0x60/0xb0 [ 327.696018] [] system_call_fastpath+0x16/0x1b ----------------------------------------------------------------------- The problem is that the tipc_link_delete() will cancel the timer disc_timeout() when the b_ptr->lock is hold, but the disc_timeout() still call b_ptr->lock to finish the work, so the dead lock occurs. We should unlock the b_ptr->lock when del the disc_timeout(). Remove link_timeout() still met the same problem, the patch: http://article.gmane.org/gmane.network.tipc.general/4380 fix the problem, so no need to send patch for fix link_timeout() deadlock warming. Signed-off-by: Wang Weidong Signed-off-by: Ding Tianhong Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/bearer.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index cb29ef7ba2f0..609c30c80816 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -460,6 +460,7 @@ static void bearer_disable(struct tipc_bearer *b_ptr) { struct tipc_link *l_ptr; struct tipc_link *temp_l_ptr; + struct tipc_link_req *temp_req; pr_info("Disabling bearer <%s>\n", b_ptr->name); spin_lock_bh(&b_ptr->lock); @@ -468,9 +469,13 @@ static void bearer_disable(struct tipc_bearer *b_ptr) list_for_each_entry_safe(l_ptr, temp_l_ptr, &b_ptr->links, link_list) { tipc_link_delete(l_ptr); } - if (b_ptr->link_req) - tipc_disc_delete(b_ptr->link_req); + temp_req = b_ptr->link_req; + b_ptr->link_req = NULL; spin_unlock_bh(&b_ptr->lock); + + if (temp_req) + tipc_disc_delete(temp_req); + memset(b_ptr, 0, sizeof(struct tipc_bearer)); } -- cgit v1.2.3 From ac4f9599362475662efb6efbb334cbcec98d4778 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 9 Aug 2013 15:09:08 +0200 Subject: net: sctp: sctp_assoc_control_transport: fix MTU size in SCTP_PF state The SCTP Quick failover draft [1] section 5.1, point 5 says that the cwnd should be 1 MTU. So, instead of 1, set it to 1 MTU. [1] https://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05 Reported-by: Karl Heiss Signed-off-by: Daniel Borkmann Cc: Neil Horman Acked-by: Vlad Yasevich Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/associola.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/associola.c b/net/sctp/associola.c index bce5b79662a6..ab67efc64b24 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -846,12 +846,12 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, else spc_state = SCTP_ADDR_AVAILABLE; /* Don't inform ULP about transition from PF to - * active state and set cwnd to 1, see SCTP + * active state and set cwnd to 1 MTU, see SCTP * Quick failover draft section 5.1, point 5 */ if (transport->state == SCTP_PF) { ulp_notify = false; - transport->cwnd = 1; + transport->cwnd = asoc->pathmtu; } transport->state = SCTP_ACTIVE; break; -- cgit v1.2.3 From 771085d6bf3c52de29fc213e5bad07a82e57c23e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 9 Aug 2013 16:25:21 +0200 Subject: net: sctp: sctp_transport_destroy{, _rcu}: fix potential pointer corruption Probably this one is quite unlikely to be triggered, but it's more safe to do the call_rcu() at the end after we have dropped the reference on the asoc and freed sctp packet chunks. The reason why is because in sctp_transport_destroy_rcu() the transport is being kfree()'d, and if we're unlucky enough we could run into corrupted pointers. Probably that's more of theoretical nature, but it's safer to have this simple fix. Introduced by commit 8c98653f ("sctp: sctp_close: fix release of bindings for deferred call_rcu's"). I also did the 8c98653f regression test and it's fine that way. Signed-off-by: Daniel Borkmann Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/transport.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/transport.c b/net/sctp/transport.c index bdbbc3fd7c14..8fdd16046d66 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -181,12 +181,12 @@ static void sctp_transport_destroy(struct sctp_transport *transport) return; } - call_rcu(&transport->rcu, sctp_transport_destroy_rcu); - sctp_packet_free(&transport->packet); if (transport->asoc) sctp_association_put(transport->asoc); + + call_rcu(&transport->rcu, sctp_transport_destroy_rcu); } /* Start T3_rtx timer if it is not already running and update the heartbeat -- cgit v1.2.3 From 58ad436fcf49810aa006016107f494c9ac9013db Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 13 Aug 2013 09:04:05 +0200 Subject: genetlink: fix family dump race When dumping generic netlink families, only the first dump call is locked with genl_lock(), which protects the list of families, and thus subsequent calls can access the data without locking, racing against family addition/removal. This can cause a crash. Fix it - the locking needs to be conditional because the first time around it's already locked. A similar bug was reported to me on an old kernel (3.4.47) but the exact scenario that happened there is no longer possible, on those kernels the first round wasn't locked either. Looking at the current code I found the race described above, which had also existed on the old kernel. Cc: stable@vger.kernel.org Reported-by: Andrei Otcheretianski Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 512718adb0d5..f85f8a2ad6cf 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -789,6 +789,10 @@ static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb) struct net *net = sock_net(skb->sk); int chains_to_skip = cb->args[0]; int fams_to_skip = cb->args[1]; + bool need_locking = chains_to_skip || fams_to_skip; + + if (need_locking) + genl_lock(); for (i = chains_to_skip; i < GENL_FAM_TAB_SIZE; i++) { n = 0; @@ -810,6 +814,9 @@ errout: cb->args[0] = i; cb->args[1] = n; + if (need_locking) + genl_unlock(); + return skb->len; } -- cgit v1.2.3 From 4221f40513233fa8edeef7fc82e44163fde03b9b Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Tue, 13 Aug 2013 01:41:06 -0700 Subject: ip_tunnel: Do not use inner ip-header-id for tunnel ip-header-id. Using inner-id for tunnel id is not safe in some rare cases. E.g. packets coming from multiple sources entering same tunnel can have same id. Therefore on tunnel packet receive we could have packets from two different stream but with same source and dst IP with same ip-id which could confuse ip packet reassembly. Following patch reverts optimization from commit 490ab08127 (IP_GRE: Fix IP-Identification.) CC: Jarno Rajahalme CC: Ansis Atteka Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 14 -------------- net/ipv4/ip_tunnel_core.c | 4 +--- 2 files changed, 1 insertion(+), 17 deletions(-) (limited to 'net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 781b3cf86a2f..a354db5b7662 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -145,20 +145,6 @@ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph, return INET_ECN_encapsulate(tos, inner); } -static inline void tunnel_ip_select_ident(struct sk_buff *skb, - const struct iphdr *old_iph, - struct dst_entry *dst) -{ - struct iphdr *iph = ip_hdr(skb); - - /* Use inner packet iph-id if possible. */ - if (skb->protocol == htons(ETH_P_IP) && old_iph->id) - iph->id = old_iph->id; - else - __ip_select_ident(iph, dst, - (skb_shinfo(skb)->gso_segs ?: 1) - 1); -} - int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto); int iptunnel_xmit(struct net *net, struct rtable *rt, struct sk_buff *skb, diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 7167b08977df..850525b34899 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -76,9 +76,7 @@ int iptunnel_xmit(struct net *net, struct rtable *rt, iph->daddr = dst; iph->saddr = src; iph->ttl = ttl; - tunnel_ip_select_ident(skb, - (const struct iphdr *)skb_inner_network_header(skb), - &rt->dst); + __ip_select_ident(iph, &rt->dst, (skb_shinfo(skb)->gso_segs ?: 1) - 1); err = ip_local_out(skb); if (unlikely(net_xmit_eval(err))) -- cgit v1.2.3 From 3e805ad288c524bb65aad3f1e004402223d3d504 Mon Sep 17 00:00:00 2001 From: Asbjoern Sloth Toennesen Date: Mon, 12 Aug 2013 16:30:09 +0000 Subject: rtnetlink: rtnl_bridge_getlink: Call nlmsg_find_attr() with ifinfomsg header Fix the iproute2 command `bridge vlan show`, after switching from rtgenmsg to ifinfomsg. Let's start with a little history: Feb 20: Vlad Yasevich got his VLAN-aware bridge patchset included in the 3.9 merge window. In the kernel commit 6cbdceeb, he added attribute support to bridge GETLINK requests sent with rtgenmsg. Mar 6th: Vlad got this iproute2 reference implementation of the bridge vlan netlink interface accepted (iproute2 9eff0e5c) Apr 25th: iproute2 switched from using rtgenmsg to ifinfomsg (63338dca) http://patchwork.ozlabs.org/patch/239602/ http://marc.info/?t=136680900700007 Apr 28th: Linus released 3.9 Apr 30th: Stephen released iproute2 3.9.0 The `bridge vlan show` command haven't been working since the switch to ifinfomsg, or in a released version of iproute2. Since the kernel side only supports rtgenmsg, which iproute2 switched away from just prior to the iproute2 3.9.0 release. I haven't been able to find any documentation, about neither rtgenmsg nor ifinfomsg, and in which situation to use which, but kernel commit 88c5b5ce seams to suggest that ifinfomsg should be used. Fixing this in kernel will break compatibility, but I doubt that anybody have been using it due to this bug in the user space reference implementation, at least not without noticing this bug. That said the functionality is still fully functional in 3.9, when reversing iproute2 commit 63338dca. This could also be fixed in iproute2, but thats an ugly patch that would reintroduce rtgenmsg in iproute2, and from searching in netdev it seams like rtgenmsg usage is discouraged. I'm assuming that the only reason that Vlad implemented the kernel side to use rtgenmsg, was because iproute2 was using it at the time. Signed-off-by: Asbjoern Sloth Toennesen Reviewed-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 82d968527121..ca198c1d1d30 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2384,7 +2384,7 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) struct nlattr *extfilt; u32 filter_mask = 0; - extfilt = nlmsg_find_attr(cb->nlh, sizeof(struct rtgenmsg), + extfilt = nlmsg_find_attr(cb->nlh, sizeof(struct ifinfomsg), IFLA_EXT_MASK); if (extfilt) filter_mask = nla_get_u32(extfilt); -- cgit v1.2.3 From 30444e981ba28e892c439017fbc011d867f02a7d Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Mon, 13 May 2013 08:41:06 -0700 Subject: openvswitch: Fix bad merge resolution. git silently included an extra hunk in vport_cmd_set() during automatic merging. This code is unreachable so it does not actually introduce a problem but it is clearly incorrect. Signed-off-by: Jesse Gross --- net/openvswitch/datapath.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index f7e3a0d84c40..f2ed7600084e 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -2076,9 +2076,6 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) ovs_notify(reply, info, &ovs_dp_vport_multicast_group); return 0; - rtnl_unlock(); - return 0; - exit_free: kfree_skb(reply); exit_unlock: -- cgit v1.2.3 From 42415c90ceaf50c792e29823e359463bc6d4ee05 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Tue, 30 Jul 2013 15:44:14 -0700 Subject: openvswitch: Use correct type while allocating flex array. Flex array is used to allocate hash buckets which is type struct hlist_head, but we use `struct hlist_head *` to calculate array size. Since hlist_head is of size pointer it works fine. Following patch use correct type. Signed-off-by: Pravin B Shelar Signed-off-by: Jesse Gross --- net/openvswitch/flow.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 5c519b121e1b..1aa84dc58777 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -240,7 +240,7 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets) struct flex_array *buckets; int i, err; - buckets = flex_array_alloc(sizeof(struct hlist_head *), + buckets = flex_array_alloc(sizeof(struct hlist_head), n_buckets, GFP_KERNEL); if (!buckets) return NULL; -- cgit v1.2.3 From 36bf5cc66d60868bcc10aff209defed5a7b71c1d Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 14 Aug 2013 15:50:36 -0700 Subject: openvswitch: Reset tunnel key between input and output. It doesn't make sense to output a tunnel packet using the same parameters that it was received with since that will generally just result in the packet going back to us. As a result, userspace assumes that the tunnel key is cleared when transitioning through the switch. In the majority of cases this doesn't matter since a packet is either going to a tunnel port (in which the key is overwritten with new values) or to a non-tunnel port (in which case the key is ignored). However, it's theoreticaly possible that userspace could rely on the documented behavior, so this corrects it. Signed-off-by: Jesse Gross --- net/openvswitch/actions.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 22c5f399f1cf..ab101f715447 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -535,6 +535,7 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb) { struct sw_flow_actions *acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); + OVS_CB(skb)->tun_key = NULL; return do_execute_actions(dp, skb, acts->actions, acts->actions_len, false); } -- cgit v1.2.3 From 8a8e3d84b1719a56f9151909e80ea6ebc5b8e318 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 14 Aug 2013 23:47:11 +0200 Subject: net_sched: restore "linklayer atm" handling commit 56b765b79 ("htb: improved accuracy at high rates") broke the "linklayer atm" handling. tc class add ... htb rate X ceil Y linklayer atm The linklayer setting is implemented by modifying the rate table which is send to the kernel. No direct parameter were transferred to the kernel indicating the linklayer setting. The commit 56b765b79 ("htb: improved accuracy at high rates") removed the use of the rate table system. To keep compatible with older iproute2 utils, this patch detects the linklayer by parsing the rate table. It also supports future versions of iproute2 to send this linklayer parameter to the kernel directly. This is done by using the __reserved field in struct tc_ratespec, to convey the choosen linklayer option, but only using the lower 4 bits of this field. Linklayer detection is limited to speeds below 100Mbit/s, because at high rates the rtab is gets too inaccurate, so bad that several fields contain the same values, this resembling the ATM detect. Fields even start to contain "0" time to send, e.g. at 1000Mbit/s sending a 96 bytes packet cost "0", thus the rtab have been more broken than we first realized. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/sch_generic.h | 9 ++++++++- include/uapi/linux/pkt_sched.h | 10 +++++++++- net/sched/sch_api.c | 41 +++++++++++++++++++++++++++++++++++++++++ net/sched/sch_generic.c | 1 + net/sched/sch_htb.c | 13 +++++++++++++ 5 files changed, 72 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 6eab63363e59..e5ae0c50fa9c 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -683,13 +683,19 @@ struct psched_ratecfg { u64 rate_bytes_ps; /* bytes per second */ u32 mult; u16 overhead; + u8 linklayer; u8 shift; }; static inline u64 psched_l2t_ns(const struct psched_ratecfg *r, unsigned int len) { - return ((u64)(len + r->overhead) * r->mult) >> r->shift; + len += r->overhead; + + if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) + return ((u64)(DIV_ROUND_UP(len,48)*53) * r->mult) >> r->shift; + + return ((u64)len * r->mult) >> r->shift; } extern void psched_ratecfg_precompute(struct psched_ratecfg *r, const struct tc_ratespec *conf); @@ -700,6 +706,7 @@ static inline void psched_ratecfg_getrate(struct tc_ratespec *res, memset(res, 0, sizeof(*res)); res->rate = r->rate_bytes_ps; res->overhead = r->overhead; + res->linklayer = (r->linklayer & TC_LINKLAYER_MASK); } #endif diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index dbd71b0c7d8c..09d62b9228ff 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -73,9 +73,17 @@ struct tc_estimator { #define TC_H_ROOT (0xFFFFFFFFU) #define TC_H_INGRESS (0xFFFFFFF1U) +/* Need to corrospond to iproute2 tc/tc_core.h "enum link_layer" */ +enum tc_link_layer { + TC_LINKLAYER_UNAWARE, /* Indicate unaware old iproute2 util */ + TC_LINKLAYER_ETHERNET, + TC_LINKLAYER_ATM, +}; +#define TC_LINKLAYER_MASK 0x0F /* limit use to lower 4 bits */ + struct tc_ratespec { unsigned char cell_log; - unsigned char __reserved; + __u8 linklayer; /* lower 4 bits */ unsigned short overhead; short cell_align; unsigned short mpu; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 281c1bded1f6..51b968d3febb 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -285,6 +285,45 @@ static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind) return q; } +/* The linklayer setting were not transferred from iproute2, in older + * versions, and the rate tables lookup systems have been dropped in + * the kernel. To keep backward compatible with older iproute2 tc + * utils, we detect the linklayer setting by detecting if the rate + * table were modified. + * + * For linklayer ATM table entries, the rate table will be aligned to + * 48 bytes, thus some table entries will contain the same value. The + * mpu (min packet unit) is also encoded into the old rate table, thus + * starting from the mpu, we find low and high table entries for + * mapping this cell. If these entries contain the same value, when + * the rate tables have been modified for linklayer ATM. + * + * This is done by rounding mpu to the nearest 48 bytes cell/entry, + * and then roundup to the next cell, calc the table entry one below, + * and compare. + */ +static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab) +{ + int low = roundup(r->mpu, 48); + int high = roundup(low+1, 48); + int cell_low = low >> r->cell_log; + int cell_high = (high >> r->cell_log) - 1; + + /* rtab is too inaccurate at rates > 100Mbit/s */ + if ((r->rate > (100000000/8)) || (rtab[0] == 0)) { + pr_debug("TC linklayer: Giving up ATM detection\n"); + return TC_LINKLAYER_ETHERNET; + } + + if ((cell_high > cell_low) && (cell_high < 256) + && (rtab[cell_low] == rtab[cell_high])) { + pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n", + cell_low, cell_high, rtab[cell_high]); + return TC_LINKLAYER_ATM; + } + return TC_LINKLAYER_ETHERNET; +} + static struct qdisc_rate_table *qdisc_rtab_list; struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab) @@ -308,6 +347,8 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *ta rtab->rate = *r; rtab->refcnt = 1; memcpy(rtab->data, nla_data(tab), 1024); + if (r->linklayer == TC_LINKLAYER_UNAWARE) + r->linklayer = __detect_linklayer(r, rtab->data); rtab->next = qdisc_rtab_list; qdisc_rtab_list = rtab; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index eeb8276d7a89..48be3d5c0d92 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -909,6 +909,7 @@ void psched_ratecfg_precompute(struct psched_ratecfg *r, memset(r, 0, sizeof(*r)); r->overhead = conf->overhead; r->rate_bytes_ps = conf->rate; + r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK); r->mult = 1; /* * The deal here is to replace a divide by a reciprocal one diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 45e751527dfc..c2178b15ca6e 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1329,6 +1329,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = (struct htb_class *)*arg, *parent; struct nlattr *opt = tca[TCA_OPTIONS]; + struct qdisc_rate_table *rtab = NULL, *ctab = NULL; struct nlattr *tb[TCA_HTB_MAX + 1]; struct tc_htb_opt *hopt; @@ -1350,6 +1351,18 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (!hopt->rate.rate || !hopt->ceil.rate) goto failure; + /* Keeping backward compatible with rate_table based iproute2 tc */ + if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE) { + rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]); + if (rtab) + qdisc_put_rtab(rtab); + } + if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE) { + ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]); + if (ctab) + qdisc_put_rtab(ctab); + } + if (!cl) { /* new class */ struct Qdisc *new_q; int prio; -- cgit v1.2.3