From aea9d711f3d68c656ad31ab578ecfb0bb5cd7f97 Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Wed, 9 Jun 2010 16:10:57 +0200 Subject: ipvs: Add missing locking during connection table hashing and unhashing The code that hashes and unhashes connections from the connection table is missing locking of the connection being modified, which opens up a race condition and results in memory corruption when this race condition is hit. Here is what happens in pretty verbose form: CPU 0 CPU 1 ------------ ------------ An active connection is terminated and we schedule ip_vs_conn_expire() on this CPU to expire this connection. IRQ assignment is changed to this CPU, but the expire timer stays scheduled on the other CPU. New connection from same ip:port comes in right before the timer expires, we find the inactive connection in our connection table and get a reference to it. We proper lock the connection in tcp_state_transition() and read the connection flags in set_tcp_state(). ip_vs_conn_expire() gets called, we unhash the connection from our connection table and remove the hashed flag in ip_vs_conn_unhash(), without proper locking! While still holding proper locks we write the connection flags in set_tcp_state() and this sets the hashed flag again. ip_vs_conn_expire() fails to expire the connection, because the other CPU has incremented the reference count. We try to re-insert the connection into our connection table, but this fails in ip_vs_conn_hash(), because the hashed flag has been set by the other CPU. We re-schedule execution of ip_vs_conn_expire(). Now this connection has the hashed flag set, but isn't actually hashed in our connection table and has a dangling list_head. We drop the reference we held on the connection and schedule the expire timer for timeouting the connection on this CPU. Further packets won't be able to find this connection in our connection table. ip_vs_conn_expire() gets called again, we think it's already hashed, but the list_head is dangling and while removing the connection from our connection table we write to the memory location where this list_head points to. The result will probably be a kernel oops at some other point in time. This race condition is pretty subtle, but it can be triggered remotely. It needs the IRQ assignment change or another circumstance where packets coming from the same ip:port for the same service are being processed on different CPUs. And it involves hitting the exact time at which ip_vs_conn_expire() gets called. It can be avoided by making sure that all packets from one connection are always processed on the same CPU and can be made harder to exploit by changing the connection timeouts to some custom values. Signed-off-by: Sven Wegener Cc: stable@kernel.org Acked-by: Simon Horman Signed-off-by: Patrick McHardy --- net/netfilter/ipvs/ip_vs_conn.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index d8f7e8ef67b4..ff04e9edbed6 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -162,6 +162,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); ct_write_lock(hash); + spin_lock(&cp->lock); if (!(cp->flags & IP_VS_CONN_F_HASHED)) { list_add(&cp->c_list, &ip_vs_conn_tab[hash]); @@ -174,6 +175,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) ret = 0; } + spin_unlock(&cp->lock); ct_write_unlock(hash); return ret; @@ -193,6 +195,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); ct_write_lock(hash); + spin_lock(&cp->lock); if (cp->flags & IP_VS_CONN_F_HASHED) { list_del(&cp->c_list); @@ -202,6 +205,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) } else ret = 0; + spin_unlock(&cp->lock); ct_write_unlock(hash); return ret; -- cgit v1.2.3 From e897082fe7a5b591dc4dd5599ac39081a7c8e482 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 13 Jun 2010 10:36:30 +0000 Subject: net: fix deliver_no_wcard regression on loopback device deliver_no_wcard is not being set in skb_copy_header. In the skb_cloned case it is not being cleared and may cause the skb to be dropped when the loopback device pushes it back up the stack. Signed-off-by: John Fastabend Acked-by: Eric Dumazet Tested-by: Markus Trippelsdorf Signed-off-by: David S. Miller --- net/core/skbuff.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9f07e749d7b1..bcf2fa3e0ddc 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -532,6 +532,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->ip_summed = old->ip_summed; skb_copy_queue_mapping(new, old); new->priority = old->priority; + new->deliver_no_wcard = old->deliver_no_wcard; #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) new->ipvs_property = old->ipvs_property; #endif -- cgit v1.2.3 From e8d15e6460cb0eea00f2574a80d94496943403ba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 13 Jun 2010 10:50:46 +0000 Subject: net: rxhash already set in __copy_skb_header No need to copy rxhash again in __skb_clone() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index bcf2fa3e0ddc..34432b4e96bb 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -570,7 +570,6 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) C(len); C(data_len); C(mac_len); - C(rxhash); n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len; n->cloned = 1; n->nohdr = 0; -- cgit v1.2.3 From fed396a585d8e1870b326f2e8e1888a72957abb8 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 15 Jun 2010 21:43:07 -0700 Subject: bridge: Fix OOM crash in deliver_clone The bridge multicast patches introduced an OOM crash in the forward path, when deliver_clone fails to clone the skb. Reported-by: Mark Wagner Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/bridge/br_forward.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index a98ef1393097..a4e72a89e4ff 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -140,10 +140,10 @@ static int deliver_clone(const struct net_bridge_port *prev, void (*__packet_hook)(const struct net_bridge_port *p, struct sk_buff *skb)) { + struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev; + skb = skb_clone(skb, GFP_ATOMIC); if (!skb) { - struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev; - dev->stats.tx_dropped++; return -ENOMEM; } -- cgit v1.2.3 From 021570e55b7152843376b9d9f60624e3e05ac054 Mon Sep 17 00:00:00 2001 From: Christoph Fritz Date: Wed, 16 Jun 2010 16:37:34 +0200 Subject: mac80211: fix warn, enum may be used uninitialized MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit regression introduced by b8d92c9c141ee3dc9b3537b1f0ffb4a54ea8d9b2 In function ‘ieee80211_work_rx_queued_mgmt’: warning: ‘rma’ may be used uninitialized in this function this re-adds default value WORK_ACT_NONE back to rma Signed-off-by: Christoph Fritz Signed-off-by: John W. Linville --- net/mac80211/work.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/work.c b/net/mac80211/work.c index be3d4a698692..b025dc7bb0fd 100644 --- a/net/mac80211/work.c +++ b/net/mac80211/work.c @@ -715,7 +715,7 @@ static void ieee80211_work_rx_queued_mgmt(struct ieee80211_local *local, struct ieee80211_rx_status *rx_status; struct ieee80211_mgmt *mgmt; struct ieee80211_work *wk; - enum work_action rma; + enum work_action rma = WORK_ACT_NONE; u16 fc; rx_status = (struct ieee80211_rx_status *) skb->cb; -- cgit v1.2.3 From fa68a7822780fdc1295f7efb7e4313e62b447e75 Mon Sep 17 00:00:00 2001 From: Tom Hughes Date: Tue, 15 Jun 2010 22:24:28 +0000 Subject: Clear IFF_XMIT_DST_RELEASE for teql interfaces https://bugzilla.kernel.org/show_bug.cgi?id=16183 The sch_teql module, which can be used to load balance over a set of underlying interfaces, stopped working after 2.6.30 and has been broken in all kernels since then for any underlying interface which requires the addition of link level headers. The problem is that the transmit routine relies on being able to access the destination address in the skb in order to do address resolution once it has decided which underlying interface it is going to transmit through. In 2.6.31 the IFF_XMIT_DST_RELEASE flag was introduced, and set by default for all interfaces, which causes the destination address to be released before the transmit routine for the interface is called. The solution is to clear that flag for teql interfaces. Signed-off-by: Tom Hughes Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_teql.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 3415b6ce1c0a..807643bdcbac 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -449,6 +449,7 @@ static __init void teql_master_setup(struct net_device *dev) dev->tx_queue_len = 100; dev->flags = IFF_NOARP; dev->hard_header_len = LL_MAX_HEADER; + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; } static LIST_HEAD(master_dev_list); -- cgit v1.2.3 From 25442e06d20aaba7d7b16438078a562b3e4cf19b Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Tue, 15 Jun 2010 06:14:12 +0000 Subject: bridge: fdb cleanup runs too often It is common in end-node, non STP bridges to set forwarding delay to zero; which causes the forwarding database cleanup to run every clock tick. Change to run only as soon as needed or at next ageing timer interval which ever is sooner. Use round_jiffies_up macro rather than attempting round up by changing value. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_fdb.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 26637439965b..b01dde35a69e 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -128,7 +128,7 @@ void br_fdb_cleanup(unsigned long _data) { struct net_bridge *br = (struct net_bridge *)_data; unsigned long delay = hold_time(br); - unsigned long next_timer = jiffies + br->forward_delay; + unsigned long next_timer = jiffies + br->ageing_time; int i; spin_lock_bh(&br->hash_lock); @@ -149,9 +149,7 @@ void br_fdb_cleanup(unsigned long _data) } spin_unlock_bh(&br->hash_lock); - /* Add HZ/4 to ensure we round the jiffies upwards to be after the next - * timer, otherwise we might round down and will have no-op run. */ - mod_timer(&br->gc_timer, round_jiffies(next_timer + HZ/4)); + mod_timer(&br->gc_timer, round_jiffies_up(next_timer)); } /* Completely flush all dynamic entries in forwarding database.*/ -- cgit v1.2.3 From 26cde9f7e2747b6d254b704594eed87ab959afa5 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 15 Jun 2010 01:52:25 +0000 Subject: udp: Fix bogus UFO packet generation It has been reported that the new UFO software fallback path fails under certain conditions with NFS. I tracked the problem down to the generation of UFO packets that are smaller than the MTU. The software fallback path simply discards these packets. This patch fixes the problem by not generating such packets on the UFO path. Signed-off-by: Herbert Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 9a4a6c96cb0d..041d41df1224 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -873,8 +873,10 @@ int ip_append_data(struct sock *sk, !exthdrlen) csummode = CHECKSUM_PARTIAL; + skb = skb_peek_tail(&sk->sk_write_queue); + inet->cork.length += length; - if (((length> mtu) || !skb_queue_empty(&sk->sk_write_queue)) && + if (((length > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && (rt->u.dst.dev->features & NETIF_F_UFO)) { err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, @@ -892,7 +894,7 @@ int ip_append_data(struct sock *sk, * adding appropriate IP header. */ - if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) + if (!skb) goto alloc_new_skb; while (length > 0) { @@ -1121,7 +1123,8 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, return -EINVAL; inet->cork.length += size; - if ((sk->sk_protocol == IPPROTO_UDP) && + if ((size + skb->len > mtu) && + (sk->sk_protocol == IPPROTO_UDP) && (rt->u.dst.dev->features & NETIF_F_UFO)) { skb_shinfo(skb)->gso_size = mtu - fragheaderlen; skb_shinfo(skb)->gso_type = SKB_GSO_UDP; -- cgit v1.2.3 From b1312c89f0016f778cac4f1536f1434e132f8713 Mon Sep 17 00:00:00 2001 From: Timo Teräs Date: Thu, 24 Jun 2010 14:35:00 -0700 Subject: xfrm: check bundle policy existance before dereferencing it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the bundle validation code to not assume having a valid policy. When we have multiple transformations for a xfrm policy, the bundle instance will be a chain of bundles with only the first one having the policy reference. When policy_genid is bumped it will expire the first bundle in the chain which is equivalent of expiring the whole chain. Reported-bisected-and-tested-by: Justin P. Mattock Signed-off-by: Timo Teräs Signed-off-by: David S. Miller --- net/xfrm/xfrm_policy.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 4bf27d901333..af1c173be4ad 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2300,7 +2300,8 @@ int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first, return 0; if (xdst->xfrm_genid != dst->xfrm->genid) return 0; - if (xdst->policy_genid != atomic_read(&xdst->pols[0]->genid)) + if (xdst->num_pols > 0 && + xdst->policy_genid != atomic_read(&xdst->pols[0]->genid)) return 0; if (strict && fl && -- cgit v1.2.3 From 1a61a83ff59378a5613d8c706c4a660c353b62a8 Mon Sep 17 00:00:00 2001 From: "Gustavo F. Padovan" Date: Fri, 18 Jun 2010 14:24:00 +0000 Subject: Bluetooth: Bring back var 'i' increment commit ff6e2163f28a1094fb5ca5950fe2b43c3cf6bc7a accidentally added a regression on the bnep code. Fixing it. Signed-off-by: Gustavo F. Padovan Signed-off-by: David S. Miller --- net/bluetooth/bnep/netdev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/bluetooth/bnep/netdev.c b/net/bluetooth/bnep/netdev.c index 0faad5ce6dc4..8c100c9dae28 100644 --- a/net/bluetooth/bnep/netdev.c +++ b/net/bluetooth/bnep/netdev.c @@ -104,6 +104,8 @@ static void bnep_net_set_mc_list(struct net_device *dev) break; memcpy(__skb_put(skb, ETH_ALEN), ha->addr, ETH_ALEN); memcpy(__skb_put(skb, ETH_ALEN), ha->addr, ETH_ALEN); + + i++; } r->len = htons(skb->len - len); } -- cgit v1.2.3 From 9f888160bdcccf0565dd2774956b8d9456e610be Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 21 Jun 2010 11:00:13 +0000 Subject: ipv6: fix NULL reference in proxy neighbor discovery The addition of TLLAO option created a kernel OOPS regression for the case where neighbor advertisement is being sent via proxy path. When using proxy, ipv6_get_ifaddr() returns NULL causing the NULL dereference. Change causing the bug was: commit f7734fdf61ec6bb848e0bafc1fb8bad2c124bb50 Author: Octavian Purdila Date: Fri Oct 2 11:39:15 2009 +0000 make TLLAO option for NA packets configurable Signed-off-by: Stephen Hemminger Acked-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv6/ndisc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 0abdc242ddb7..2efef52fb461 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -586,6 +586,7 @@ static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, src_addr = solicited_addr; if (ifp->flags & IFA_F_OPTIMISTIC) override = 0; + inc_opt |= ifp->idev->cnf.force_tllao; in6_ifa_put(ifp); } else { if (ipv6_dev_get_saddr(dev_net(dev), dev, daddr, @@ -599,7 +600,6 @@ static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, icmp6h.icmp6_solicited = solicited; icmp6h.icmp6_override = override; - inc_opt |= ifp->idev->cnf.force_tllao; __ndisc_send(dev, neigh, daddr, src_addr, &icmp6h, solicited_addr, inc_opt ? ND_OPT_TARGET_LL_ADDR : 0); -- cgit v1.2.3