From efcdbf24fd5daa88060869e51ed49f68b7ac8708 Mon Sep 17 00:00:00 2001
From: Arun Sharma <asharma@fb.com>
Date: Mon, 30 Jan 2012 14:16:06 -0800
Subject: net: Disambiguate kernel message

Some of our machines were reporting:

TCP: too many of orphaned sockets

even when the number of orphaned sockets was well below the
limit.

We print a different message depending on whether we're out
of TCP memory or there are too many orphaned sockets.

Also move the check out of line and cleanup the messages
that were printed.

Signed-off-by: Arun Sharma <asharma@fb.com>
Suggested-by: Mohan Srinivasan <mohan@fb.com>
Cc: netdev@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: David Miller <davem@davemloft.net>
Cc: Glauber Costa <glommer@parallels.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c       | 19 +++++++++++++++----
 net/ipv4/tcp_timer.c |  5 +----
 2 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 06373b4a449a..a34f5cfdd44c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1876,6 +1876,20 @@ void tcp_shutdown(struct sock *sk, int how)
 }
 EXPORT_SYMBOL(tcp_shutdown);
 
+bool tcp_check_oom(struct sock *sk, int shift)
+{
+	bool too_many_orphans, out_of_socket_memory;
+
+	too_many_orphans = tcp_too_many_orphans(sk, shift);
+	out_of_socket_memory = tcp_out_of_memory(sk);
+
+	if (too_many_orphans && net_ratelimit())
+		pr_info("TCP: too many orphaned sockets\n");
+	if (out_of_socket_memory && net_ratelimit())
+		pr_info("TCP: out of memory -- consider tuning tcp_mem\n");
+	return too_many_orphans || out_of_socket_memory;
+}
+
 void tcp_close(struct sock *sk, long timeout)
 {
 	struct sk_buff *skb;
@@ -2015,10 +2029,7 @@ adjudge_to_death:
 	}
 	if (sk->sk_state != TCP_CLOSE) {
 		sk_mem_reclaim(sk);
-		if (tcp_too_many_orphans(sk, 0)) {
-			if (net_ratelimit())
-				printk(KERN_INFO "TCP: too many of orphaned "
-				       "sockets\n");
+		if (tcp_check_oom(sk, 0)) {
 			tcp_set_state(sk, TCP_CLOSE);
 			tcp_send_active_reset(sk, GFP_ATOMIC);
 			NET_INC_STATS_BH(sock_net(sk),
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index a516d1e399df..cd2e0723266d 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -77,10 +77,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset)
 	if (sk->sk_err_soft)
 		shift++;
 
-	if (tcp_too_many_orphans(sk, shift)) {
-		if (net_ratelimit())
-			printk(KERN_INFO "Out of socket memory\n");
-
+	if (tcp_check_oom(sk, shift)) {
 		/* Catch exceptional cases, when connection requires reset.
 		 *      1. Last segment was sent recently. */
 		if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
-- 
cgit v1.2.3


From 786f528119722f564a22ad953411374e06116333 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Wed, 1 Feb 2012 09:32:25 +0000
Subject: ethtool: Null-terminate filename passed to ethtool_ops::flash_device

The parameters for ETHTOOL_FLASHDEV include a filename, which ought to
be null-terminated.  Currently the only driver that implements
ethtool_ops::flash_device attempts to add a null terminator if
necessary, but does it wrongly.  Do it in the ethtool core instead.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/emulex/benet/be_ethtool.c | 6 +-----
 net/core/ethtool.c                             | 2 ++
 2 files changed, 3 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c b/drivers/net/ethernet/emulex/benet/be_ethtool.c
index 6db6b6ae5e9b..802e5ddef8a8 100644
--- a/drivers/net/ethernet/emulex/benet/be_ethtool.c
+++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c
@@ -716,12 +716,8 @@ static int
 be_do_flash(struct net_device *netdev, struct ethtool_flash *efl)
 {
 	struct be_adapter *adapter = netdev_priv(netdev);
-	char file_name[ETHTOOL_FLASH_MAX_FILENAME];
 
-	file_name[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0;
-	strcpy(file_name, efl->data);
-
-	return be_load_fw(adapter, file_name);
+	return be_load_fw(adapter, efl->data);
 }
 
 static int
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 369b41894527..3f79db1b612a 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1190,6 +1190,8 @@ static noinline_for_stack int ethtool_flash_device(struct net_device *dev,
 	if (!dev->ethtool_ops->flash_device)
 		return -EOPNOTSUPP;
 
+	efl.data[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0;
+
 	return dev->ethtool_ops->flash_device(dev, &efl);
 }
 
-- 
cgit v1.2.3


From 07ae2dfcf4f7143ce191c6436da1c33f179af0d6 Mon Sep 17 00:00:00 2001
From: Eliad Peller <eliad@wizery.com>
Date: Wed, 1 Feb 2012 18:48:09 +0200
Subject: mac80211: timeout a single frame in the rx reorder buffer

The current code checks for stored_mpdu_num > 1, causing
the reorder_timer to be triggered indefinitely, but the
frame is never timed-out (until the next packet is received)

Signed-off-by: Eliad Peller <eliad@wizery.com>
Cc: <stable@vger.kernel.org>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/rx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 751409120769..5a5e504a8ffb 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -611,7 +611,7 @@ static void ieee80211_sta_reorder_release(struct ieee80211_hw *hw,
 	index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) %
 						tid_agg_rx->buf_size;
 	if (!tid_agg_rx->reorder_buf[index] &&
-	    tid_agg_rx->stored_mpdu_num > 1) {
+	    tid_agg_rx->stored_mpdu_num) {
 		/*
 		 * No buffers ready to be released, but check whether any
 		 * frames in the reorder buffer have timed out.
-- 
cgit v1.2.3


From c43b874d5d714f271b80d4c3f49e05d0cbf51ed2 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Thu, 2 Feb 2012 00:07:00 +0000
Subject: tcp: properly initialize tcp memory limits

Commit 4acb4190 tries to fix the using uninitialized value
introduced by commit 3dc43e3,  but it would make the
per-socket memory limits too small.

This patch fixes this and also remove the redundant codes
introduced in 4acb4190.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: Glauber Costa <glommer@parallels.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/sysctl_net_ipv4.c | 6 ------
 net/ipv4/tcp.c             | 4 ++--
 2 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 4cb9cd2f2c39..7a7724da9bff 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -778,7 +778,6 @@ EXPORT_SYMBOL_GPL(net_ipv4_ctl_path);
 static __net_init int ipv4_sysctl_init_net(struct net *net)
 {
 	struct ctl_table *table;
-	unsigned long limit;
 
 	table = ipv4_net_table;
 	if (!net_eq(net, &init_net)) {
@@ -815,11 +814,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
 	net->ipv4.sysctl_rt_cache_rebuild_count = 4;
 
 	tcp_init_mem(net);
-	limit = nr_free_buffer_pages() / 8;
-	limit = max(limit, 128UL);
-	net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
-	net->ipv4.sysctl_tcp_mem[1] = limit;
-	net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
 
 	net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
 			net_ipv4_ctl_path, table);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index a34f5cfdd44c..37755ccc0e96 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3229,7 +3229,6 @@ __setup("thash_entries=", set_thash_entries);
 
 void tcp_init_mem(struct net *net)
 {
-	/* Set per-socket limits to no more than 1/128 the pressure threshold */
 	unsigned long limit = nr_free_buffer_pages() / 8;
 	limit = max(limit, 128UL);
 	net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
@@ -3298,7 +3297,8 @@ void __init tcp_init(void)
 	sysctl_max_syn_backlog = max(128, cnt / 256);
 
 	tcp_init_mem(&init_net);
-	limit = nr_free_buffer_pages() / 8;
+	/* Set per-socket limits to no more than 1/128 the pressure threshold */
+	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 10);
 	limit = max(limit, 128UL);
 	max_share = min(4UL*1024*1024, limit);
 
-- 
cgit v1.2.3


From b01377a4200d0dfc7b04a8daabb4739727353703 Mon Sep 17 00:00:00 2001
From: "sjur.brandeland@stericsson.com" <sjur.brandeland@stericsson.com>
Date: Thu, 2 Feb 2012 01:21:02 +0000
Subject: caif: Bugfix list_del_rcu race in cfmuxl_ctrlcmd.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Always use cfmuxl_remove_uplayer when removing a up-layer.
cfmuxl_ctrlcmd() can be called independently and in parallel with
cfmuxl_remove_uplayer(). The race between them could cause list_del_rcu
to be called on a node which has been already taken out from the list.
That lead to a (rare) crash on accessing poisoned node->prev inside
list_del_rcu.

This fix ensures that deletion are done holding the same lock.

Reported-by: Dmitry Tarnyagin <dmitry.tarnyagin@stericsson.com>
Signed-off-by: Sjur Brændeland <sjur.brandeland@stericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/caif/cfmuxl.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/caif/cfmuxl.c b/net/caif/cfmuxl.c
index b36f24a4c8e7..94b08612a4d8 100644
--- a/net/caif/cfmuxl.c
+++ b/net/caif/cfmuxl.c
@@ -248,7 +248,6 @@ static void cfmuxl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
 {
 	struct cfmuxl *muxl = container_obj(layr);
 	struct cflayer *layer;
-	int idx;
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(layer, &muxl->srvl_list, node) {
@@ -257,14 +256,9 @@ static void cfmuxl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
 
 			if ((ctrl == _CAIF_CTRLCMD_PHYIF_DOWN_IND ||
 				ctrl == CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND) &&
-					layer->id != 0) {
-
-				idx = layer->id % UP_CACHE_SIZE;
-				spin_lock_bh(&muxl->receive_lock);
-				RCU_INIT_POINTER(muxl->up_cache[idx], NULL);
-				list_del_rcu(&layer->node);
-				spin_unlock_bh(&muxl->receive_lock);
-			}
+					layer->id != 0)
+				cfmuxl_remove_uplayer(layr, layer->id);
+
 			/* NOTE: ctrlcmd is not allowed to block */
 			layer->ctrlcmd(layer, ctrl, phyid);
 		}
-- 
cgit v1.2.3


From ba7605745d5c99f0e71b3ec6c7cb5ed6afe540ad Mon Sep 17 00:00:00 2001
From: Dmitry Tarnyagin <dmitry.tarnyagin@stericsson.com>
Date: Thu, 2 Feb 2012 01:21:03 +0000
Subject: caif: Bugfix double kfree_skb upon xmit failure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SKB is freed twice upon send error. The Network stack consumes SKB even
when it returns error code.

Signed-off-by: Sjur Brændeland <sjur.brandeland@stericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/caif/caif_socket.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index a98628086452..a97d97a3a512 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -539,8 +539,10 @@ static int transmit_skb(struct sk_buff *skb, struct caifsock *cf_sk,
 	pkt = cfpkt_fromnative(CAIF_DIR_OUT, skb);
 	memset(skb->cb, 0, sizeof(struct caif_payload_info));
 
-	if (cf_sk->layer.dn == NULL)
+	if (cf_sk->layer.dn == NULL) {
+		kfree_skb(skb);
 		return -EINVAL;
+	}
 
 	return cf_sk->layer.dn->transmit(cf_sk->layer.dn, pkt);
 }
@@ -683,10 +685,10 @@ static int caif_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		}
 		err = transmit_skb(skb, cf_sk,
 				msg->msg_flags&MSG_DONTWAIT, timeo);
-		if (err < 0) {
-			kfree_skb(skb);
+		if (err < 0)
+			/* skb is already freed */
 			goto pipe_err;
-		}
+
 		sent += size;
 	}
 
-- 
cgit v1.2.3


From 5962b35c1de3254a2f03b95efd3b7854b874d7b7 Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Fri, 3 Feb 2012 05:18:43 +0000
Subject: netprio_cgroup: Fix obo in get_prioidx

It was recently pointed out to me that the get_prioidx function sets a bit in
the prioidx map prior to checking to see if the index being set is out of
bounds.  This patch corrects that, avoiding the possiblity of us writing beyond
the end of the array

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Reported-by: Stanislaw Gruszka <sgruszka@redhat.com>
CC: Stanislaw Gruszka <sgruszka@redhat.com>
CC: "David S. Miller" <davem@davemloft.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/netprio_cgroup.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 3a9fd4826b75..9ae183a9a381 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -58,11 +58,12 @@ static int get_prioidx(u32 *prio)
 
 	spin_lock_irqsave(&prioidx_map_lock, flags);
 	prioidx = find_first_zero_bit(prioidx_map, sizeof(unsigned long) * PRIOIDX_SZ);
+	if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ) {
+		spin_unlock_irqrestore(&prioidx_map_lock, flags);
+		return -ENOSPC;
+	}
 	set_bit(prioidx, prioidx_map);
 	spin_unlock_irqrestore(&prioidx_map_lock, flags);
-	if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ)
-		return -ENOSPC;
-
 	atomic_set(&max_prioidx, prioidx);
 	*prio = prioidx;
 	return 0;
-- 
cgit v1.2.3


From e2446eaab5585555a38ea0df4e01ff313dbb4ac9 Mon Sep 17 00:00:00 2001
From: Shawn Lu <shawn.lu@ericsson.com>
Date: Sat, 4 Feb 2012 12:38:09 +0000
Subject: tcp_v4_send_reset: binding oif to iif in no sock case

Binding RST packet outgoing interface to incoming interface
for tcp v4 when there is no socket associate with it.
when sk is not NULL, using sk->sk_bound_dev_if instead.
(suggested by Eric Dumazet).

This has few benefits:
1. tcp_v6_send_reset already did that.
2. This helps tcp connect with SO_BINDTODEVICE set. When
connection is lost, we still able to sending out RST using
same interface.
3. we are sending reply, it is most likely to be succeed
if iif is used

Signed-off-by: Shawn Lu <shawn.lu@ericsson.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 337ba4cca052..94d683a61cba 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -651,6 +651,11 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
+	/* When socket is gone, all binding information is lost.
+	 * routing might fail in this case. using iif for oif to
+	 * make sure we can deliver it
+	 */
+	arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb);
 
 	net = dev_net(skb_dst(skb)->dev);
 	arg.tos = ip_hdr(skb)->tos;
-- 
cgit v1.2.3


From 6d25886ee2fbc05a7bf4dae5f5ae345cb73df2fd Mon Sep 17 00:00:00 2001
From: Anisse Astier <anisse@astier.eu>
Date: Tue, 7 Feb 2012 07:39:11 +0000
Subject: net: Fix build regression when INET_UDP_DIAG=y and IPV6=m

Tested-by: Anisse Astier <anisse@astier.eu>

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index aa2a2c79776f..d183262943d9 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -409,7 +409,7 @@ config INET_TCP_DIAG
 
 config INET_UDP_DIAG
 	tristate "UDP: socket monitoring interface"
-	depends on INET_DIAG
+	depends on INET_DIAG && (IPV6 || IPV6=n)
 	default n
 	---help---
 	  Support for UDP socket monitoring interface used by the ss tool.
-- 
cgit v1.2.3


From 5ca3b72c5da47d95b83857b768def6172fbc080a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 8 Feb 2012 08:51:50 +0000
Subject: gro: more generic L2 header check

Shlomo Pongratz reported GRO L2 header check was suited for Ethernet
only, and failed on IB/ipoib traffic.

He provided a patch faking a zeroed header to let GRO aggregates frames.

Roland Dreier, Herbert Xu, and others suggested we change GRO L2 header
check to be more generic, ie not assuming L2 header is 14 bytes, but
taking into account hard_header_len.

__napi_gro_receive() has special handling for the common case (Ethernet)
to avoid a memcmp() call and use an inline optimized function instead.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Reported-by: Shlomo Pongratz <shlomop@mellanox.com>
Cc: Roland Dreier <roland@kernel.org>
Cc: Or Gerlitz <ogerlitz@mellanox.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Tested-by: Sean Hefty <sean.hefty@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 115dee1d985d..6ca32f6b3105 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3500,14 +3500,20 @@ static inline gro_result_t
 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
 	struct sk_buff *p;
+	unsigned int maclen = skb->dev->hard_header_len;
 
 	for (p = napi->gro_list; p; p = p->next) {
 		unsigned long diffs;
 
 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
 		diffs |= p->vlan_tci ^ skb->vlan_tci;
-		diffs |= compare_ether_header(skb_mac_header(p),
-					      skb_gro_mac_header(skb));
+		if (maclen == ETH_HLEN)
+			diffs |= compare_ether_header(skb_mac_header(p),
+						      skb_gro_mac_header(skb));
+		else if (!diffs)
+			diffs = memcmp(skb_mac_header(p),
+				       skb_gro_mac_header(skb),
+				       maclen);
 		NAPI_GRO_CB(p)->same_flow = !diffs;
 		NAPI_GRO_CB(p)->flush = 0;
 	}
-- 
cgit v1.2.3


From 16bda13d90c8d5da243e2cfa1677e62ecce26860 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 6 Feb 2012 15:14:37 -0500
Subject: net: Make qdisc_skb_cb upper size bound explicit.

Just like skb->cb[], so that qdisc_skb_cb can be encapsulated inside
of other data structures.

This is intended to be used by IPoIB so that it can remember
addressing information stored at hard_header_ops->create() time that
it can fetch when the packet gets to the transmit routine.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 9 ++++++++-
 net/sched/sch_choke.c     | 3 +--
 net/sched/sch_netem.c     | 3 +--
 net/sched/sch_sfb.c       | 3 +--
 net/sched/sch_sfq.c       | 5 ++---
 5 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index f6bb08b73ca4..55ce96b53b09 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -220,9 +220,16 @@ struct tcf_proto {
 
 struct qdisc_skb_cb {
 	unsigned int		pkt_len;
-	long			data[];
+	unsigned char		data[24];
 };
 
+static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
+{
+	struct qdisc_skb_cb *qcb;
+	BUILD_BUG_ON(sizeof(skb->cb) < sizeof(unsigned int) + sz);
+	BUILD_BUG_ON(sizeof(qcb->data) < sz);
+}
+
 static inline int qdisc_qlen(const struct Qdisc *q)
 {
 	return q->q.qlen;
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index e465064d39a3..7e267d7b9c75 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -148,8 +148,7 @@ struct choke_skb_cb {
 
 static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
 {
-	BUILD_BUG_ON(sizeof(skb->cb) <
-		sizeof(struct qdisc_skb_cb) + sizeof(struct choke_skb_cb));
+	qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
 	return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
 }
 
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 2776012132ea..e83d61ca78ca 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -130,8 +130,7 @@ struct netem_skb_cb {
 
 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
 {
-	BUILD_BUG_ON(sizeof(skb->cb) <
-		sizeof(struct qdisc_skb_cb) + sizeof(struct netem_skb_cb));
+	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
 	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
 }
 
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 96e42cae4c7a..d7eea99333e9 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -94,8 +94,7 @@ struct sfb_skb_cb {
 
 static inline struct sfb_skb_cb *sfb_skb_cb(const struct sk_buff *skb)
 {
-	BUILD_BUG_ON(sizeof(skb->cb) <
-		sizeof(struct qdisc_skb_cb) + sizeof(struct sfb_skb_cb));
+	qdisc_cb_private_validate(skb, sizeof(struct sfb_skb_cb));
 	return (struct sfb_skb_cb *)qdisc_skb_cb(skb)->data;
 }
 
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 67494aef9acf..60d47180f043 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -166,9 +166,8 @@ struct sfq_skb_cb {
 
 static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb)
 {
-       BUILD_BUG_ON(sizeof(skb->cb) <
-               sizeof(struct qdisc_skb_cb) + sizeof(struct sfq_skb_cb));
-       return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data;
+	qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb));
+	return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data;
 }
 
 static unsigned int sfq_hash(const struct sfq_sched_data *q,
-- 
cgit v1.2.3


From a87dfe14a78501c931a4d5481efff6a809aa907d Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Fri, 10 Feb 2012 05:43:36 +0000
Subject: netprio_cgroup: fix an off-by-one bug

# mount -t cgroup xxx /mnt
  # mkdir /mnt/tmp
  # cat /mnt/tmp/net_prio.ifpriomap
  lo 0
  eth0 0
  virbr0 0
  # echo 'lo 999' > /mnt/tmp/net_prio.ifpriomap
  # cat /mnt/tmp/net_prio.ifpriomap
  lo 999
  eth0 0
  virbr0 4101267344

We got weired output, because we exceeded the boundary of the array.
We may even crash the kernel..

Origionally-authored-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: "David S. Miller" <davem@davemloft.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/netprio_cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 9ae183a9a381..72c638780805 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -108,7 +108,7 @@ static void extend_netdev_table(struct net_device *dev, u32 new_len)
 static void update_netdev_tables(void)
 {
 	struct net_device *dev;
-	u32 max_len = atomic_read(&max_prioidx);
+	u32 max_len = atomic_read(&max_prioidx) + 1;
 	struct netprio_map *map;
 
 	rtnl_lock();
-- 
cgit v1.2.3


From f5c38208d32412d72b97a4f0d44af0eb39feb20b Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Fri, 10 Feb 2012 05:43:37 +0000
Subject: netprio_cgroup: don't allocate prio table when a device is registered

So we delay the allocation till the priority is set through cgroup,
and this makes skb_update_priority() faster when it's not set.

This also eliminates an off-by-one bug similar with the one fixed
in the previous patch.

Origionally-authored-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: "David S. Miller" <davem@davemloft.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/netprio_cgroup.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'net')

diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 72c638780805..4dacc44637ef 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -271,7 +271,6 @@ static int netprio_device_event(struct notifier_block *unused,
 {
 	struct net_device *dev = ptr;
 	struct netprio_map *old;
-	u32 max_len = atomic_read(&max_prioidx);
 
 	/*
 	 * Note this is called with rtnl_lock held so we have update side
@@ -279,11 +278,6 @@ static int netprio_device_event(struct notifier_block *unused,
 	 */
 
 	switch (event) {
-
-	case NETDEV_REGISTER:
-		if (max_len)
-			extend_netdev_table(dev, max_len);
-		break;
 	case NETDEV_UNREGISTER:
 		old = rtnl_dereference(dev->priomap);
 		RCU_INIT_POINTER(dev->priomap, NULL);
-- 
cgit v1.2.3


From 2b73bc65e2771372c818db7955709c8caedbf8b9 Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Fri, 10 Feb 2012 05:43:38 +0000
Subject: netprio_cgroup: fix wrong memory access when NETPRIO_CGROUP=m

When the netprio_cgroup module is not loaded, net_prio_subsys_id
is -1, and so sock_update_prioidx() accesses cgroup_subsys array
with negative index subsys[-1].

Make the code resembles cls_cgroup code, which is bug free.

Origionally-authored-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: "David S. Miller" <davem@davemloft.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netprio_cgroup.h | 48 ++++++++++++++++++++++++++++++++++++--------
 net/core/sock.c              |  7 ++-----
 2 files changed, 42 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h
index 7b2d43139c8e..d58fdec47597 100644
--- a/include/net/netprio_cgroup.h
+++ b/include/net/netprio_cgroup.h
@@ -37,19 +37,51 @@ extern int net_prio_subsys_id;
 
 extern void sock_update_netprioidx(struct sock *sk);
 
-static inline struct cgroup_netprio_state
-		*task_netprio_state(struct task_struct *p)
+#if IS_BUILTIN(CONFIG_NETPRIO_CGROUP)
+
+static inline u32 task_netprioidx(struct task_struct *p)
 {
-#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
-	return container_of(task_subsys_state(p, net_prio_subsys_id),
-			    struct cgroup_netprio_state, css);
-#else
-	return NULL;
-#endif
+	struct cgroup_netprio_state *state;
+	u32 idx;
+
+	rcu_read_lock();
+	state = container_of(task_subsys_state(p, net_prio_subsys_id),
+			     struct cgroup_netprio_state, css);
+	idx = state->prioidx;
+	rcu_read_unlock();
+	return idx;
+}
+
+#elif IS_MODULE(CONFIG_NETPRIO_CGROUP)
+
+static inline u32 task_netprioidx(struct task_struct *p)
+{
+	struct cgroup_netprio_state *state;
+	int subsys_id;
+	u32 idx = 0;
+
+	rcu_read_lock();
+	subsys_id = rcu_dereference_index_check(net_prio_subsys_id,
+						rcu_read_lock_held());
+	if (subsys_id >= 0) {
+		state = container_of(task_subsys_state(p, subsys_id),
+				     struct cgroup_netprio_state, css);
+		idx = state->prioidx;
+	}
+	rcu_read_unlock();
+	return idx;
 }
 
 #else
 
+static inline u32 task_netprioidx(struct task_struct *p)
+{
+	return 0;
+}
+
+#endif /* CONFIG_NETPRIO_CGROUP */
+
+#else
 #define sock_update_netprioidx(sk)
 #endif
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 3e81fd2e3c75..02f8dfe320b7 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1171,13 +1171,10 @@ EXPORT_SYMBOL(sock_update_classid);
 
 void sock_update_netprioidx(struct sock *sk)
 {
-	struct cgroup_netprio_state *state;
 	if (in_interrupt())
 		return;
-	rcu_read_lock();
-	state = task_netprio_state(current);
-	sk->sk_cgrp_prioidx = state ? state->prioidx : 0;
-	rcu_read_unlock();
+
+	sk->sk_cgrp_prioidx = task_netprioidx(current);
 }
 EXPORT_SYMBOL_GPL(sock_update_netprioidx);
 #endif
-- 
cgit v1.2.3


From 5dc7883f2a7c25f8df40d7479687153558cd531b Mon Sep 17 00:00:00 2001
From: Li Wei <lw@cn.fujitsu.com>
Date: Thu, 9 Feb 2012 21:15:25 +0000
Subject: ipv4: Fix wrong order of ip_rt_get_source() and update iph->daddr.

This patch fix a bug which introduced by commit ac8a4810 (ipv4: Save
nexthop address of LSRR/SSRR option to IPCB.).In that patch, we saved
the nexthop of SRR in ip_option->nexthop and update iph->daddr until
we get to ip_forward_options(), but we need to update it before
ip_rt_get_source(), otherwise we may get a wrong src.

Signed-off-by: Li Wei <lw@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_options.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 1e60f7679075..42dd1a90edea 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -573,8 +573,8 @@ void ip_forward_options(struct sk_buff *skb)
 		}
 		if (srrptr + 3 <= srrspace) {
 			opt->is_changed = 1;
-			ip_rt_get_source(&optptr[srrptr-1], skb, rt);
 			ip_hdr(skb)->daddr = opt->nexthop;
+			ip_rt_get_source(&optptr[srrptr-1], skb, rt);
 			optptr[2] = srrptr+4;
 		} else if (net_ratelimit())
 			printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n");
-- 
cgit v1.2.3


From 70620c46ac2b45c24b0f22002fdf5ddd1f7daf81 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Fri, 10 Feb 2012 04:07:11 +0000
Subject: net: Don't proxy arp respond if iif == rt->dst.dev if private VLAN is
 disabled

Commit 653241 (net: RFC3069, private VLAN proxy arp support) changed
the behavior of arp proxy to send arp replies back out on the interface
the request came in even if the private VLAN feature is disabled.

Previously we checked rt->dst.dev != skb->dev for in scenarios, when
proxy arp is enabled on for the netdevice and also when individual proxy
neighbour entries have been added.

This patch adds the check back for the pneigh_lookup() scenario.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Jesper Dangaard Brouer <hawk@comx.dk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/arp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 59402be133f0..63e49890ad31 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -863,7 +863,8 @@ static int arp_process(struct sk_buff *skb)
 			if (addr_type == RTN_UNICAST  &&
 			    (arp_fwd_proxy(in_dev, dev, rt) ||
 			     arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
-			     pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) {
+			     (rt->dst.dev != dev &&
+			      pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) {
 				n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
 				if (n)
 					neigh_release(n);
-- 
cgit v1.2.3