From 7fdcf13b292e8b2e38e42de24be2503e37b2cf97 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 1 Dec 2011 14:00:15 -0500
Subject: SUNRPC: Fix the execution time statistics in the face of RPC restarts

If the rpc_task gets restarted, then we want to ensure that we don't
double-count the execution time statistics, timeout data, etc.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/sched.c | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index d12ffa545811..00a1a2acd587 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -590,6 +590,27 @@ void rpc_prepare_task(struct rpc_task *task)
 	task->tk_ops->rpc_call_prepare(task, task->tk_calldata);
 }
 
+static void
+rpc_init_task_statistics(struct rpc_task *task)
+{
+	/* Initialize retry counters */
+	task->tk_garb_retry = 2;
+	task->tk_cred_retry = 2;
+	task->tk_rebind_retry = 2;
+
+	/* starting timestamp */
+	task->tk_start = ktime_get();
+}
+
+static void
+rpc_reset_task_statistics(struct rpc_task *task)
+{
+	task->tk_timeouts = 0;
+	task->tk_flags &= ~(RPC_CALL_MAJORSEEN|RPC_TASK_KILLED|RPC_TASK_SENT);
+
+	rpc_init_task_statistics(task);
+}
+
 /*
  * Helper that calls task->tk_ops->rpc_call_done if it exists
  */
@@ -602,6 +623,7 @@ void rpc_exit_task(struct rpc_task *task)
 			WARN_ON(RPC_ASSASSINATED(task));
 			/* Always release the RPC slot and buffer memory */
 			xprt_release(task);
+			rpc_reset_task_statistics(task);
 		}
 	}
 }
@@ -804,11 +826,6 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta
 	task->tk_calldata = task_setup_data->callback_data;
 	INIT_LIST_HEAD(&task->tk_task);
 
-	/* Initialize retry counters */
-	task->tk_garb_retry = 2;
-	task->tk_cred_retry = 2;
-	task->tk_rebind_retry = 2;
-
 	task->tk_priority = task_setup_data->priority - RPC_PRIORITY_LOW;
 	task->tk_owner = current->tgid;
 
@@ -818,8 +835,7 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta
 	if (task->tk_ops->rpc_call_prepare != NULL)
 		task->tk_action = rpc_prepare_task;
 
-	/* starting timestamp */
-	task->tk_start = ktime_get();
+	rpc_init_task_statistics(task);
 
 	dprintk("RPC:       new task initialized, procpid %u\n",
 				task_pid_nr(current));
-- 
cgit v1.2.3


From c25573b5134294c0be82bfaecc6d08136835b271 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 1 Dec 2011 14:16:17 -0500
Subject: SUNRPC: Ensure we always bump the backlog queue in xprt_free_slot

Whenever we free a slot, we know that the resulting xprt->num_reqs will
be less than xprt->max_reqs, so we know that we can release at least one
backlogged rpc_task.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@vger.kernel.org [>=3.1]
---
 net/sunrpc/xprt.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index f4385e45a5fc..c64c0ef519b5 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -995,13 +995,11 @@ out_init_req:
 
 static void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req)
 {
-	if (xprt_dynamic_free_slot(xprt, req))
-		return;
-
-	memset(req, 0, sizeof(*req));	/* mark unused */
-
 	spin_lock(&xprt->reserve_lock);
-	list_add(&req->rq_list, &xprt->free);
+	if (!xprt_dynamic_free_slot(xprt, req)) {
+		memset(req, 0, sizeof(*req));	/* mark unused */
+		list_add(&req->rq_list, &xprt->free);
+	}
 	rpc_wake_up_next(&xprt->backlog);
 	spin_unlock(&xprt->reserve_lock);
 }
-- 
cgit v1.2.3


From f8c141c3e915e3a040d4c1badde28e23f8cbe255 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Fri, 9 Dec 2011 09:35:39 +0300
Subject: nfc: signedness bug in __nci_request()

wait_for_completion_interruptible_timeout() returns -ERESTARTSYS if
interrupted so completion_rc needs to be signed.  The current code
probably returns -ETIMEDOUT if we hit this situation, but after this
patch is applied it will return -ERESTARTSYS.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/nfc/nci/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c
index 4047e29acb3b..25dae3f8f5c2 100644
--- a/net/nfc/nci/core.c
+++ b/net/nfc/nci/core.c
@@ -68,7 +68,7 @@ static int __nci_request(struct nci_dev *ndev,
 	__u32 timeout)
 {
 	int rc = 0;
-	unsigned long completion_rc;
+	long completion_rc;
 
 	ndev->req_status = NCI_REQ_PEND;
 
-- 
cgit v1.2.3


From 36e999a83a4a4badd389901eb6d23a30e199b8db Mon Sep 17 00:00:00 2001
From: Mat Martineau <mathewm@codeaurora.org>
Date: Thu, 8 Dec 2011 17:23:21 -0800
Subject: Bluetooth: Prevent uninitialized data access in L2CAP configuration

When configuring an ERTM or streaming mode connection, remote devices
are expected to send an RFC option in a successful config response.  A
misbehaving remote device might not send an RFC option, and the L2CAP
code should not access uninitialized data in this case.

Signed-off-by: Mat Martineau <mathewm@codeaurora.org>
Acked-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Gustavo F. Padovan <padovan@profusion.mobi>
---
 net/bluetooth/l2cap_core.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 5ea94a1eecf2..17b5b1cd9657 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -2152,7 +2152,7 @@ static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len, voi
 	void *ptr = req->data;
 	int type, olen;
 	unsigned long val;
-	struct l2cap_conf_rfc rfc;
+	struct l2cap_conf_rfc rfc = { .mode = L2CAP_MODE_BASIC };
 
 	BT_DBG("chan %p, rsp %p, len %d, req %p", chan, rsp, len, data);
 
@@ -2271,6 +2271,16 @@ static void l2cap_conf_rfc_get(struct l2cap_chan *chan, void *rsp, int len)
 		}
 	}
 
+	/* Use sane default values in case a misbehaving remote device
+	 * did not send an RFC option.
+	 */
+	rfc.mode = chan->mode;
+	rfc.retrans_timeout = cpu_to_le16(L2CAP_DEFAULT_RETRANS_TO);
+	rfc.monitor_timeout = cpu_to_le16(L2CAP_DEFAULT_MONITOR_TO);
+	rfc.max_pdu_size = cpu_to_le16(chan->imtu);
+
+	BT_ERR("Expected RFC option was not found, using defaults");
+
 done:
 	switch (rfc.mode) {
 	case L2CAP_MODE_ERTM:
-- 
cgit v1.2.3


From 79e654787c67f6b05f73366ff8ccac72ba7249e6 Mon Sep 17 00:00:00 2001
From: Mat Martineau <mathewm@codeaurora.org>
Date: Tue, 6 Dec 2011 16:23:26 -0800
Subject: Bluetooth: Clear RFCOMM session timer when disconnecting last channel

When the last RFCOMM data channel is closed, a timer is normally set
up to disconnect the control channel at a later time.  If the control
channel disconnect command is sent with the timer pending, the timer
needs to be cancelled.

If the timer is not cancelled in this situation, the reference
counting logic for the RFCOMM session does not work correctly when the
remote device closes the L2CAP connection.  The session is freed at
the wrong time, leading to a kernel panic.

Signed-off-by: Mat Martineau <mathewm@codeaurora.org>
Acked-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Gustavo F. Padovan <padovan@profusion.mobi>
---
 net/bluetooth/rfcomm/core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index 4e32e18211f9..2d28dfe98389 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -1146,6 +1146,7 @@ static int rfcomm_recv_ua(struct rfcomm_session *s, u8 dlci)
 			if (list_empty(&s->dlcs)) {
 				s->state = BT_DISCONN;
 				rfcomm_send_disc(s, 0);
+				rfcomm_session_clear_timer(s);
 			}
 
 			break;
-- 
cgit v1.2.3


From d7660918fce210f421cc58c060ca3de71e4ffd37 Mon Sep 17 00:00:00 2001
From: "Gustavo F. Padovan" <padovan@profusion.mobi>
Date: Sun, 18 Dec 2011 22:33:30 -0200
Subject: Revert "Bluetooth: Revert: Fix L2CAP connection establishment"

This reverts commit 4dff523a913197e3314c7b0d08734ab037709093.

It was reported that this patch causes issues when trying to connect to
legacy devices, so revert it.

Reported-by: David Fries <david@fries.net>
Signed-off-by: Gustavo F. Padovan <padovan@profusion.mobi>
---
 net/bluetooth/hci_conn.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index e0af7237cd92..c1c597e3e198 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -673,7 +673,7 @@ int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type)
 		goto encrypt;
 
 auth:
-	if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend))
+	if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend))
 		return 0;
 
 	if (!hci_conn_auth(conn, sec_level, auth_type))
-- 
cgit v1.2.3


From 9cef310fcdee12b49b8b4c96fd8f611c8873d284 Mon Sep 17 00:00:00 2001
From: Alex Juncu <ajuncu@ixiacom.com>
Date: Thu, 15 Dec 2011 23:01:25 +0000
Subject: llc: llc_cmsg_rcv was getting called after sk_eat_skb.

For non-stream protocols, llc_ui_recvmsg() was calling llc_cmsg_rcv() on
an skb that had already been released by sk_eat_skb(). This caused received STP
packets to generate kernel panics.

Signed-off-by: Alexandru Juncu <ajuncu@ixiacom.com>
Signed-off-by: Kunjan Naik <knaik@ixiacom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/llc/af_llc.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index dfd3a648a551..a18e6c3d36e3 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -833,15 +833,15 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock,
 		copied += used;
 		len -= used;
 
+		/* For non stream protcols we get one packet per recvmsg call */
+		if (sk->sk_type != SOCK_STREAM)
+			goto copy_uaddr;
+
 		if (!(flags & MSG_PEEK)) {
 			sk_eat_skb(sk, skb, 0);
 			*seq = 0;
 		}
 
-		/* For non stream protcols we get one packet per recvmsg call */
-		if (sk->sk_type != SOCK_STREAM)
-			goto copy_uaddr;
-
 		/* Partial read */
 		if (used + offset < skb->len)
 			continue;
@@ -857,6 +857,12 @@ copy_uaddr:
 	}
 	if (llc_sk(sk)->cmsg_flags)
 		llc_cmsg_rcv(msg, skb);
+
+	if (!(flags & MSG_PEEK)) {
+			sk_eat_skb(sk, skb, 0);
+			*seq = 0;
+	}
+
 	goto out;
 }
 
-- 
cgit v1.2.3


From 2692ba61a82203404abd7dd2a027bda962861f74 Mon Sep 17 00:00:00 2001
From: Xi Wang <xi.wang@gmail.com>
Date: Fri, 16 Dec 2011 12:44:15 +0000
Subject: sctp: fix incorrect overflow check on autoclose

Commit 8ffd3208 voids the previous patches f6778aab and 810c0719 for
limiting the autoclose value.  If userspace passes in -1 on 32-bit
platform, the overflow check didn't work and autoclose would be set
to 0xffffffff.

This patch defines a max_autoclose (in seconds) for limiting the value
and exposes it through sysctl, with the following intentions.

1) Avoid overflowing autoclose * HZ.

2) Keep the default autoclose bound consistent across 32- and 64-bit
   platforms (INT_MAX / HZ in this patch).

3) Keep the autoclose value consistent between setsockopt() and
   getsockopt() calls.

Suggested-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: Xi Wang <xi.wang@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h |  4 ++++
 net/sctp/associola.c       |  2 +-
 net/sctp/protocol.c        |  3 +++
 net/sctp/socket.c          |  2 --
 net/sctp/sysctl.c          | 13 +++++++++++++
 5 files changed, 21 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index e90e7a9935dd..a15432da27c3 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -241,6 +241,9 @@ extern struct sctp_globals {
 	 * bits is an indicator of when to send and window update SACK.
 	 */
 	int rwnd_update_shift;
+
+	/* Threshold for autoclose timeout, in seconds. */
+	unsigned long max_autoclose;
 } sctp_globals;
 
 #define sctp_rto_initial		(sctp_globals.rto_initial)
@@ -281,6 +284,7 @@ extern struct sctp_globals {
 #define sctp_auth_enable		(sctp_globals.auth_enable)
 #define sctp_checksum_disable		(sctp_globals.checksum_disable)
 #define sctp_rwnd_upd_shift		(sctp_globals.rwnd_update_shift)
+#define sctp_max_autoclose		(sctp_globals.max_autoclose)
 
 /* SCTP Socket type: UDP or TCP style. */
 typedef enum {
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 152b5b3c3fff..acd2edbc073e 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -173,7 +173,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 	asoc->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0;
 	asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay;
 	asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] =
-		(unsigned long)sp->autoclose * HZ;
+		min_t(unsigned long, sp->autoclose, sctp_max_autoclose) * HZ;
 
 	/* Initializes the timers */
 	for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i)
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 61b9fca5a173..6f6ad8686833 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1285,6 +1285,9 @@ SCTP_STATIC __init int sctp_init(void)
 	sctp_max_instreams    		= SCTP_DEFAULT_INSTREAMS;
 	sctp_max_outstreams   		= SCTP_DEFAULT_OUTSTREAMS;
 
+	/* Initialize maximum autoclose timeout. */
+	sctp_max_autoclose		= INT_MAX / HZ;
+
 	/* Initialize handle used for association ids. */
 	idr_init(&sctp_assocs_id);
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 13bf5fcdbff1..54a7cd2fdd7a 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2200,8 +2200,6 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval,
 		return -EINVAL;
 	if (copy_from_user(&sp->autoclose, optval, optlen))
 		return -EFAULT;
-	/* make sure it won't exceed MAX_SCHEDULE_TIMEOUT */
-	sp->autoclose = min_t(long, sp->autoclose, MAX_SCHEDULE_TIMEOUT / HZ);
 
 	return 0;
 }
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 6b3952961b85..60ffbd067ff7 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -53,6 +53,10 @@ static int sack_timer_min = 1;
 static int sack_timer_max = 500;
 static int addr_scope_max = 3; /* check sctp_scope_policy_t in include/net/sctp/constants.h for max entries */
 static int rwnd_scale_max = 16;
+static unsigned long max_autoclose_min = 0;
+static unsigned long max_autoclose_max =
+	(MAX_SCHEDULE_TIMEOUT / HZ > UINT_MAX)
+	? UINT_MAX : MAX_SCHEDULE_TIMEOUT / HZ;
 
 extern long sysctl_sctp_mem[3];
 extern int sysctl_sctp_rmem[3];
@@ -258,6 +262,15 @@ static ctl_table sctp_table[] = {
 		.extra1		= &one,
 		.extra2		= &rwnd_scale_max,
 	},
+	{
+		.procname	= "max_autoclose",
+		.data		= &sctp_max_autoclose,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= &proc_doulongvec_minmax,
+		.extra1		= &max_autoclose_min,
+		.extra2		= &max_autoclose_max,
+	},
 
 	{ /* sentinel */ }
 };
-- 
cgit v1.2.3


From a76c0adf60f6ca5ff3481992e4ea0383776b24d2 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@redhat.com>
Date: Mon, 19 Dec 2011 04:11:40 +0000
Subject: sctp: Do not account for sizeof(struct sk_buff) in estimated rwnd

When checking whether a DATA chunk fits into the estimated rwnd a
full sizeof(struct sk_buff) is added to the needed chunk size. This
quickly exhausts the available rwnd space and leads to packets being
sent which are much below the PMTU limit. This can lead to much worse
performance.

The reason for this behaviour was to avoid putting too much memory
pressure on the receiver. The concept is not completely irrational
because a Linux receiver does in fact clone an skb for each DATA chunk
delivered. However, Linux also reserves half the available socket
buffer space for data structures therefore usage of it is already
accounted for.

When proposing to change this the last time it was noted that this
behaviour was introduced to solve a performance issue caused by rwnd
overusage in combination with small DATA chunks.

Trying to reproduce this I found that with the sk_buff overhead removed,
the performance would improve significantly unless socket buffer limits
are increased.

The following numbers have been gathered using a patched iperf
supporting SCTP over a live 1 Gbit ethernet network. The -l option
was used to limit DATA chunk sizes. The numbers listed are based on
the average of 3 test runs each. Default values have been used for
sk_(r|w)mem.

Chunk
Size    Unpatched     No Overhead
-------------------------------------
   4    15.2 Kbit [!]   12.2 Mbit [!]
   8    35.8 Kbit [!]   26.0 Mbit [!]
  16    95.5 Kbit [!]   54.4 Mbit [!]
  32   106.7 Mbit      102.3 Mbit
  64   189.2 Mbit      188.3 Mbit
 128   331.2 Mbit      334.8 Mbit
 256   537.7 Mbit      536.0 Mbit
 512   766.9 Mbit      766.6 Mbit
1024   810.1 Mbit      808.6 Mbit

Signed-off-by: Thomas Graf <tgraf@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/output.c   | 8 +-------
 net/sctp/outqueue.c | 6 ++----
 2 files changed, 3 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/sctp/output.c b/net/sctp/output.c
index 08b3cead6503..817174eb5f41 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -697,13 +697,7 @@ static void sctp_packet_append_data(struct sctp_packet *packet,
 	/* Keep track of how many bytes are in flight to the receiver. */
 	asoc->outqueue.outstanding_bytes += datasize;
 
-	/* Update our view of the receiver's rwnd. Include sk_buff overhead
-	 * while updating peer.rwnd so that it reduces the chances of a
-	 * receiver running out of receive buffer space even when receive
-	 * window is still open. This can happen when a sender is sending
-	 * sending small messages.
-	 */
-	datasize += sizeof(struct sk_buff);
+	/* Update our view of the receiver's rwnd. */
 	if (datasize < rwnd)
 		rwnd -= datasize;
 	else
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 14c2b06028ff..cfeb1d4a1ee6 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -411,8 +411,7 @@ void sctp_retransmit_mark(struct sctp_outq *q,
 					chunk->transport->flight_size -=
 							sctp_data_size(chunk);
 				q->outstanding_bytes -= sctp_data_size(chunk);
-				q->asoc->peer.rwnd += (sctp_data_size(chunk) +
-							sizeof(struct sk_buff));
+				q->asoc->peer.rwnd += sctp_data_size(chunk);
 			}
 			continue;
 		}
@@ -432,8 +431,7 @@ void sctp_retransmit_mark(struct sctp_outq *q,
 			 * (Section 7.2.4)), add the data size of those
 			 * chunks to the rwnd.
 			 */
-			q->asoc->peer.rwnd += (sctp_data_size(chunk) +
-						sizeof(struct sk_buff));
+			q->asoc->peer.rwnd += sctp_data_size(chunk);
 			q->outstanding_bytes -= sctp_data_size(chunk);
 			if (chunk->transport)
 				transport->flight_size -= sctp_data_size(chunk);
-- 
cgit v1.2.3


From cd7816d14953c8af910af5bb92f488b0b277e29d Mon Sep 17 00:00:00 2001
From: Gerlando Falauto <gerlando.falauto@keymile.com>
Date: Mon, 19 Dec 2011 22:58:04 +0000
Subject: net: have ipconfig not wait if no dev is available

Previous commit 3fb72f1e6e6165c5f495e8dc11c5bbd14c73385c
makes IP-Config wait for carrier on at least one network device.

Before waiting (predefined value 120s), check that at least one device
was successfully brought up. Otherwise (e.g. buggy bootloader
which does not set the MAC address) there is no point in waiting
for carrier.

Cc: Micha Nelissen <micha@neli.hopto.org>
Cc: Holger Brunck <holger.brunck@keymile.com>
Signed-off-by: Gerlando Falauto <gerlando.falauto@keymile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipconfig.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 0da2afc97f32..99ec116bef14 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -253,6 +253,10 @@ static int __init ic_open_devs(void)
 		}
 	}
 
+	/* no point in waiting if we could not bring up at least one device */
+	if (!ic_first_dev)
+		goto have_carrier;
+
 	/* wait for a carrier on at least one device */
 	start = jiffies;
 	while (jiffies - start < msecs_to_jiffies(CONF_CARRIER_TIMEOUT)) {
-- 
cgit v1.2.3


From 9f28a2fc0bd77511f649c0a788c7bf9a5fd04edb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 21 Dec 2011 15:47:16 -0500
Subject: ipv4: reintroduce route cache garbage collector

Commit 2c8cec5c10b (ipv4: Cache learned PMTU information in inetpeer)
removed IP route cache garbage collector a bit too soon, as this gc was
responsible for expired routes cleanup, releasing their neighbour
reference.

As pointed out by Robert Gladewitz, recent kernels can fill and exhaust
their neighbour cache.

Reintroduce the garbage collection, since we'll have to wait until our
neighbour lookups become refcount-less to not depend on this stuff.

Reported-by: Robert Gladewitz <gladewitz@gmx.de>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 107 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 46af62363b8c..252c512e8a81 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -120,6 +120,7 @@
 
 static int ip_rt_max_size;
 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
+static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
 static int ip_rt_redirect_number __read_mostly	= 9;
 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
@@ -133,6 +134,9 @@ static int ip_rt_min_advmss __read_mostly	= 256;
 static int rt_chain_length_max __read_mostly	= 20;
 static int redirect_genid;
 
+static struct delayed_work expires_work;
+static unsigned long expires_ljiffies;
+
 /*
  *	Interface to generic destination cache.
  */
@@ -830,6 +834,97 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth)
 	return ONE;
 }
 
+static void rt_check_expire(void)
+{
+	static unsigned int rover;
+	unsigned int i = rover, goal;
+	struct rtable *rth;
+	struct rtable __rcu **rthp;
+	unsigned long samples = 0;
+	unsigned long sum = 0, sum2 = 0;
+	unsigned long delta;
+	u64 mult;
+
+	delta = jiffies - expires_ljiffies;
+	expires_ljiffies = jiffies;
+	mult = ((u64)delta) << rt_hash_log;
+	if (ip_rt_gc_timeout > 1)
+		do_div(mult, ip_rt_gc_timeout);
+	goal = (unsigned int)mult;
+	if (goal > rt_hash_mask)
+		goal = rt_hash_mask + 1;
+	for (; goal > 0; goal--) {
+		unsigned long tmo = ip_rt_gc_timeout;
+		unsigned long length;
+
+		i = (i + 1) & rt_hash_mask;
+		rthp = &rt_hash_table[i].chain;
+
+		if (need_resched())
+			cond_resched();
+
+		samples++;
+
+		if (rcu_dereference_raw(*rthp) == NULL)
+			continue;
+		length = 0;
+		spin_lock_bh(rt_hash_lock_addr(i));
+		while ((rth = rcu_dereference_protected(*rthp,
+					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
+			prefetch(rth->dst.rt_next);
+			if (rt_is_expired(rth)) {
+				*rthp = rth->dst.rt_next;
+				rt_free(rth);
+				continue;
+			}
+			if (rth->dst.expires) {
+				/* Entry is expired even if it is in use */
+				if (time_before_eq(jiffies, rth->dst.expires)) {
+nofree:
+					tmo >>= 1;
+					rthp = &rth->dst.rt_next;
+					/*
+					 * We only count entries on
+					 * a chain with equal hash inputs once
+					 * so that entries for different QOS
+					 * levels, and other non-hash input
+					 * attributes don't unfairly skew
+					 * the length computation
+					 */
+					length += has_noalias(rt_hash_table[i].chain, rth);
+					continue;
+				}
+			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
+				goto nofree;
+
+			/* Cleanup aged off entries. */
+			*rthp = rth->dst.rt_next;
+			rt_free(rth);
+		}
+		spin_unlock_bh(rt_hash_lock_addr(i));
+		sum += length;
+		sum2 += length*length;
+	}
+	if (samples) {
+		unsigned long avg = sum / samples;
+		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
+		rt_chain_length_max = max_t(unsigned long,
+					ip_rt_gc_elasticity,
+					(avg + 4*sd) >> FRACT_BITS);
+	}
+	rover = i;
+}
+
+/*
+ * rt_worker_func() is run in process context.
+ * we call rt_check_expire() to scan part of the hash table
+ */
+static void rt_worker_func(struct work_struct *work)
+{
+	rt_check_expire();
+	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
+}
+
 /*
  * Perturbation of rt_genid by a small quantity [1..256]
  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
@@ -3178,6 +3273,13 @@ static ctl_table ipv4_route_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 	},
+	{
+		.procname	= "gc_interval",
+		.data		= &ip_rt_gc_interval,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
 	{
 		.procname	= "redirect_load",
 		.data		= &ip_rt_redirect_load,
@@ -3388,6 +3490,11 @@ int __init ip_rt_init(void)
 	devinet_init();
 	ip_fib_init();
 
+	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
+	expires_ljiffies = jiffies;
+	schedule_delayed_work(&expires_work,
+		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
+
 	if (ip_rt_proc_init())
 		printk(KERN_ERR "Unable to create route proc files\n");
 #ifdef CONFIG_XFRM
-- 
cgit v1.2.3


From c0ed1c14a72ca9ebacd51fb94a8aca488b0d361e Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Wed, 21 Dec 2011 16:48:08 -0500
Subject: net: Add a flow_cache_flush_deferred function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

flow_cache_flush() might sleep but can be called from
atomic context via the xfrm garbage collector. So add
a flow_cache_flush_deferred() function and use this if
the xfrm garbage collector is invoked from within the
packet path.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Acked-by: Timo Teräs <timo.teras@iki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow.h     |  1 +
 net/core/flow.c        | 12 ++++++++++++
 net/xfrm/xfrm_policy.c | 18 ++++++++++++++----
 3 files changed, 27 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/include/net/flow.h b/include/net/flow.h
index a09447749e2d..57f15a7f1cdd 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -207,6 +207,7 @@ extern struct flow_cache_object *flow_cache_lookup(
 		u8 dir, flow_resolve_t resolver, void *ctx);
 
 extern void flow_cache_flush(void);
+extern void flow_cache_flush_deferred(void);
 extern atomic_t flow_cache_genid;
 
 #endif
diff --git a/net/core/flow.c b/net/core/flow.c
index 8ae42de9c79e..e318c7e98042 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -358,6 +358,18 @@ void flow_cache_flush(void)
 	put_online_cpus();
 }
 
+static void flow_cache_flush_task(struct work_struct *work)
+{
+	flow_cache_flush();
+}
+
+static DECLARE_WORK(flow_cache_flush_work, flow_cache_flush_task);
+
+void flow_cache_flush_deferred(void)
+{
+	schedule_work(&flow_cache_flush_work);
+}
+
 static int __cpuinit flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
 {
 	struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 2118d6446630..9049a5caeb25 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2276,8 +2276,6 @@ static void __xfrm_garbage_collect(struct net *net)
 {
 	struct dst_entry *head, *next;
 
-	flow_cache_flush();
-
 	spin_lock_bh(&xfrm_policy_sk_bundle_lock);
 	head = xfrm_policy_sk_bundles;
 	xfrm_policy_sk_bundles = NULL;
@@ -2290,6 +2288,18 @@ static void __xfrm_garbage_collect(struct net *net)
 	}
 }
 
+static void xfrm_garbage_collect(struct net *net)
+{
+	flow_cache_flush();
+	__xfrm_garbage_collect(net);
+}
+
+static void xfrm_garbage_collect_deferred(struct net *net)
+{
+	flow_cache_flush_deferred();
+	__xfrm_garbage_collect(net);
+}
+
 static void xfrm_init_pmtu(struct dst_entry *dst)
 {
 	do {
@@ -2422,7 +2432,7 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
 		if (likely(dst_ops->neigh_lookup == NULL))
 			dst_ops->neigh_lookup = xfrm_neigh_lookup;
 		if (likely(afinfo->garbage_collect == NULL))
-			afinfo->garbage_collect = __xfrm_garbage_collect;
+			afinfo->garbage_collect = xfrm_garbage_collect_deferred;
 		xfrm_policy_afinfo[afinfo->family] = afinfo;
 	}
 	write_unlock_bh(&xfrm_policy_afinfo_lock);
@@ -2516,7 +2526,7 @@ static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void
 
 	switch (event) {
 	case NETDEV_DOWN:
-		__xfrm_garbage_collect(dev_net(dev));
+		xfrm_garbage_collect(dev_net(dev));
 	}
 	return NOTIFY_DONE;
 }
-- 
cgit v1.2.3


From b9eda06f80b0db61a73bd87c6b0eb67d8aca55ad Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Thu, 22 Dec 2011 17:03:29 +1100
Subject: ipv4: using prefetch requires including prefetch.h

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: David Miller <davem@davemloft.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 net/ipv4/route.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 252c512e8a81..85cc053d9d6e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -91,6 +91,7 @@
 #include <linux/rcupdate.h>
 #include <linux/times.h>
 #include <linux/slab.h>
+#include <linux/prefetch.h>
 #include <net/dst.h>
 #include <net/net_namespace.h>
 #include <net/protocol.h>
-- 
cgit v1.2.3


From a13861a28b90541aa207532d237e7a940f1b1c7b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 21 Dec 2011 20:00:32 +0000
Subject: bridge: provide a mtu() method for fake_dst_ops

Commit 618f9bc74a039da76 (net: Move mtu handling down to the protocol
depended handlers) forgot the bridge netfilter case, adding a NULL
dereference in ip_fragment().

Reported-by: Chris Boot <bootc@bootc.net>
CC: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_netfilter.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index d6ec3720c77e..08757dc670a4 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -114,12 +114,18 @@ static struct neighbour *fake_neigh_lookup(const struct dst_entry *dst, const vo
 	return NULL;
 }
 
+static unsigned int fake_mtu(const struct dst_entry *dst)
+{
+	return dst->dev->mtu;
+}
+
 static struct dst_ops fake_dst_ops = {
 	.family =		AF_INET,
 	.protocol =		cpu_to_be16(ETH_P_IP),
 	.update_pmtu =		fake_update_pmtu,
 	.cow_metrics =		fake_cow_metrics,
 	.neigh_lookup =		fake_neigh_lookup,
+	.mtu =			fake_mtu,
 };
 
 /*
-- 
cgit v1.2.3


From 7838f2ce36b6ab5c13ef20b1857e3bbd567f1759 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@redhat.com>
Date: Thu, 22 Dec 2011 02:05:07 +0000
Subject: mqprio: Avoid panic if no options are provided

Userspace may not provide TCA_OPTIONS; in fact, tc currently does
not do so if no arguments are specified on the command line.
Return EINVAL instead of panicking.

Signed-off-by: Thomas Graf <tgraf@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_mqprio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index f88256cbacbf..28de43092330 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -107,7 +107,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 	if (!netif_is_multiqueue(dev))
 		return -EOPNOTSUPP;
 
-	if (nla_len(opt) < sizeof(*qopt))
+	if (!opt || nla_len(opt) < sizeof(*qopt))
 		return -EINVAL;
 
 	qopt = nla_data(opt);
-- 
cgit v1.2.3


From e688a604807647c9450f9c12a7cb6d027150a895 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 22 Dec 2011 04:15:53 +0000
Subject: net: introduce DST_NOPEER dst flag

Chris Boot reported crashes occurring in ipv6_select_ident().

[  461.457562] RIP: 0010:[<ffffffff812dde61>]  [<ffffffff812dde61>]
ipv6_select_ident+0x31/0xa7

[  461.578229] Call Trace:
[  461.580742] <IRQ>
[  461.582870]  [<ffffffff812efa7f>] ? udp6_ufo_fragment+0x124/0x1a2
[  461.589054]  [<ffffffff812dbfe0>] ? ipv6_gso_segment+0xc0/0x155
[  461.595140]  [<ffffffff812700c6>] ? skb_gso_segment+0x208/0x28b
[  461.601198]  [<ffffffffa03f236b>] ? ipv6_confirm+0x146/0x15e
[nf_conntrack_ipv6]
[  461.608786]  [<ffffffff81291c4d>] ? nf_iterate+0x41/0x77
[  461.614227]  [<ffffffff81271d64>] ? dev_hard_start_xmit+0x357/0x543
[  461.620659]  [<ffffffff81291cf6>] ? nf_hook_slow+0x73/0x111
[  461.626440]  [<ffffffffa0379745>] ? br_parse_ip_options+0x19a/0x19a
[bridge]
[  461.633581]  [<ffffffff812722ff>] ? dev_queue_xmit+0x3af/0x459
[  461.639577]  [<ffffffffa03747d2>] ? br_dev_queue_push_xmit+0x72/0x76
[bridge]
[  461.646887]  [<ffffffffa03791e3>] ? br_nf_post_routing+0x17d/0x18f
[bridge]
[  461.653997]  [<ffffffff81291c4d>] ? nf_iterate+0x41/0x77
[  461.659473]  [<ffffffffa0374760>] ? br_flood+0xfa/0xfa [bridge]
[  461.665485]  [<ffffffff81291cf6>] ? nf_hook_slow+0x73/0x111
[  461.671234]  [<ffffffffa0374760>] ? br_flood+0xfa/0xfa [bridge]
[  461.677299]  [<ffffffffa0379215>] ?
nf_bridge_update_protocol+0x20/0x20 [bridge]
[  461.684891]  [<ffffffffa03bb0e5>] ? nf_ct_zone+0xa/0x17 [nf_conntrack]
[  461.691520]  [<ffffffffa0374760>] ? br_flood+0xfa/0xfa [bridge]
[  461.697572]  [<ffffffffa0374812>] ? NF_HOOK.constprop.8+0x3c/0x56
[bridge]
[  461.704616]  [<ffffffffa0379031>] ?
nf_bridge_push_encap_header+0x1c/0x26 [bridge]
[  461.712329]  [<ffffffffa037929f>] ? br_nf_forward_finish+0x8a/0x95
[bridge]
[  461.719490]  [<ffffffffa037900a>] ?
nf_bridge_pull_encap_header+0x1c/0x27 [bridge]
[  461.727223]  [<ffffffffa0379974>] ? br_nf_forward_ip+0x1c0/0x1d4 [bridge]
[  461.734292]  [<ffffffff81291c4d>] ? nf_iterate+0x41/0x77
[  461.739758]  [<ffffffffa03748cc>] ? __br_deliver+0xa0/0xa0 [bridge]
[  461.746203]  [<ffffffff81291cf6>] ? nf_hook_slow+0x73/0x111
[  461.751950]  [<ffffffffa03748cc>] ? __br_deliver+0xa0/0xa0 [bridge]
[  461.758378]  [<ffffffffa037533a>] ? NF_HOOK.constprop.4+0x56/0x56
[bridge]

This is caused by bridge netfilter special dst_entry (fake_rtable), a
special shared entry, where attaching an inetpeer makes no sense.

Problem is present since commit 87c48fa3b46 (ipv6: make fragment
identifications less predictable)

Introduce DST_NOPEER dst flag and make sure ipv6_select_ident() and
__ip_select_ident() fall back to the 'no peer attached' handling.

Reported-by: Chris Boot <bootc@bootc.net>
Tested-by: Chris Boot <bootc@bootc.net>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst.h         | 1 +
 net/bridge/br_netfilter.c | 2 +-
 net/ipv4/route.c          | 4 ++--
 net/ipv6/ip6_output.c     | 2 +-
 4 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/include/net/dst.h b/include/net/dst.h
index 6faec1a60216..75766b42660e 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -53,6 +53,7 @@ struct dst_entry {
 #define DST_NOHASH		0x0008
 #define DST_NOCACHE		0x0010
 #define DST_NOCOUNT		0x0020
+#define DST_NOPEER		0x0040
 
 	short			error;
 	short			obsolete;
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 08757dc670a4..fa8b8f763580 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -147,7 +147,7 @@ void br_netfilter_rtable_init(struct net_bridge *br)
 	rt->dst.dev = br->dev;
 	rt->dst.path = &rt->dst;
 	dst_init_metrics(&rt->dst, br_dst_default_metrics, true);
-	rt->dst.flags	= DST_NOXFRM;
+	rt->dst.flags	= DST_NOXFRM | DST_NOPEER;
 	rt->dst.ops = &fake_dst_ops;
 }
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 85cc053d9d6e..94cdbc55ca7e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1367,7 +1367,7 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 {
 	struct rtable *rt = (struct rtable *) dst;
 
-	if (rt) {
+	if (rt && !(rt->dst.flags & DST_NOPEER)) {
 		if (rt->peer == NULL)
 			rt_bind_peer(rt, rt->rt_dst, 1);
 
@@ -1378,7 +1378,7 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 			iph->id = htons(inet_getid(rt->peer, more));
 			return;
 		}
-	} else
+	} else if (!rt)
 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
 		       __builtin_return_address(0));
 
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 84d0bd5cac93..ec562713db9b 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -603,7 +603,7 @@ void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
 	static atomic_t ipv6_fragmentation_id;
 	int old, new;
 
-	if (rt) {
+	if (rt && !(rt->dst.flags & DST_NOPEER)) {
 		struct inet_peer *peer;
 
 		if (!rt->rt6i_peer)
-- 
cgit v1.2.3


From a0a129f8b6cff54ab479324a54aefdab5db4f240 Mon Sep 17 00:00:00 2001
From: Xi Wang <xi.wang@gmail.com>
Date: Thu, 22 Dec 2011 13:35:22 +0000
Subject: rps: fix insufficient bounds checking in
 store_rps_dev_flow_table_cnt()

Setting a large rps_flow_cnt like (1 << 30) on a 32-bit platform will
cause a kernel oops due to insufficient bounds checking.

	if (count > 1<<30) {
		/* Enforce a limit to prevent overflow */
		return -EINVAL;
	}
	count = roundup_pow_of_two(count);
	table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(count));

Note that the macro RPS_DEV_FLOW_TABLE_SIZE(count) is defined as:

	... + (count * sizeof(struct rps_dev_flow))

where sizeof(struct rps_dev_flow) is 8.  (1 << 30) * 8 will overflow
32 bits.

This patch replaces the magic number (1 << 30) with a symbolic bound.

Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Xi Wang <xi.wang@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net-sysfs.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index c71c434a4c05..385aefe53648 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -665,11 +665,14 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
 	if (count) {
 		int i;
 
-		if (count > 1<<30) {
+		if (count > INT_MAX)
+			return -EINVAL;
+		count = roundup_pow_of_two(count);
+		if (count > (ULONG_MAX - sizeof(struct rps_dev_flow_table))
+				/ sizeof(struct rps_dev_flow)) {
 			/* Enforce a limit to prevent overflow */
 			return -EINVAL;
 		}
-		count = roundup_pow_of_two(count);
 		table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(count));
 		if (!table)
 			return -ENOMEM;
-- 
cgit v1.2.3


From 0fd7bac6b6157eed6cf0cb86a1e88ba29e57c033 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 21 Dec 2011 07:11:44 +0000
Subject: net: relax rcvbuf limits

skb->truesize might be big even for a small packet.

It's even bigger after commit 87fb4b7b533 (net: more accurate skb
truesize) and big MTU.

We should allow queueing at least one packet per receiver, even with a
low RCVBUF setting.

Reported-by: Michal Simek <monstr@monstr.eu>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h     | 4 +++-
 net/core/sock.c        | 6 +-----
 net/packet/af_packet.c | 6 ++----
 3 files changed, 6 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/include/net/sock.h b/include/net/sock.h
index abb6e0f0c3c3..32e39371fba6 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -637,12 +637,14 @@ static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
 
 /*
  * Take into account size of receive queue and backlog queue
+ * Do not take into account this skb truesize,
+ * to allow even a single big packet to come.
  */
 static inline bool sk_rcvqueues_full(const struct sock *sk, const struct sk_buff *skb)
 {
 	unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc);
 
-	return qsize + skb->truesize > sk->sk_rcvbuf;
+	return qsize > sk->sk_rcvbuf;
 }
 
 /* The per-socket spinlock must be held here. */
diff --git a/net/core/sock.c b/net/core/sock.c
index 4ed7b1d12f5e..b23f174ab84c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -288,11 +288,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	unsigned long flags;
 	struct sk_buff_head *list = &sk->sk_receive_queue;
 
-	/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
-	   number of warnings when compiling with -W --ANK
-	 */
-	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
-	    (unsigned)sk->sk_rcvbuf) {
+	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 		atomic_inc(&sk->sk_drops);
 		trace_sock_rcvqueue_full(sk, skb);
 		return -ENOMEM;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 82a6f34d39d0..3891702b81df 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1630,8 +1630,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
 	if (snaplen > res)
 		snaplen = res;
 
-	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
-	    (unsigned)sk->sk_rcvbuf)
+	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
 		goto drop_n_acct;
 
 	if (skb_shared(skb)) {
@@ -1762,8 +1761,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	if (po->tp_version <= TPACKET_V2) {
 		if (macoff + snaplen > po->rx_ring.frame_size) {
 			if (po->copy_thresh &&
-				atomic_read(&sk->sk_rmem_alloc) + skb->truesize
-				< (unsigned)sk->sk_rcvbuf) {
+			    atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
 				if (skb_shared(skb)) {
 					copy_skb = skb_clone(skb, GFP_ATOMIC);
 				} else {
-- 
cgit v1.2.3


From 0354b48f633ae435acbc01b470a1ce8cfeff3e9f Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 16 Dec 2011 18:35:15 +0100
Subject: netfilter: xt_connbytes: handle negation correctly

"! --connbytes 23:42" should match if the packet/byte count is not in range.

As there is no explicit "invert match" toggle in the match structure,
userspace swaps the from and to arguments
(i.e., as if "--connbytes 42:23" were given).

However, "what <= 23 && what >= 42" will always be false.

Change things so we use "||" in case "from" is larger than "to".

This change may look like it breaks backwards compatibility when "to" is 0.
However, older iptables binaries will refuse "connbytes 42:0",
and current releases treat it to mean "! --connbytes 0:42",
so we should be fine.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_connbytes.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
index 5b138506690e..9ddf1c3bfb39 100644
--- a/net/netfilter/xt_connbytes.c
+++ b/net/netfilter/xt_connbytes.c
@@ -87,10 +87,10 @@ connbytes_mt(const struct sk_buff *skb, struct xt_action_param *par)
 		break;
 	}
 
-	if (sinfo->count.to)
+	if (sinfo->count.to >= sinfo->count.from)
 		return what <= sinfo->count.to && what >= sinfo->count.from;
-	else
-		return what >= sinfo->count.from;
+	else /* inverted */
+		return what < sinfo->count.to || what > sinfo->count.from;
 }
 
 static int connbytes_mt_check(const struct xt_mtchk_param *par)
-- 
cgit v1.2.3


From 81378f728fe560e175fb2e8fd33206793567e896 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sat, 24 Dec 2011 19:03:46 +0100
Subject: netfilter: ctnetlink: fix return value of ctnetlink_get_expect()

This fixes one bogus error that is returned to user-space:

libnetfilter_conntrack/utils# ./expect_get
TEST: get expectation (-1)(Unknown error 18446744073709551504)

This patch includes the correct handling for EAGAIN (nfnetlink
uses this error value to restart the operation after module
auto-loading).

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_netlink.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index ef21b221f036..3d7ea7af76fc 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1869,25 +1869,30 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
 
 	err = -ENOMEM;
 	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (skb2 == NULL)
+	if (skb2 == NULL) {
+		nf_ct_expect_put(exp);
 		goto out;
+	}
 
 	rcu_read_lock();
 	err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid,
 				      nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp);
 	rcu_read_unlock();
+	nf_ct_expect_put(exp);
 	if (err <= 0)
 		goto free;
 
-	nf_ct_expect_put(exp);
+	err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+	if (err < 0)
+		goto out;
 
-	return netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+	return 0;
 
 free:
 	kfree_skb(skb2);
 out:
-	nf_ct_expect_put(exp);
-	return err;
+	/* this avoids a loop in nfnetlink. */
+	return err == -EAGAIN ? -ENOBUFS : err;
 }
 
 static int
-- 
cgit v1.2.3


From 1a31a4a8388a90e9240fb4e5e5c9c909fcfdfd0e Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sat, 24 Dec 2011 19:28:47 +0100
Subject: netfilter: ctnetlink: fix scheduling while atomic if helper is
 autoloaded

This patch fixes one scheduling while atomic error:

[  385.565186] ctnetlink v0.93: registering with nfnetlink.
[  385.565349] BUG: scheduling while atomic: lt-expect_creat/16163/0x00000200

It can be triggered with utils/expect_create included in
libnetfilter_conntrack if the FTP helper is not loaded.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_netlink.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 3d7ea7af76fc..b6977776d715 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1358,12 +1358,15 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,
 						    nf_ct_protonum(ct));
 		if (helper == NULL) {
 			rcu_read_unlock();
+			spin_unlock_bh(&nf_conntrack_lock);
 #ifdef CONFIG_MODULES
 			if (request_module("nfct-helper-%s", helpname) < 0) {
+				spin_lock_bh(&nf_conntrack_lock);
 				err = -EOPNOTSUPP;
 				goto err1;
 			}
 
+			spin_lock_bh(&nf_conntrack_lock);
 			rcu_read_lock();
 			helper = __nf_conntrack_helper_find(helpname,
 							    nf_ct_l3num(ct),
-- 
cgit v1.2.3


From bb52c7acf871537a468433775151339f783d2e8c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 23 Dec 2011 19:28:51 +0000
Subject: netem: dont call vfree() under spinlock and BH disabled

commit 6373a9a286 (netem: use vmalloc for distribution table) added a
regression, since vfree() is called while holding a spinlock and BH
being disabled.

Fix this by doing the pointer swap in the critical section, and freeing
after the spinlock is released.

Also add __GFP_NOWARN to the kmalloc() try, since we fall back to
vmalloc().

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_netem.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index eb3b9a86c6ed..a4ab207cdc59 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -488,7 +488,7 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
 		return -EINVAL;
 
 	s = sizeof(struct disttable) + n * sizeof(s16);
-	d = kmalloc(s, GFP_KERNEL);
+	d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN);
 	if (!d)
 		d = vmalloc(s);
 	if (!d)
@@ -501,9 +501,10 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
 	root_lock = qdisc_root_sleeping_lock(sch);
 
 	spin_lock_bh(root_lock);
-	dist_free(q->delay_dist);
-	q->delay_dist = d;
+	swap(q->delay_dist, d);
 	spin_unlock_bh(root_lock);
+
+	dist_free(d);
 	return 0;
 }
 
-- 
cgit v1.2.3


From aef950b4ba3196622a5bd5e21ab1d63f30658285 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Tue, 27 Dec 2011 22:32:41 -0500
Subject: packet: fix possible dev refcnt leak when bind fail

If bind fails when it is called after setting the PACKET_FANOUT
sock option, the dev refcnt will leak.

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 3891702b81df..d9d4970b9b07 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2448,8 +2448,12 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protoc
 {
 	struct packet_sock *po = pkt_sk(sk);
 
-	if (po->fanout)
+	if (po->fanout) {
+		if (dev)
+			dev_put(dev);
+
 		return -EINVAL;
+	}
 
 	lock_sock(sk);
 
-- 
cgit v1.2.3


From 52793dbe3d60bd73bbebe28b2bfc9f6b4b920d4c Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 30 Dec 2011 14:19:02 +0900
Subject: ipvs: try also real server with port 0 in backup server

	We should not forget to try for real server with port 0
in the backup server when processing the sync message. We should
do it in all cases because the backup server can use different
forwarding method.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/ip_vs.h             |  2 +-
 net/netfilter/ipvs/ip_vs_conn.c |  2 +-
 net/netfilter/ipvs/ip_vs_ctl.c  | 10 ++++++++--
 net/netfilter/ipvs/ip_vs_sync.c |  2 +-
 4 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 873d5be7926c..e5a7b9aaf552 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1207,7 +1207,7 @@ extern void ip_vs_control_cleanup(void);
 extern struct ip_vs_dest *
 ip_vs_find_dest(struct net *net, int af, const union nf_inet_addr *daddr,
 		__be16 dport, const union nf_inet_addr *vaddr, __be16 vport,
-		__u16 protocol, __u32 fwmark);
+		__u16 protocol, __u32 fwmark, __u32 flags);
 extern struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp);
 
 
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 12571fb2881c..29fa5badde75 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -616,7 +616,7 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
 	if ((cp) && (!cp->dest)) {
 		dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr,
 				       cp->dport, &cp->vaddr, cp->vport,
-				       cp->protocol, cp->fwmark);
+				       cp->protocol, cp->fwmark, cp->flags);
 		ip_vs_bind_dest(cp, dest);
 		return dest;
 	} else
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 008bf97cc91a..e1a66cf37f9a 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -619,15 +619,21 @@ struct ip_vs_dest *ip_vs_find_dest(struct net  *net, int af,
 				   const union nf_inet_addr *daddr,
 				   __be16 dport,
 				   const union nf_inet_addr *vaddr,
-				   __be16 vport, __u16 protocol, __u32 fwmark)
+				   __be16 vport, __u16 protocol, __u32 fwmark,
+				   __u32 flags)
 {
 	struct ip_vs_dest *dest;
 	struct ip_vs_service *svc;
+	__be16 port = dport;
 
 	svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport);
 	if (!svc)
 		return NULL;
-	dest = ip_vs_lookup_dest(svc, daddr, dport);
+	if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
+		port = 0;
+	dest = ip_vs_lookup_dest(svc, daddr, port);
+	if (!dest)
+		dest = ip_vs_lookup_dest(svc, daddr, port ^ dport);
 	if (dest)
 		atomic_inc(&dest->refcnt);
 	ip_vs_service_put(svc);
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 3cdd479f9b5d..2b6678c0ce14 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -740,7 +740,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
 		 * but still handled.
 		 */
 		dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr,
-				       param->vport, protocol, fwmark);
+				       param->vport, protocol, fwmark, flags);
 
 		/*  Set the approprite ativity flag */
 		if (protocol == IPPROTO_TCP) {
-- 
cgit v1.2.3


From c121638277a71c1e1fb44c3e654ea353357bbc2c Mon Sep 17 00:00:00 2001
From: Xi Wang <xi.wang@gmail.com>
Date: Fri, 30 Dec 2011 10:40:17 -0500
Subject: netfilter: ctnetlink: fix timeout calculation

The sanity check (timeout < 0) never works; the dividend is unsigned
and so is the division, which should have been a signed division.

	long timeout = (ct->timeout.expires - jiffies) / HZ;
	if (timeout < 0)
		timeout = 0;

This patch converts the time values to signed for the division.

Signed-off-by: Xi Wang <xi.wang@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_netlink.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index b6977776d715..257e77256c5c 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -135,7 +135,7 @@ nla_put_failure:
 static inline int
 ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct)
 {
-	long timeout = (ct->timeout.expires - jiffies) / HZ;
+	long timeout = ((long)ct->timeout.expires - (long)jiffies) / HZ;
 
 	if (timeout < 0)
 		timeout = 0;
@@ -1641,7 +1641,7 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb,
 			  const struct nf_conntrack_expect *exp)
 {
 	struct nf_conn *master = exp->master;
-	long timeout = (exp->timeout.expires - jiffies) / HZ;
+	long timeout = ((long)exp->timeout.expires - (long)jiffies) / HZ;
 	struct nf_conn_help *help;
 
 	if (timeout < 0)
-- 
cgit v1.2.3