From ef609c238a8ea163cb0af759cc73c9e2555c89da Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 3 Apr 2016 12:37:15 +0800
Subject: sunrpc: Fix skcipher/shash conversion

The skcpiher/shash conversion introduced a number of bugs in the
sunrpc code:

1) Missing calls to skcipher_request_set_tfm lead to crashes.
2) The allocation size of shash_desc is too small which leads to
memory corruption.

Fixes: 3b5cf20cf439 ("sunrpc: Use skcipher and ahash/shash")
Reported-by: J. Bruce Fields <bfields@fieldses.org>
Tested-by: J. Bruce Fields <bfields@fieldses.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 net/sunrpc/auth_gss/gss_krb5_crypto.c | 8 ++++++--
 net/sunrpc/auth_gss/gss_krb5_mech.c   | 3 ++-
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index d94a8e1e9f05..da26455cf86c 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -78,6 +78,7 @@ krb5_encrypt(
 	memcpy(out, in, length);
 	sg_init_one(sg, out, length);
 
+	skcipher_request_set_tfm(req, tfm);
 	skcipher_request_set_callback(req, 0, NULL, NULL);
 	skcipher_request_set_crypt(req, sg, sg, length, local_iv);
 
@@ -115,6 +116,7 @@ krb5_decrypt(
 	memcpy(out, in, length);
 	sg_init_one(sg, out, length);
 
+	skcipher_request_set_tfm(req, tfm);
 	skcipher_request_set_callback(req, 0, NULL, NULL);
 	skcipher_request_set_crypt(req, sg, sg, length, local_iv);
 
@@ -946,7 +948,8 @@ krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher,
 		return PTR_ERR(hmac);
 	}
 
-	desc = kmalloc(sizeof(*desc), GFP_KERNEL);
+	desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac),
+		       GFP_KERNEL);
 	if (!desc) {
 		dprintk("%s: failed to allocate shash descriptor for '%s'\n",
 			__func__, kctx->gk5e->cksum_name);
@@ -1012,7 +1015,8 @@ krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher,
 		return PTR_ERR(hmac);
 	}
 
-	desc = kmalloc(sizeof(*desc), GFP_KERNEL);
+	desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac),
+		       GFP_KERNEL);
 	if (!desc) {
 		dprintk("%s: failed to allocate shash descriptor for '%s'\n",
 			__func__, kctx->gk5e->cksum_name);
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 71341ccb9890..65427492b1c9 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -451,7 +451,8 @@ context_derive_keys_rc4(struct krb5_ctx *ctx)
 		goto out_err_free_hmac;
 
 
-	desc = kmalloc(sizeof(*desc), GFP_KERNEL);
+	desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac),
+		       GFP_KERNEL);
 	if (!desc) {
 		dprintk("%s: failed to allocate hash descriptor for '%s'\n",
 			__func__, ctx->gk5e->cksum_name);
-- 
cgit v1.2.3


From ff76def3bd7e816fe0ca7f0840065c566a42ba2f Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 29 Mar 2016 11:05:16 +0200
Subject: netfilter: arp_tables: register table in initns

arptables is broken since we didn't register the table anymore --
even 'arptables -L' fails.

Fixes: b9e69e127397187b ("netfilter: xtables: don't hook tables by default")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/arptable_filter.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index dd8c80dc32a2..8f8713b4388f 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -81,6 +81,12 @@ static int __init arptable_filter_init(void)
 		return ret;
 	}
 
+	ret = arptable_filter_table_init(&init_net);
+	if (ret) {
+		unregister_pernet_subsys(&arptable_filter_net_ops);
+		kfree(arpfilter_ops);
+	}
+
 	return ret;
 }
 
-- 
cgit v1.2.3


From 644c7e48cb59cfc6988ddc7bf3d3b1ba5fe7fa9d Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Wed, 30 Mar 2016 11:34:35 +0200
Subject: netfilter: nf_conntrack_tcp: Fix stack out of bounds when parsing TCP
 options

Baozeng Ding reported a KASAN stack out of bounds issue - it uncovered that
the TCP option parsing routines in netfilter TCP connection tracking could
read one byte out of the buffer of the TCP options.  Therefore in the patch
we check that the available data length is large enough to parse both TCP
option code and size.

Reported-by: Baozeng Ding <sploving1@gmail.com>
Tested-by: Baozeng Ding <sploving1@gmail.com>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_proto_tcp.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 278f3b9356ef..7cc1d9c22a9f 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -410,6 +410,8 @@ static void tcp_options(const struct sk_buff *skb,
 			length--;
 			continue;
 		default:
+			if (length < 2)
+				return;
 			opsize=*ptr++;
 			if (opsize < 2) /* "silly options" */
 				return;
@@ -470,6 +472,8 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
 			length--;
 			continue;
 		default:
+			if (length < 2)
+				return;
 			opsize = *ptr++;
 			if (opsize < 2) /* "silly options" */
 				return;
-- 
cgit v1.2.3


From ba6f5e33bdbb9ed2014b778fbbaecf20060ca989 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Wed, 6 Apr 2016 15:15:19 -0300
Subject: sctp: avoid refreshing heartbeat timer too often

Currently on high rate SCTP streams the heartbeat timer refresh can
consume quite a lot of resources as timer updates are costly and it
contains a random factor, which a) is also costly and b) invalidates
mod_timer() optimization for not editing a timer to the same value.
It may even cause the timer to be slightly advanced, for no good reason.

As suggested by David Laight this patch now removes this timer update
from hot path by leaving the timer on and re-evaluating upon its
expiration if the heartbeat is still needed or not, similarly to what is
done for TCP. If it's not needed anymore the timer is re-scheduled to
the new timeout, considering the time already elapsed.

For this, we now record the last tx timestamp per transport, updated in
the same spots as hb timer was restarted on tx. Also split up
sctp_transport_reset_timers into sctp_transport_reset_t3_rtx and
sctp_transport_reset_hb_timer, so we can re-arm T3 without re-arming the
heartbeat one.

On loopback with MTU of 65535 and data chunks with 1636, so that we
have a considerable amount of chunks without stressing system calls,
netperf -t SCTP_STREAM -l 30, perf looked like this before:

Samples: 103K of event 'cpu-clock', Event count (approx.): 25833000000
  Overhead  Command  Shared Object      Symbol
+    6,15%  netperf  [kernel.vmlinux]   [k] copy_user_enhanced_fast_string
-    5,43%  netperf  [kernel.vmlinux]   [k] _raw_write_unlock_irqrestore
   - _raw_write_unlock_irqrestore
      - 96,54% _raw_spin_unlock_irqrestore
         - 36,14% mod_timer
            + 97,24% sctp_transport_reset_timers
            + 2,76% sctp_do_sm
         + 33,65% __wake_up_sync_key
         + 28,77% sctp_ulpq_tail_event
         + 1,40% del_timer
      - 1,84% mod_timer
         + 99,03% sctp_transport_reset_timers
         + 0,97% sctp_do_sm
      + 1,50% sctp_ulpq_tail_event

And after this patch, now with netperf -l 60:

Samples: 230K of event 'cpu-clock', Event count (approx.): 57707250000
  Overhead  Command  Shared Object      Symbol
+    5,65%  netperf  [kernel.vmlinux]   [k] memcpy_erms
+    5,59%  netperf  [kernel.vmlinux]   [k] copy_user_enhanced_fast_string
-    5,05%  netperf  [kernel.vmlinux]   [k] _raw_spin_unlock_irqrestore
   - _raw_spin_unlock_irqrestore
      + 49,89% __wake_up_sync_key
      + 45,68% sctp_ulpq_tail_event
      - 2,85% mod_timer
         + 76,51% sctp_transport_reset_t3_rtx
         + 23,49% sctp_do_sm
      + 1,55% del_timer
+    2,50%  netperf  [sctp]             [k] sctp_datamsg_from_user
+    2,26%  netperf  [sctp]             [k] sctp_sendmsg

Throughput-wise, from 6800mbps without the patch to 7050mbps with it,
~3.7%.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h |  8 +++++++-
 net/sctp/outqueue.c        | 15 ++++++++++-----
 net/sctp/sm_make_chunk.c   |  3 +--
 net/sctp/sm_sideeffect.c   | 36 ++++++++++++++++--------------------
 net/sctp/transport.c       | 19 +++++++++++++------
 5 files changed, 47 insertions(+), 34 deletions(-)

(limited to 'net')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 6df1ce7a411c..5a404c354f4c 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -847,6 +847,11 @@ struct sctp_transport {
 	 */
 	ktime_t last_time_heard;
 
+	/* When was the last time that we sent a chunk using this
+	 * transport? We use this to check for idle transports
+	 */
+	unsigned long last_time_sent;
+
 	/* Last time(in jiffies) when cwnd is reduced due to the congestion
 	 * indication based on ECNE chunk.
 	 */
@@ -952,7 +957,8 @@ void sctp_transport_route(struct sctp_transport *, union sctp_addr *,
 			  struct sctp_sock *);
 void sctp_transport_pmtu(struct sctp_transport *, struct sock *sk);
 void sctp_transport_free(struct sctp_transport *);
-void sctp_transport_reset_timers(struct sctp_transport *);
+void sctp_transport_reset_t3_rtx(struct sctp_transport *);
+void sctp_transport_reset_hb_timer(struct sctp_transport *);
 int sctp_transport_hold(struct sctp_transport *);
 void sctp_transport_put(struct sctp_transport *);
 void sctp_transport_update_rto(struct sctp_transport *, __u32);
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 8d3d3625130e..084718f9b3da 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -866,8 +866,10 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 				 * sender MUST assure that at least one T3-rtx
 				 * timer is running.
 				 */
-				if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN)
-					sctp_transport_reset_timers(transport);
+				if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN) {
+					sctp_transport_reset_t3_rtx(transport);
+					transport->last_time_sent = jiffies;
+				}
 			}
 			break;
 
@@ -924,8 +926,10 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 			error = sctp_outq_flush_rtx(q, packet,
 						    rtx_timeout, &start_timer);
 
-			if (start_timer)
-				sctp_transport_reset_timers(transport);
+			if (start_timer) {
+				sctp_transport_reset_t3_rtx(transport);
+				transport->last_time_sent = jiffies;
+			}
 
 			/* This can happen on COOKIE-ECHO resend.  Only
 			 * one chunk can get bundled with a COOKIE-ECHO.
@@ -1062,7 +1066,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 			list_add_tail(&chunk->transmitted_list,
 				      &transport->transmitted);
 
-			sctp_transport_reset_timers(transport);
+			sctp_transport_reset_t3_rtx(transport);
+			transport->last_time_sent = jiffies;
 
 			/* Only let one DATA chunk get bundled with a
 			 * COOKIE-ECHO chunk.
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 7f0bf798205b..56f364d8f932 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -3080,8 +3080,7 @@ static __be16 sctp_process_asconf_param(struct sctp_association *asoc,
 			return SCTP_ERROR_RSRC_LOW;
 
 		/* Start the heartbeat timer. */
-		if (!mod_timer(&peer->hb_timer, sctp_transport_timeout(peer)))
-			sctp_transport_hold(peer);
+		sctp_transport_reset_hb_timer(peer);
 		asoc->new_transport = peer;
 		break;
 	case SCTP_PARAM_DEL_IP:
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 7fe56d0acabf..41b081a64752 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -69,8 +69,6 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
 			     sctp_cmd_seq_t *commands,
 			     gfp_t gfp);
 
-static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds,
-				     struct sctp_transport *t);
 /********************************************************************
  * Helper functions
  ********************************************************************/
@@ -367,6 +365,7 @@ void sctp_generate_heartbeat_event(unsigned long data)
 	struct sctp_association *asoc = transport->asoc;
 	struct sock *sk = asoc->base.sk;
 	struct net *net = sock_net(sk);
+	u32 elapsed, timeout;
 
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk)) {
@@ -378,6 +377,16 @@ void sctp_generate_heartbeat_event(unsigned long data)
 		goto out_unlock;
 	}
 
+	/* Check if we should still send the heartbeat or reschedule */
+	elapsed = jiffies - transport->last_time_sent;
+	timeout = sctp_transport_timeout(transport);
+	if (elapsed < timeout) {
+		elapsed = timeout - elapsed;
+		if (!mod_timer(&transport->hb_timer, jiffies + elapsed))
+			sctp_transport_hold(transport);
+		goto out_unlock;
+	}
+
 	error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT,
 			   SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_HEARTBEAT),
 			   asoc->state, asoc->ep, asoc,
@@ -507,7 +516,7 @@ static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands,
 					     0);
 
 		/* Update the hb timer to resend a heartbeat every rto */
-		sctp_cmd_hb_timer_update(commands, transport);
+		sctp_transport_reset_hb_timer(transport);
 	}
 
 	if (transport->state != SCTP_INACTIVE &&
@@ -634,11 +643,8 @@ static void sctp_cmd_hb_timers_start(sctp_cmd_seq_t *cmds,
 	 * hold a reference on the transport to make sure none of
 	 * the needed data structures go away.
 	 */
-	list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) {
-
-		if (!mod_timer(&t->hb_timer, sctp_transport_timeout(t)))
-			sctp_transport_hold(t);
-	}
+	list_for_each_entry(t, &asoc->peer.transport_addr_list, transports)
+		sctp_transport_reset_hb_timer(t);
 }
 
 static void sctp_cmd_hb_timers_stop(sctp_cmd_seq_t *cmds,
@@ -669,15 +675,6 @@ static void sctp_cmd_t3_rtx_timers_stop(sctp_cmd_seq_t *cmds,
 }
 
 
-/* Helper function to update the heartbeat timer. */
-static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds,
-				     struct sctp_transport *t)
-{
-	/* Update the heartbeat timer.  */
-	if (!mod_timer(&t->hb_timer, sctp_transport_timeout(t)))
-		sctp_transport_hold(t);
-}
-
 /* Helper function to handle the reception of an HEARTBEAT ACK.  */
 static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
 				  struct sctp_association *asoc,
@@ -742,8 +739,7 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
 	sctp_transport_update_rto(t, (jiffies - hbinfo->sent_at));
 
 	/* Update the heartbeat timer.  */
-	if (!mod_timer(&t->hb_timer, sctp_transport_timeout(t)))
-		sctp_transport_hold(t);
+	sctp_transport_reset_hb_timer(t);
 
 	if (was_unconfirmed && asoc->peer.transport_count == 1)
 		sctp_transport_immediate_rtx(t);
@@ -1614,7 +1610,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_HB_TIMER_UPDATE:
 			t = cmd->obj.transport;
-			sctp_cmd_hb_timer_update(commands, t);
+			sctp_transport_reset_hb_timer(t);
 			break;
 
 		case SCTP_CMD_HB_TIMERS_STOP:
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 9b6b48c7524e..81b86678be4d 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -183,7 +183,7 @@ static void sctp_transport_destroy(struct sctp_transport *transport)
 /* Start T3_rtx timer if it is not already running and update the heartbeat
  * timer.  This routine is called every time a DATA chunk is sent.
  */
-void sctp_transport_reset_timers(struct sctp_transport *transport)
+void sctp_transport_reset_t3_rtx(struct sctp_transport *transport)
 {
 	/* RFC 2960 6.3.2 Retransmission Timer Rules
 	 *
@@ -197,11 +197,18 @@ void sctp_transport_reset_timers(struct sctp_transport *transport)
 		if (!mod_timer(&transport->T3_rtx_timer,
 			       jiffies + transport->rto))
 			sctp_transport_hold(transport);
+}
+
+void sctp_transport_reset_hb_timer(struct sctp_transport *transport)
+{
+	unsigned long expires;
 
 	/* When a data chunk is sent, reset the heartbeat interval.  */
-	if (!mod_timer(&transport->hb_timer,
-		       sctp_transport_timeout(transport)))
-	    sctp_transport_hold(transport);
+	expires = jiffies + sctp_transport_timeout(transport);
+	if (time_before(transport->hb_timer.expires, expires) &&
+	    !mod_timer(&transport->hb_timer,
+		       expires + prandom_u32_max(transport->rto)))
+		sctp_transport_hold(transport);
 }
 
 /* This transport has been assigned to an association.
@@ -595,13 +602,13 @@ void sctp_transport_burst_reset(struct sctp_transport *t)
 unsigned long sctp_transport_timeout(struct sctp_transport *trans)
 {
 	/* RTO + timer slack +/- 50% of RTO */
-	unsigned long timeout = (trans->rto >> 1) + prandom_u32_max(trans->rto);
+	unsigned long timeout = trans->rto >> 1;
 
 	if (trans->state != SCTP_UNCONFIRMED &&
 	    trans->state != SCTP_PF)
 		timeout += trans->hbinterval;
 
-	return timeout + jiffies;
+	return timeout;
 }
 
 /* Reset transport variables to their initial values */
-- 
cgit v1.2.3


From a36a0d4008488fa545c74445d69eaf56377d5d4e Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 10 Apr 2016 23:01:30 -0400
Subject: decnet: Do not build routes to devices without decnet private data.

In particular, make sure we check for decnet private presence
for loopback devices.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/decnet/dn_route.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 607a14f20d88..b1dc096d22f8 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1034,10 +1034,13 @@ source_ok:
 	if (!fld.daddr) {
 		fld.daddr = fld.saddr;
 
-		err = -EADDRNOTAVAIL;
 		if (dev_out)
 			dev_put(dev_out);
+		err = -EINVAL;
 		dev_out = init_net.loopback_dev;
+		if (!dev_out->dn_ptr)
+			goto out;
+		err = -EADDRNOTAVAIL;
 		dev_hold(dev_out);
 		if (!fld.daddr) {
 			fld.daddr =
@@ -1110,6 +1113,8 @@ source_ok:
 		if (dev_out == NULL)
 			goto out;
 		dn_db = rcu_dereference_raw(dev_out->dn_ptr);
+		if (!dn_db)
+			goto e_inval;
 		/* Possible improvement - check all devices for local addr */
 		if (dn_dev_islocal(dev_out, fld.daddr)) {
 			dev_put(dev_out);
@@ -1151,6 +1156,8 @@ select_source:
 			dev_put(dev_out);
 		dev_out = init_net.loopback_dev;
 		dev_hold(dev_out);
+		if (!dev_out->dn_ptr)
+			goto e_inval;
 		fld.flowidn_oif = dev_out->ifindex;
 		if (res.fi)
 			dn_fib_info_put(res.fi);
-- 
cgit v1.2.3


From e27260203912b40751fa353d009eaa5a642c739f Mon Sep 17 00:00:00 2001
From: Dmitry Ivanov <dmitrijs.ivanovs@ubnt.com>
Date: Thu, 7 Apr 2016 09:31:38 +0200
Subject: netlink: don't send NETLINK_URELEASE for unbound sockets

All existing users of NETLINK_URELEASE use it to clean up resources that
were previously allocated to a socket via some command. As a result, no
users require getting this notification for unbound sockets.

Sending it for unbound sockets, however, is a problem because any user
(including unprivileged users) can create a socket that uses the same ID
as an existing socket. Binding this new socket will fail, but if the
NETLINK_URELEASE notification is generated for such sockets, the users
thereof will be tricked into thinking the socket that they allocated the
resources for is closed.

In the nl80211 case, this will cause destruction of virtual interfaces
that still belong to an existing hostapd process; this is the case that
Dmitry noticed. In the NFC case, it will cause a poll abort. In the case
of netlink log/queue it will cause them to stop reporting events, as if
NFULNL_CFG_CMD_UNBIND/NFQNL_CFG_CMD_UNBIND had been called.

Fix this problem by checking that the socket is bound before generating
the NETLINK_URELEASE notification.

Cc: stable@vger.kernel.org
Signed-off-by: Dmitry Ivanov <dima@ubnt.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlink/af_netlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 215fc08c02ab..330ebd600f25 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -688,7 +688,7 @@ static int netlink_release(struct socket *sock)
 
 	skb_queue_purge(&sk->sk_write_queue);
 
-	if (nlk->portid) {
+	if (nlk->portid && nlk->bound) {
 		struct netlink_notify n = {
 						.net = sock_net(sk),
 						.protocol = sk->sk_protocol,
-- 
cgit v1.2.3


From 541726abe7daca64390c2ec34e6a203145f1686d Mon Sep 17 00:00:00 2001
From: Erik Hugne <erik.hugne@gmail.com>
Date: Thu, 7 Apr 2016 10:40:43 -0400
Subject: tipc: make dist queue pernet

Nametable updates received from the network that cannot be applied
immediately are placed on a defer queue. This queue is global to the
TIPC module, which might cause problems when using TIPC in containers.
To prevent nametable updates from escaping into the wrong namespace,
we make the queue pernet instead.

Signed-off-by: Erik Hugne <erik.hugne@gmail.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/core.c       |  1 +
 net/tipc/core.h       |  3 +++
 net/tipc/name_distr.c | 16 +++++++---------
 3 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/tipc/core.c b/net/tipc/core.c
index 03a842870c52..e2bdb07a49a2 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -69,6 +69,7 @@ static int __net_init tipc_init_net(struct net *net)
 	if (err)
 		goto out_nametbl;
 
+	INIT_LIST_HEAD(&tn->dist_queue);
 	err = tipc_topsrv_start(net);
 	if (err)
 		goto out_subscr;
diff --git a/net/tipc/core.h b/net/tipc/core.h
index 5504d63503df..eff58dc53aa1 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -103,6 +103,9 @@ struct tipc_net {
 	spinlock_t nametbl_lock;
 	struct name_table *nametbl;
 
+	/* Name dist queue */
+	struct list_head dist_queue;
+
 	/* Topology subscription server */
 	struct tipc_server *topsrv;
 	atomic_t subscription_count;
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index ebe9d0ff6e9e..4f4f5810f223 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -40,11 +40,6 @@
 
 int sysctl_tipc_named_timeout __read_mostly = 2000;
 
-/**
- * struct tipc_dist_queue - queue holding deferred name table updates
- */
-static struct list_head tipc_dist_queue = LIST_HEAD_INIT(tipc_dist_queue);
-
 struct distr_queue_item {
 	struct distr_item i;
 	u32 dtype;
@@ -279,9 +274,11 @@ static bool tipc_update_nametbl(struct net *net, struct distr_item *i,
  * tipc_named_add_backlog - add a failed name table update to the backlog
  *
  */
-static void tipc_named_add_backlog(struct distr_item *i, u32 type, u32 node)
+static void tipc_named_add_backlog(struct net *net, struct distr_item *i,
+				   u32 type, u32 node)
 {
 	struct distr_queue_item *e;
+	struct tipc_net *tn = net_generic(net, tipc_net_id);
 	unsigned long now = get_jiffies_64();
 
 	e = kzalloc(sizeof(*e), GFP_ATOMIC);
@@ -291,7 +288,7 @@ static void tipc_named_add_backlog(struct distr_item *i, u32 type, u32 node)
 	e->node = node;
 	e->expires = now + msecs_to_jiffies(sysctl_tipc_named_timeout);
 	memcpy(e, i, sizeof(*i));
-	list_add_tail(&e->next, &tipc_dist_queue);
+	list_add_tail(&e->next, &tn->dist_queue);
 }
 
 /**
@@ -301,10 +298,11 @@ static void tipc_named_add_backlog(struct distr_item *i, u32 type, u32 node)
 void tipc_named_process_backlog(struct net *net)
 {
 	struct distr_queue_item *e, *tmp;
+	struct tipc_net *tn = net_generic(net, tipc_net_id);
 	char addr[16];
 	unsigned long now = get_jiffies_64();
 
-	list_for_each_entry_safe(e, tmp, &tipc_dist_queue, next) {
+	list_for_each_entry_safe(e, tmp, &tn->dist_queue, next) {
 		if (time_after(e->expires, now)) {
 			if (!tipc_update_nametbl(net, &e->i, e->node, e->dtype))
 				continue;
@@ -344,7 +342,7 @@ void tipc_named_rcv(struct net *net, struct sk_buff_head *inputq)
 		node = msg_orignode(msg);
 		while (count--) {
 			if (!tipc_update_nametbl(net, item, node, mtype))
-				tipc_named_add_backlog(item, mtype, node);
+				tipc_named_add_backlog(net, item, mtype, node);
 			item++;
 		}
 		kfree_skb(skb);
-- 
cgit v1.2.3


From ddb1d33969ef52687ad00f75eecf87029f62e382 Mon Sep 17 00:00:00 2001
From: Erik Hugne <erik.hugne@gmail.com>
Date: Thu, 7 Apr 2016 10:40:44 -0400
Subject: tipc: purge deferred updates from dead nodes

If a peer node becomes unavailable, in addition to removing the
nametable entries from this node we also need to purge all deferred
updates associated with this node.

Signed-off-by: Erik Hugne <erik.hugne@gmail.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/name_distr.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'net')

diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index 4f4f5810f223..6b626a64b517 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -224,12 +224,31 @@ static void tipc_publ_purge(struct net *net, struct publication *publ, u32 addr)
 	kfree_rcu(p, rcu);
 }
 
+/**
+ * tipc_dist_queue_purge - remove deferred updates from a node that went down
+ */
+static void tipc_dist_queue_purge(struct net *net, u32 addr)
+{
+	struct tipc_net *tn = net_generic(net, tipc_net_id);
+	struct distr_queue_item *e, *tmp;
+
+	spin_lock_bh(&tn->nametbl_lock);
+	list_for_each_entry_safe(e, tmp, &tn->dist_queue, next) {
+		if (e->node != addr)
+			continue;
+		list_del(&e->next);
+		kfree(e);
+	}
+	spin_unlock_bh(&tn->nametbl_lock);
+}
+
 void tipc_publ_notify(struct net *net, struct list_head *nsub_list, u32 addr)
 {
 	struct publication *publ, *tmp;
 
 	list_for_each_entry_safe(publ, tmp, nsub_list, nodesub_list)
 		tipc_publ_purge(net, publ, addr);
+	tipc_dist_queue_purge(net, addr);
 }
 
 /**
-- 
cgit v1.2.3


From 9ab179d83b4e31ea277a123492e419067c2f129a Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Thu, 7 Apr 2016 11:10:06 -0700
Subject: net: vrf: Fix dst reference counting

Vivek reported a kernel exception deleting a VRF with an active
connection through it. The root cause is that the socket has a cached
reference to a dst that is destroyed. Converting the dst_destroy to
dst_release and letting proper reference counting kick in does not
work as the dst has a reference to the device which needs to be released
as well.

I talked to Hannes about this at netdev and he pointed out the ipv4 and
ipv6 dst handling has dst_ifdown for just this scenario. Rather than
continuing with the reinvented dst wheel in VRF just remove it and
leverage the ipv4 and ipv6 versions.

Fixes: 193125dbd8eb2 ("net: Introduce VRF device driver")
Fixes: 35402e3136634 ("net: Add IPv6 support to VRF device")

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vrf.c       | 177 +++++-------------------------------------------
 include/net/ip6_route.h |   3 +
 include/net/route.h     |   3 +
 net/ipv4/route.c        |   7 +-
 net/ipv6/route.c        |   7 +-
 5 files changed, 30 insertions(+), 167 deletions(-)

(limited to 'net')

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 9a9fabb900c1..8a8f1e58b415 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -60,41 +60,6 @@ struct pcpu_dstats {
 	struct u64_stats_sync	syncp;
 };
 
-static struct dst_entry *vrf_ip_check(struct dst_entry *dst, u32 cookie)
-{
-	return dst;
-}
-
-static int vrf_ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
-{
-	return ip_local_out(net, sk, skb);
-}
-
-static unsigned int vrf_v4_mtu(const struct dst_entry *dst)
-{
-	/* TO-DO: return max ethernet size? */
-	return dst->dev->mtu;
-}
-
-static void vrf_dst_destroy(struct dst_entry *dst)
-{
-	/* our dst lives forever - or until the device is closed */
-}
-
-static unsigned int vrf_default_advmss(const struct dst_entry *dst)
-{
-	return 65535 - 40;
-}
-
-static struct dst_ops vrf_dst_ops = {
-	.family		= AF_INET,
-	.local_out	= vrf_ip_local_out,
-	.check		= vrf_ip_check,
-	.mtu		= vrf_v4_mtu,
-	.destroy	= vrf_dst_destroy,
-	.default_advmss	= vrf_default_advmss,
-};
-
 /* neighbor handling is done with actual device; do not want
  * to flip skb->dev for those ndisc packets. This really fails
  * for multiple next protocols (e.g., NEXTHDR_HOP). But it is
@@ -349,46 +314,6 @@ static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev)
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
-static struct dst_entry *vrf_ip6_check(struct dst_entry *dst, u32 cookie)
-{
-	return dst;
-}
-
-static struct dst_ops vrf_dst_ops6 = {
-	.family		= AF_INET6,
-	.local_out	= ip6_local_out,
-	.check		= vrf_ip6_check,
-	.mtu		= vrf_v4_mtu,
-	.destroy	= vrf_dst_destroy,
-	.default_advmss	= vrf_default_advmss,
-};
-
-static int init_dst_ops6_kmem_cachep(void)
-{
-	vrf_dst_ops6.kmem_cachep = kmem_cache_create("vrf_ip6_dst_cache",
-						     sizeof(struct rt6_info),
-						     0,
-						     SLAB_HWCACHE_ALIGN,
-						     NULL);
-
-	if (!vrf_dst_ops6.kmem_cachep)
-		return -ENOMEM;
-
-	return 0;
-}
-
-static void free_dst_ops6_kmem_cachep(void)
-{
-	kmem_cache_destroy(vrf_dst_ops6.kmem_cachep);
-}
-
-static int vrf_input6(struct sk_buff *skb)
-{
-	skb->dev->stats.rx_errors++;
-	kfree_skb(skb);
-	return 0;
-}
-
 /* modelled after ip6_finish_output2 */
 static int vrf_finish_output6(struct net *net, struct sock *sk,
 			      struct sk_buff *skb)
@@ -429,67 +354,34 @@ static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb)
 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 }
 
-static void vrf_rt6_destroy(struct net_vrf *vrf)
+static void vrf_rt6_release(struct net_vrf *vrf)
 {
-	dst_destroy(&vrf->rt6->dst);
-	free_percpu(vrf->rt6->rt6i_pcpu);
+	dst_release(&vrf->rt6->dst);
 	vrf->rt6 = NULL;
 }
 
 static int vrf_rt6_create(struct net_device *dev)
 {
 	struct net_vrf *vrf = netdev_priv(dev);
-	struct dst_entry *dst;
+	struct net *net = dev_net(dev);
 	struct rt6_info *rt6;
-	int cpu;
 	int rc = -ENOMEM;
 
-	rt6 = dst_alloc(&vrf_dst_ops6, dev, 0,
-			DST_OBSOLETE_NONE,
-			(DST_HOST | DST_NOPOLICY | DST_NOXFRM));
+	rt6 = ip6_dst_alloc(net, dev,
+			    DST_HOST | DST_NOPOLICY | DST_NOXFRM | DST_NOCACHE);
 	if (!rt6)
 		goto out;
 
-	dst = &rt6->dst;
-
-	rt6->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_KERNEL);
-	if (!rt6->rt6i_pcpu) {
-		dst_destroy(dst);
-		goto out;
-	}
-	for_each_possible_cpu(cpu) {
-		struct rt6_info **p = per_cpu_ptr(rt6->rt6i_pcpu, cpu);
-		*p =  NULL;
-	}
-
-	memset(dst + 1, 0, sizeof(*rt6) - sizeof(*dst));
-
-	INIT_LIST_HEAD(&rt6->rt6i_siblings);
-	INIT_LIST_HEAD(&rt6->rt6i_uncached);
-
-	rt6->dst.input	= vrf_input6;
 	rt6->dst.output	= vrf_output6;
-
-	rt6->rt6i_table = fib6_get_table(dev_net(dev), vrf->tb_id);
-
-	atomic_set(&rt6->dst.__refcnt, 2);
-
+	rt6->rt6i_table = fib6_get_table(net, vrf->tb_id);
+	dst_hold(&rt6->dst);
 	vrf->rt6 = rt6;
 	rc = 0;
 out:
 	return rc;
 }
 #else
-static int init_dst_ops6_kmem_cachep(void)
-{
-	return 0;
-}
-
-static void free_dst_ops6_kmem_cachep(void)
-{
-}
-
-static void vrf_rt6_destroy(struct net_vrf *vrf)
+static void vrf_rt6_release(struct net_vrf *vrf)
 {
 }
 
@@ -557,11 +449,11 @@ static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 			    !(IPCB(skb)->flags & IPSKB_REROUTED));
 }
 
-static void vrf_rtable_destroy(struct net_vrf *vrf)
+static void vrf_rtable_release(struct net_vrf *vrf)
 {
 	struct dst_entry *dst = (struct dst_entry *)vrf->rth;
 
-	dst_destroy(dst);
+	dst_release(dst);
 	vrf->rth = NULL;
 }
 
@@ -570,22 +462,10 @@ static struct rtable *vrf_rtable_create(struct net_device *dev)
 	struct net_vrf *vrf = netdev_priv(dev);
 	struct rtable *rth;
 
-	rth = dst_alloc(&vrf_dst_ops, dev, 2,
-			DST_OBSOLETE_NONE,
-			(DST_HOST | DST_NOPOLICY | DST_NOXFRM));
+	rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1, 1, 0);
 	if (rth) {
 		rth->dst.output	= vrf_output;
-		rth->rt_genid	= rt_genid_ipv4(dev_net(dev));
-		rth->rt_flags	= 0;
-		rth->rt_type	= RTN_UNICAST;
-		rth->rt_is_input = 0;
-		rth->rt_iif	= 0;
-		rth->rt_pmtu	= 0;
-		rth->rt_gateway	= 0;
-		rth->rt_uses_gateway = 0;
 		rth->rt_table_id = vrf->tb_id;
-		INIT_LIST_HEAD(&rth->rt_uncached);
-		rth->rt_uncached_list = NULL;
 	}
 
 	return rth;
@@ -673,8 +553,8 @@ static void vrf_dev_uninit(struct net_device *dev)
 	struct net_device *port_dev;
 	struct list_head *iter;
 
-	vrf_rtable_destroy(vrf);
-	vrf_rt6_destroy(vrf);
+	vrf_rtable_release(vrf);
+	vrf_rt6_release(vrf);
 
 	netdev_for_each_lower_dev(dev, port_dev, iter)
 		vrf_del_slave(dev, port_dev);
@@ -704,7 +584,7 @@ static int vrf_dev_init(struct net_device *dev)
 	return 0;
 
 out_rth:
-	vrf_rtable_destroy(vrf);
+	vrf_rtable_release(vrf);
 out_stats:
 	free_percpu(dev->dstats);
 	dev->dstats = NULL;
@@ -737,7 +617,7 @@ static struct rtable *vrf_get_rtable(const struct net_device *dev,
 		struct net_vrf *vrf = netdev_priv(dev);
 
 		rth = vrf->rth;
-		atomic_inc(&rth->dst.__refcnt);
+		dst_hold(&rth->dst);
 	}
 
 	return rth;
@@ -788,7 +668,7 @@ static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev,
 		struct net_vrf *vrf = netdev_priv(dev);
 
 		rt = vrf->rt6;
-		atomic_inc(&rt->dst.__refcnt);
+		dst_hold(&rt->dst);
 	}
 
 	return (struct dst_entry *)rt;
@@ -946,19 +826,6 @@ static int __init vrf_init_module(void)
 {
 	int rc;
 
-	vrf_dst_ops.kmem_cachep =
-		kmem_cache_create("vrf_ip_dst_cache",
-				  sizeof(struct rtable), 0,
-				  SLAB_HWCACHE_ALIGN,
-				  NULL);
-
-	if (!vrf_dst_ops.kmem_cachep)
-		return -ENOMEM;
-
-	rc = init_dst_ops6_kmem_cachep();
-	if (rc != 0)
-		goto error2;
-
 	register_netdevice_notifier(&vrf_notifier_block);
 
 	rc = rtnl_link_register(&vrf_link_ops);
@@ -969,22 +836,10 @@ static int __init vrf_init_module(void)
 
 error:
 	unregister_netdevice_notifier(&vrf_notifier_block);
-	free_dst_ops6_kmem_cachep();
-error2:
-	kmem_cache_destroy(vrf_dst_ops.kmem_cachep);
 	return rc;
 }
 
-static void __exit vrf_cleanup_module(void)
-{
-	rtnl_link_unregister(&vrf_link_ops);
-	unregister_netdevice_notifier(&vrf_notifier_block);
-	kmem_cache_destroy(vrf_dst_ops.kmem_cachep);
-	free_dst_ops6_kmem_cachep();
-}
-
 module_init(vrf_init_module);
-module_exit(vrf_cleanup_module);
 MODULE_AUTHOR("Shrijeet Mukherjee, David Ahern");
 MODULE_DESCRIPTION("Device driver to instantiate VRF domains");
 MODULE_LICENSE("GPL");
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 295d291269e2..54c779416eec 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -101,6 +101,9 @@ void fib6_force_start_gc(struct net *net);
 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
 				    const struct in6_addr *addr, bool anycast);
 
+struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
+			       int flags);
+
 /*
  *	support functions for ND
  *
diff --git a/include/net/route.h b/include/net/route.h
index 9b0a523bb428..6de665bf1750 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -209,6 +209,9 @@ unsigned int inet_addr_type_dev_table(struct net *net,
 void ip_rt_multicast_event(struct in_device *);
 int ip_rt_ioctl(struct net *, unsigned int cmd, void __user *arg);
 void ip_rt_get_source(u8 *src, struct sk_buff *skb, struct rtable *rt);
+struct rtable *rt_dst_alloc(struct net_device *dev,
+			     unsigned int flags, u16 type,
+			     bool nopolicy, bool noxfrm, bool will_cache);
 
 struct in_ifaddr;
 void fib_add_ifaddr(struct in_ifaddr *);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 02c62299d717..2852bdf73540 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1438,9 +1438,9 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 #endif
 }
 
-static struct rtable *rt_dst_alloc(struct net_device *dev,
-				   unsigned int flags, u16 type,
-				   bool nopolicy, bool noxfrm, bool will_cache)
+struct rtable *rt_dst_alloc(struct net_device *dev,
+			    unsigned int flags, u16 type,
+			    bool nopolicy, bool noxfrm, bool will_cache)
 {
 	struct rtable *rt;
 
@@ -1468,6 +1468,7 @@ static struct rtable *rt_dst_alloc(struct net_device *dev,
 
 	return rt;
 }
+EXPORT_SYMBOL(rt_dst_alloc);
 
 /* called in rcu_read_lock() section */
 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index ed446639219c..1d8871a5ed20 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -338,9 +338,9 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net,
 	return rt;
 }
 
-static struct rt6_info *ip6_dst_alloc(struct net *net,
-				      struct net_device *dev,
-				      int flags)
+struct rt6_info *ip6_dst_alloc(struct net *net,
+			       struct net_device *dev,
+			       int flags)
 {
 	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
 
@@ -364,6 +364,7 @@ static struct rt6_info *ip6_dst_alloc(struct net *net,
 
 	return rt;
 }
+EXPORT_SYMBOL(ip6_dst_alloc);
 
 static void ip6_dst_destroy(struct dst_entry *dst)
 {
-- 
cgit v1.2.3


From 4f7f34eaab9f68c9bcd45386b15c414c38b40587 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Thu, 7 Apr 2016 11:10:41 -0700
Subject: net: vrf: Fix dev refcnt leak due to IPv6 prefix route

ifupdown2 found a kernel bug with IPv6 routes and movement from the main
table to the VRF table. Sequence of events:

Create the interface and add addresses:
    ip link add dev eth4.105 link eth4 type vlan id 105
    ip addr add dev eth4.105 8.105.105.10/24
    ip -6 addr add dev eth4.105 2008:105:105::10/64

At this point IPv6 has inserted a prefix route in the main table even
though the interface is 'down'. From there the VRF device is created:
    ip link add dev vrf105 type vrf table 105
    ip addr add dev vrf105 9.9.105.10/32
    ip -6 addr add dev vrf105 2000:9:105::10/128
    ip link set vrf105 up

Then the interface is enslaved, while still in the 'down' state:
    ip link set dev eth4.105 master vrf105

Since the device is down the VRF driver cycling the device does not
send the NETDEV_UP and NETDEV_DOWN but rather the NETDEV_CHANGE event
which does not flush the routes inserted prior.

When the link is brought up
    ip link set dev eth4.105 up

the prefix route is added in the VRF table, but does not remove
the route from the main table.

Fix by handling the NETDEV_CHANGEUPPER event similar what was implemented
for IPv4 in 7f49e7a38b77 ("net: Flush local routes when device changes vrf
association")

Fixes: 35402e3136634 ("net: Add IPv6 support to VRF device")

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 27aed1afcf81..2db2116d3e6b 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3255,6 +3255,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
 			   void *ptr)
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct netdev_notifier_changeupper_info *info;
 	struct inet6_dev *idev = __in6_dev_get(dev);
 	int run_pending = 0;
 	int err;
@@ -3413,6 +3414,15 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
 		if (idev)
 			addrconf_type_change(dev, event);
 		break;
+
+	case NETDEV_CHANGEUPPER:
+		info = ptr;
+
+		/* flush all routes if dev is linked to or unlinked from
+		 * an L3 master device (e.g., VRF)
+		 */
+		if (info->upper_dev && netif_is_l3_master(info->upper_dev))
+			addrconf_ifdown(dev, 0);
 	}
 
 	return NOTIFY_OK;
-- 
cgit v1.2.3


From 8f815cdde3e550e10c2736990d791f60c2ce43eb Mon Sep 17 00:00:00 2001
From: Dmitry Ivanov <dmitrijs.ivanovs@ubnt.com>
Date: Wed, 6 Apr 2016 17:23:18 +0300
Subject: nl80211: check netlink protocol in socket release notification

A non-privileged user can create a netlink socket with the same port_id as
used by an existing open nl80211 netlink socket (e.g. as used by a hostapd
process) with a different protocol number.

Closing this socket will then lead to the notification going to nl80211's
socket release notification handler, and possibly cause an action such as
removing a virtual interface.

Fix this issue by checking that the netlink protocol is NETLINK_GENERIC.
Since generic netlink has no notifier chain of its own, we can't fix the
problem more generically.

Fixes: 026331c4d9b5 ("cfg80211/mac80211: allow registering for and sending action frames")
Cc: stable@vger.kernel.org
Signed-off-by: Dmitry Ivanov <dima@ubnt.com>
[rewrite commit message]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 98c924260b3d..056a7307862b 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -13216,7 +13216,7 @@ static int nl80211_netlink_notify(struct notifier_block * nb,
 	struct wireless_dev *wdev;
 	struct cfg80211_beacon_registration *reg, *tmp;
 
-	if (state != NETLINK_URELEASE)
+	if (state != NETLINK_URELEASE || notify->protocol != NETLINK_GENERIC)
 		return NOTIFY_DONE;
 
 	rcu_read_lock();
-- 
cgit v1.2.3


From bcf4934288402be3464110109a4dae3bd6fb3e93 Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Tue, 12 Apr 2016 01:31:14 +0200
Subject: netfilter: ebtables: Fix extension lookup with identical name

If a requested extension exists as module and is not loaded,
ebt_check_match() might accidentally use an NFPROTO_UNSPEC one with same
name and fail.

Reproduced with limit match: Given xt_limit and ebt_limit both built as
module, the following would fail:

  modprobe xt_limit
  ebtables -I INPUT --limit 1/s -j ACCEPT

The fix is to make ebt_check_match() distrust a found NFPROTO_UNSPEC
extension and retry after requesting an appropriate module.

Cc: Florian Westphal <fw@strlen.de>
Signed-off-by: Phil Sutter <phil@nwl.cc>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/ebtables.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 8570bc7744c2..5a61f35412a0 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -370,7 +370,11 @@ ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par,
 	    left - sizeof(struct ebt_entry_match) < m->match_size)
 		return -EINVAL;
 
-	match = xt_request_find_match(NFPROTO_BRIDGE, m->u.name, 0);
+	match = xt_find_match(NFPROTO_BRIDGE, m->u.name, 0);
+	if (IS_ERR(match) || match->family != NFPROTO_BRIDGE) {
+		request_module("ebt_%s", m->u.name);
+		match = xt_find_match(NFPROTO_BRIDGE, m->u.name, 0);
+	}
 	if (IS_ERR(match))
 		return PTR_ERR(match);
 	m->u.match = match;
-- 
cgit v1.2.3


From 70af921db6f8835f4b11c65731116560adb00c14 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Fri, 8 Apr 2016 12:01:21 -0700
Subject: net: ipv6: Do not keep linklocal and loopback addresses

f1705ec197e7 added the option to retain user configured addresses on an
admin down. A comment to one of the later revisions suggested using the
IFA_F_PERMANENT flag rather than adding a user_managed boolean to the
ifaddr struct. A side effect of this change is that link local and
loopback addresses are also retained which is not part of the objective
of f1705ec197e7. Add check to drop those addresses.

Fixes: f1705ec197e7 ("net: ipv6: Make address flushing on ifdown optional")
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 2db2116d3e6b..23cec53b568a 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3448,6 +3448,12 @@ static void addrconf_type_change(struct net_device *dev, unsigned long event)
 		ipv6_mc_unmap(idev);
 }
 
+static bool addr_is_local(const struct in6_addr *addr)
+{
+	return ipv6_addr_type(addr) &
+		(IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
+}
+
 static int addrconf_ifdown(struct net_device *dev, int how)
 {
 	struct net *net = dev_net(dev);
@@ -3505,7 +3511,8 @@ restart:
 				 * address is retained on a down event
 				 */
 				if (!keep_addr ||
-				    !(ifa->flags & IFA_F_PERMANENT)) {
+				    !(ifa->flags & IFA_F_PERMANENT) ||
+				    addr_is_local(&ifa->addr)) {
 					hlist_del_init_rcu(&ifa->addr_lst);
 					goto restart;
 				}
@@ -3554,7 +3561,8 @@ restart:
 		write_unlock_bh(&idev->lock);
 		spin_lock_bh(&ifa->lock);
 
-		if (keep_addr && (ifa->flags & IFA_F_PERMANENT)) {
+		if (keep_addr && (ifa->flags & IFA_F_PERMANENT) &&
+		    !addr_is_local(&ifa->addr)) {
 			/* set state to skip the notifier below */
 			state = INET6_IFADDR_STATE_DEAD;
 			ifa->state = 0;
-- 
cgit v1.2.3


From d6d5e999e5df67f8ec20b6be45e2229455ee3699 Mon Sep 17 00:00:00 2001
From: Chris Friesen <chris.friesen@windriver.com>
Date: Fri, 8 Apr 2016 15:21:30 -0600
Subject: route: do not cache fib route info on local routes with oif

For local routes that require a particular output interface we do not want
to cache the result.  Caching the result causes incorrect behaviour when
there are multiple source addresses on the interface.  The end result
being that if the intended recipient is waiting on that interface for the
packet he won't receive it because it will be delivered on the loopback
interface and the IP_PKTINFO ipi_ifindex will be set to the loopback
interface as well.

This can be tested by running a program such as "dhcp_release" which
attempts to inject a packet on a particular interface so that it is
received by another program on the same board.  The receiving process
should see an IP_PKTINFO ipi_ifndex value of the source interface
(e.g., eth1) instead of the loopback interface (e.g., lo).  The packet
will still appear on the loopback interface in tcpdump but the important
aspect is that the CMSG info is correct.

Sample dhcp_release command line:

   dhcp_release eth1 192.168.204.222 02:11:33:22:44:66

Signed-off-by: Allain Legacy <allain.legacy@windriver.com>
Signed off-by: Chris Friesen <chris.friesen@windriver.com>
Reviewed-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2852bdf73540..60398a9370e7 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2046,6 +2046,18 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 		 */
 		if (fi && res->prefixlen < 4)
 			fi = NULL;
+	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
+		   (orig_oif != dev_out->ifindex)) {
+		/* For local routes that require a particular output interface
+		 * we do not want to cache the result.  Caching the result
+		 * causes incorrect behaviour when there are multiple source
+		 * addresses on the interface, the end result being that if the
+		 * intended recipient is waiting on that interface for the
+		 * packet he won't receive it because it will be delivered on
+		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
+		 * be set to the loopback interface as well.
+		 */
+		fi = NULL;
 	}
 
 	fnhe = NULL;
-- 
cgit v1.2.3


From 309cf37fe2a781279b7675d4bb7173198e532867 Mon Sep 17 00:00:00 2001
From: Mathias Krause <minipli@googlemail.com>
Date: Sun, 10 Apr 2016 12:52:28 +0200
Subject: packet: fix heap info leak in PACKET_DIAG_MCLIST sock_diag interface

Because we miss to wipe the remainder of i->addr[] in packet_mc_add(),
pdiag_put_mclist() leaks uninitialized heap bytes via the
PACKET_DIAG_MCLIST netlink attribute.

Fix this by explicitly memset(0)ing the remaining bytes in i->addr[].

Fixes: eea68e2f1a00 ("packet: Report socket mclist info via diag module")
Signed-off-by: Mathias Krause <minipli@googlemail.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Acked-by: Pavel Emelyanov <xemul@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index f12c17f355d9..18d0becbc46d 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -3521,6 +3521,7 @@ static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
 	i->ifindex = mreq->mr_ifindex;
 	i->alen = mreq->mr_alen;
 	memcpy(i->addr, mreq->mr_address, i->alen);
+	memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
 	i->count = 1;
 	i->next = po->mclist;
 	po->mclist = i;
-- 
cgit v1.2.3


From 3dcd493fbebfd631913df6e2773cc295d3bf7d22 Mon Sep 17 00:00:00 2001
From: Lars Persson <lars.persson@axis.com>
Date: Tue, 12 Apr 2016 08:45:52 +0200
Subject: net: sched: do not requeue a NULL skb

A failure in validate_xmit_skb_list() triggered an unconditional call
to dev_requeue_skb with skb=NULL. This slowly grows the queue
discipline's qlen count until all traffic through the queue stops.

We take the optimistic approach and continue running the queue after a
failure since it is unknown if later packets also will fail in the
validate path.

Fixes: 55a93b3ea780 ("qdisc: validate skb without holding lock")
Signed-off-by: Lars Persson <larper@axis.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_generic.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index f18c35024207..80742edea96f 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -159,12 +159,15 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 	if (validate)
 		skb = validate_xmit_skb_list(skb, dev);
 
-	if (skb) {
+	if (likely(skb)) {
 		HARD_TX_LOCK(dev, txq, smp_processor_id());
 		if (!netif_xmit_frozen_or_stopped(txq))
 			skb = dev_hard_start_xmit(skb, dev, txq, &ret);
 
 		HARD_TX_UNLOCK(dev, txq);
+	} else {
+		spin_lock(root_lock);
+		return qdisc_qlen(q);
 	}
 	spin_lock(root_lock);
 
-- 
cgit v1.2.3


From 80fbdb208f37740774652ba095a5b2045205ed59 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 11 Apr 2016 15:29:34 -0700
Subject: ipv6: datagram: Refactor flowi6 init codes to a new function

Move flowi6 init codes for connected datagram sk to a newly created
function ip6_datagram_flow_key_init().

Notes:
1. fl6_flowlabel is used instead of fl6.flowlabel in __ip6_datagram_connect
2. ipv6_addr_is_multicast(&fl6->daddr) is used instead of
   (addr_type & IPV6_ADDR_MULTICAST) in ip6_datagram_flow_key_init()

This new function will be reused during pmtu update in the later patch.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Wei Wang <weiwan@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/datagram.c | 50 ++++++++++++++++++++++++++++++--------------------
 1 file changed, 30 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 428162155280..f07c1ddb5cac 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -40,6 +40,30 @@ static bool ipv6_mapped_addr_any(const struct in6_addr *a)
 	return ipv6_addr_v4mapped(a) && (a->s6_addr32[3] == 0);
 }
 
+static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+
+	memset(fl6, 0, sizeof(*fl6));
+	fl6->flowi6_proto = sk->sk_protocol;
+	fl6->daddr = sk->sk_v6_daddr;
+	fl6->saddr = np->saddr;
+	fl6->flowi6_oif = sk->sk_bound_dev_if;
+	fl6->flowi6_mark = sk->sk_mark;
+	fl6->fl6_dport = inet->inet_dport;
+	fl6->fl6_sport = inet->inet_sport;
+	fl6->flowlabel = np->flow_label;
+
+	if (!fl6->flowi6_oif)
+		fl6->flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
+
+	if (!fl6->flowi6_oif && ipv6_addr_is_multicast(&fl6->daddr))
+		fl6->flowi6_oif = np->mcast_oif;
+
+	security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
+}
+
 static int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
 	struct sockaddr_in6	*usin = (struct sockaddr_in6 *) uaddr;
@@ -52,6 +76,7 @@ static int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int a
 	struct ipv6_txoptions	*opt;
 	int			addr_type;
 	int			err;
+	__be32			fl6_flowlabel = 0;
 
 	if (usin->sin6_family == AF_INET) {
 		if (__ipv6_only_sock(sk))
@@ -66,11 +91,10 @@ static int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int a
 	if (usin->sin6_family != AF_INET6)
 		return -EAFNOSUPPORT;
 
-	memset(&fl6, 0, sizeof(fl6));
 	if (np->sndflow) {
-		fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK;
-		if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
-			flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+		fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
+		if (fl6_flowlabel & IPV6_FLOWLABEL_MASK) {
+			flowlabel = fl6_sock_lookup(sk, fl6_flowlabel);
 			if (!flowlabel)
 				return -EINVAL;
 		}
@@ -145,7 +169,7 @@ ipv4_connected:
 	}
 
 	sk->sk_v6_daddr = *daddr;
-	np->flow_label = fl6.flowlabel;
+	np->flow_label = fl6_flowlabel;
 
 	inet->inet_dport = usin->sin6_port;
 
@@ -154,21 +178,7 @@ ipv4_connected:
 	 *	destination cache for it.
 	 */
 
-	fl6.flowi6_proto = sk->sk_protocol;
-	fl6.daddr = sk->sk_v6_daddr;
-	fl6.saddr = np->saddr;
-	fl6.flowi6_oif = sk->sk_bound_dev_if;
-	fl6.flowi6_mark = sk->sk_mark;
-	fl6.fl6_dport = inet->inet_dport;
-	fl6.fl6_sport = inet->inet_sport;
-
-	if (!fl6.flowi6_oif)
-		fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
-
-	if (!fl6.flowi6_oif && (addr_type&IPV6_ADDR_MULTICAST))
-		fl6.flowi6_oif = np->mcast_oif;
-
-	security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+	ip6_datagram_flow_key_init(&fl6, sk);
 
 	rcu_read_lock();
 	opt = flowlabel ? flowlabel->opt : rcu_dereference(np->opt);
-- 
cgit v1.2.3


From 7e2040db1539a904924b997a14ebd3de53172100 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 11 Apr 2016 15:29:35 -0700
Subject: ipv6: datagram: Refactor dst lookup and update codes to a new
 function

This patch moves the route lookup and update codes for connected
datagram sk to a newly created function ip6_datagram_dst_update()

It will be reused during the pmtu update in the later patch.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Wei Wang <weiwan@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/datagram.c | 103 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 57 insertions(+), 46 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index f07c1ddb5cac..669585e11294 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -64,16 +64,65 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk)
 	security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
 }
 
+static int ip6_datagram_dst_update(struct sock *sk)
+{
+	struct ip6_flowlabel *flowlabel = NULL;
+	struct in6_addr *final_p, final;
+	struct ipv6_txoptions *opt;
+	struct dst_entry *dst;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct flowi6 fl6;
+	int err = 0;
+
+	if (np->sndflow && (np->flow_label & IPV6_FLOWLABEL_MASK)) {
+		flowlabel = fl6_sock_lookup(sk, np->flow_label);
+		if (!flowlabel)
+			return -EINVAL;
+	}
+	ip6_datagram_flow_key_init(&fl6, sk);
+
+	rcu_read_lock();
+	opt = flowlabel ? flowlabel->opt : rcu_dereference(np->opt);
+	final_p = fl6_update_dst(&fl6, opt, &final);
+	rcu_read_unlock();
+
+	dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+	if (IS_ERR(dst)) {
+		err = PTR_ERR(dst);
+		goto out;
+	}
+
+	if (ipv6_addr_any(&np->saddr))
+		np->saddr = fl6.saddr;
+
+	if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
+		sk->sk_v6_rcv_saddr = fl6.saddr;
+		inet->inet_rcv_saddr = LOOPBACK4_IPV6;
+		if (sk->sk_prot->rehash)
+			sk->sk_prot->rehash(sk);
+	}
+
+	ip6_dst_store(sk, dst,
+		      ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr) ?
+		      &sk->sk_v6_daddr : NULL,
+#ifdef CONFIG_IPV6_SUBTREES
+		      ipv6_addr_equal(&fl6.saddr, &np->saddr) ?
+		      &np->saddr :
+#endif
+		      NULL);
+
+out:
+	fl6_sock_release(flowlabel);
+	return err;
+}
+
 static int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
 	struct sockaddr_in6	*usin = (struct sockaddr_in6 *) uaddr;
 	struct inet_sock	*inet = inet_sk(sk);
 	struct ipv6_pinfo	*np = inet6_sk(sk);
-	struct in6_addr	*daddr, *final_p, final;
-	struct dst_entry	*dst;
-	struct flowi6		fl6;
-	struct ip6_flowlabel	*flowlabel = NULL;
-	struct ipv6_txoptions	*opt;
+	struct in6_addr		*daddr;
 	int			addr_type;
 	int			err;
 	__be32			fl6_flowlabel = 0;
@@ -91,14 +140,8 @@ static int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int a
 	if (usin->sin6_family != AF_INET6)
 		return -EAFNOSUPPORT;
 
-	if (np->sndflow) {
+	if (np->sndflow)
 		fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
-		if (fl6_flowlabel & IPV6_FLOWLABEL_MASK) {
-			flowlabel = fl6_sock_lookup(sk, fl6_flowlabel);
-			if (!flowlabel)
-				return -EINVAL;
-		}
-	}
 
 	addr_type = ipv6_addr_type(&usin->sin6_addr);
 
@@ -178,45 +221,13 @@ ipv4_connected:
 	 *	destination cache for it.
 	 */
 
-	ip6_datagram_flow_key_init(&fl6, sk);
-
-	rcu_read_lock();
-	opt = flowlabel ? flowlabel->opt : rcu_dereference(np->opt);
-	final_p = fl6_update_dst(&fl6, opt, &final);
-	rcu_read_unlock();
-
-	dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
-	err = 0;
-	if (IS_ERR(dst)) {
-		err = PTR_ERR(dst);
+	err = ip6_datagram_dst_update(sk);
+	if (err)
 		goto out;
-	}
-
-	/* source address lookup done in ip6_dst_lookup */
-
-	if (ipv6_addr_any(&np->saddr))
-		np->saddr = fl6.saddr;
-
-	if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
-		sk->sk_v6_rcv_saddr = fl6.saddr;
-		inet->inet_rcv_saddr = LOOPBACK4_IPV6;
-		if (sk->sk_prot->rehash)
-			sk->sk_prot->rehash(sk);
-	}
-
-	ip6_dst_store(sk, dst,
-		      ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr) ?
-		      &sk->sk_v6_daddr : NULL,
-#ifdef CONFIG_IPV6_SUBTREES
-		      ipv6_addr_equal(&fl6.saddr, &np->saddr) ?
-		      &np->saddr :
-#endif
-		      NULL);
 
 	sk->sk_state = TCP_ESTABLISHED;
 	sk_set_txhash(sk);
 out:
-	fl6_sock_release(flowlabel);
 	return err;
 }
 
-- 
cgit v1.2.3


From 33c162a980fe03498fcecb917f618ad7e7c55e61 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 11 Apr 2016 15:29:36 -0700
Subject: ipv6: datagram: Update dst cache of a connected datagram sk during
 pmtu update

There is a case in connected UDP socket such that
getsockopt(IPV6_MTU) will return a stale MTU value. The reproducible
sequence could be the following:
1. Create a connected UDP socket
2. Send some datagrams out
3. Receive a ICMPV6_PKT_TOOBIG
4. No new outgoing datagrams to trigger the sk_dst_check()
   logic to update the sk->sk_dst_cache.
5. getsockopt(IPV6_MTU) returns the mtu from the invalid
   sk->sk_dst_cache instead of the newly created RTF_CACHE clone.

This patch updates the sk->sk_dst_cache for a connected datagram sk
during pmtu-update code path.

Note that the sk->sk_v6_daddr is used to do the route lookup
instead of skb->data (i.e. iph).  It is because a UDP socket can become
connected after sending out some datagrams in un-connected state.  or
It can be connected multiple times to different destinations.  Hence,
iph may not be related to where sk is currently connected to.

It is done under '!sock_owned_by_user(sk)' condition because
the user may make another ip6_datagram_connect()  (i.e changing
the sk->sk_v6_daddr) while dst lookup is happening in the pmtu-update
code path.

For the sock_owned_by_user(sk) == true case, the next patch will
introduce a release_cb() which will update the sk->sk_dst_cache.

Test:

Server (Connected UDP Socket):
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Route Details:
[root@arch-fb-vm1 ~]# ip -6 r show | egrep '2fac'
2fac::/64 dev eth0  proto kernel  metric 256  pref medium
2fac:face::/64 via 2fac::face dev eth0  metric 1024  pref medium

A simple python code to create a connected UDP socket:

import socket
import errno

HOST = '2fac::1'
PORT = 8080

s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
s.bind((HOST, PORT))
s.connect(('2fac:face::face', 53))
print("connected")
while True:
    try:
	data = s.recv(1024)
    except socket.error as se:
	if se.errno == errno.EMSGSIZE:
		pmtu = s.getsockopt(41, 24)
		print("PMTU:%d" % pmtu)
		break
s.close()

Python program output after getting a ICMPV6_PKT_TOOBIG:
[root@arch-fb-vm1 ~]# python2 ~/devshare/kernel/tasks/fib6/udp-connect-53-8080.py
connected
PMTU:1300

Cache routes after recieving TOOBIG:
[root@arch-fb-vm1 ~]# ip -6 r show table cache
2fac:face::face via 2fac::face dev eth0  metric 0
    cache  expires 463sec mtu 1300 pref medium

Client (Send the ICMPV6_PKT_TOOBIG):
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
scapy is used to generate the TOOBIG message.  Here is the scapy script I have
used:

>>> p=Ether(src='da:75:4d:36:ac:32', dst='52:54:00:12:34:66', type=0x86dd)/IPv6(src='2fac::face', dst='2fac::1')/ICMPv6PacketTooBig(mtu=1300)/IPv6(src='2fac::
1',dst='2fac:face::face', nh='UDP')/UDP(sport=8080,dport=53)
>>> sendp(p, iface='qemubr0')

Fixes: 45e4fd26683c ("ipv6: Only create RTF_CACHE routes after encountering pmtu exception")
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Reported-by: Wei Wang <weiwan@google.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Wei Wang <weiwan@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h  |  1 +
 net/ipv6/datagram.c | 20 +++++++++++---------
 net/ipv6/route.c    | 12 ++++++++++++
 3 files changed, 24 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index d0aeb97aec5d..fd02e90b1289 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -959,6 +959,7 @@ int compat_ipv6_getsockopt(struct sock *sk, int level, int optname,
 int ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len);
 int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *addr,
 				 int addr_len);
+int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr);
 
 int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len,
 		    int *addr_len);
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 669585e11294..59e01f27db9f 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -64,7 +64,7 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk)
 	security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
 }
 
-static int ip6_datagram_dst_update(struct sock *sk)
+int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr)
 {
 	struct ip6_flowlabel *flowlabel = NULL;
 	struct in6_addr *final_p, final;
@@ -93,14 +93,16 @@ static int ip6_datagram_dst_update(struct sock *sk)
 		goto out;
 	}
 
-	if (ipv6_addr_any(&np->saddr))
-		np->saddr = fl6.saddr;
+	if (fix_sk_saddr) {
+		if (ipv6_addr_any(&np->saddr))
+			np->saddr = fl6.saddr;
 
-	if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
-		sk->sk_v6_rcv_saddr = fl6.saddr;
-		inet->inet_rcv_saddr = LOOPBACK4_IPV6;
-		if (sk->sk_prot->rehash)
-			sk->sk_prot->rehash(sk);
+		if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
+			sk->sk_v6_rcv_saddr = fl6.saddr;
+			inet->inet_rcv_saddr = LOOPBACK4_IPV6;
+			if (sk->sk_prot->rehash)
+				sk->sk_prot->rehash(sk);
+		}
 	}
 
 	ip6_dst_store(sk, dst,
@@ -221,7 +223,7 @@ ipv4_connected:
 	 *	destination cache for it.
 	 */
 
-	err = ip6_datagram_dst_update(sk);
+	err = ip6_datagram_dst_update(sk, true);
 	if (err)
 		goto out;
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1d8871a5ed20..d916d6ab9ad2 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1418,8 +1418,20 @@ EXPORT_SYMBOL_GPL(ip6_update_pmtu);
 
 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
 {
+	struct dst_entry *dst;
+
 	ip6_update_pmtu(skb, sock_net(sk), mtu,
 			sk->sk_bound_dev_if, sk->sk_mark);
+
+	dst = __sk_dst_get(sk);
+	if (!dst || !dst->obsolete ||
+	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
+		return;
+
+	bh_lock_sock(sk);
+	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
+		ip6_datagram_dst_update(sk, false);
+	bh_unlock_sock(sk);
 }
 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
 
-- 
cgit v1.2.3


From e646b657f6983017783914a951039e323120dc55 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 11 Apr 2016 15:29:37 -0700
Subject: ipv6: udp: Do a route lookup and update during release_cb

This patch adds a release_cb for UDPv6.  It does a route lookup
and updates sk->sk_dst_cache if it is needed.  It picks up the
left-over job from ip6_sk_update_pmtu() if the sk was owned
by user during the pmtu update.

It takes a rcu_read_lock to protect the __sk_dst_get() operations
because another thread may do ip6_dst_store() without taking the
sk lock (e.g. sendmsg).

Fixes: 45e4fd26683c ("ipv6: Only create RTF_CACHE routes after encountering pmtu exception")
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Reported-by: Wei Wang <weiwan@google.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Wei Wang <weiwan@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h  |  1 +
 net/ipv6/datagram.c | 20 ++++++++++++++++++++
 net/ipv6/udp.c      |  1 +
 3 files changed, 22 insertions(+)

(limited to 'net')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index fd02e90b1289..1be050ada8c5 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -960,6 +960,7 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len);
 int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *addr,
 				 int addr_len);
 int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr);
+void ip6_datagram_release_cb(struct sock *sk);
 
 int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len,
 		    int *addr_len);
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 59e01f27db9f..9dd3882fe6bf 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -119,6 +119,26 @@ out:
 	return err;
 }
 
+void ip6_datagram_release_cb(struct sock *sk)
+{
+	struct dst_entry *dst;
+
+	if (ipv6_addr_v4mapped(&sk->sk_v6_daddr))
+		return;
+
+	rcu_read_lock();
+	dst = __sk_dst_get(sk);
+	if (!dst || !dst->obsolete ||
+	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) {
+		rcu_read_unlock();
+		return;
+	}
+	rcu_read_unlock();
+
+	ip6_datagram_dst_update(sk, false);
+}
+EXPORT_SYMBOL_GPL(ip6_datagram_release_cb);
+
 static int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
 	struct sockaddr_in6	*usin = (struct sockaddr_in6 *) uaddr;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 8125931106be..6bc5c664fa46 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1539,6 +1539,7 @@ struct proto udpv6_prot = {
 	.sendmsg	   = udpv6_sendmsg,
 	.recvmsg	   = udpv6_recvmsg,
 	.backlog_rcv	   = __udpv6_queue_rcv_skb,
+	.release_cb	   = ip6_datagram_release_cb,
 	.hash		   = udp_lib_hash,
 	.unhash		   = udp_lib_unhash,
 	.rehash		   = udp_v6_rehash,
-- 
cgit v1.2.3


From d894ba18d4e449b3a7f6eb491f16c9e02933736e Mon Sep 17 00:00:00 2001
From: Craig Gallek <kraig@google.com>
Date: Tue, 12 Apr 2016 13:11:25 -0400
Subject: soreuseport: fix ordering for mixed v4/v6 sockets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With the SO_REUSEPORT socket option, it is possible to create sockets
in the AF_INET and AF_INET6 domains which are bound to the same IPv4 address.
This is only possible with SO_REUSEPORT and when not using IPV6_V6ONLY on
the AF_INET6 sockets.

Prior to the commits referenced below, an incoming IPv4 packet would
always be routed to a socket of type AF_INET when this mixed-mode was used.
After those changes, the same packet would be routed to the most recently
bound socket (if this happened to be an AF_INET6 socket, it would
have an IPv4 mapped IPv6 address).

The change in behavior occurred because the recent SO_REUSEPORT optimizations
short-circuit the socket scoring logic as soon as they find a match.  They
did not take into account the scoring logic that favors AF_INET sockets
over AF_INET6 sockets in the event of a tie.

To fix this problem, this patch changes the insertion order of AF_INET
and AF_INET6 addresses in the TCP and UDP socket lists when the sockets
have SO_REUSEPORT set.  AF_INET sockets will be inserted at the head of the
list and AF_INET6 sockets with SO_REUSEPORT set will always be inserted at
the tail of the list.  This will force AF_INET sockets to always be
considered first.

Fixes: e32ea7e74727 ("soreuseport: fast reuseport UDP socket selection")
Fixes: 125e80b88687 ("soreuseport: fast reuseport TCP socket selection")

Reported-by: Maciej Żenczykowski <maze@google.com>
Signed-off-by: Craig Gallek <kraig@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rculist_nulls.h | 39 +++++++++++++++++++++++++++++++++++++++
 include/net/sock.h            |  6 +++++-
 net/ipv4/udp.c                |  9 +++++++--
 3 files changed, 51 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
index 1c33dd7da4a7..4ae95f7e8597 100644
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h
@@ -98,6 +98,45 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
 	if (!is_a_nulls(first))
 		first->pprev = &n->next;
 }
+
+/**
+ * hlist_nulls_add_tail_rcu
+ * @n: the element to add to the hash list.
+ * @h: the list to add to.
+ *
+ * Description:
+ * Adds the specified element to the end of the specified hlist_nulls,
+ * while permitting racing traversals.  NOTE: tail insertion requires
+ * list traversal.
+ *
+ * The caller must take whatever precautions are necessary
+ * (such as holding appropriate locks) to avoid racing
+ * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
+ * or hlist_nulls_del_rcu(), running on this same list.
+ * However, it is perfectly legal to run concurrently with
+ * the _rcu list-traversal primitives, such as
+ * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency
+ * problems on Alpha CPUs.  Regardless of the type of CPU, the
+ * list-traversal primitive must be guarded by rcu_read_lock().
+ */
+static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n,
+					struct hlist_nulls_head *h)
+{
+	struct hlist_nulls_node *i, *last = NULL;
+
+	for (i = hlist_nulls_first_rcu(h); !is_a_nulls(i);
+	     i = hlist_nulls_next_rcu(i))
+		last = i;
+
+	if (last) {
+		n->next = last->next;
+		n->pprev = &last->next;
+		rcu_assign_pointer(hlist_nulls_next_rcu(last), n);
+	} else {
+		hlist_nulls_add_head_rcu(n, h);
+	}
+}
+
 /**
  * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type
  * @tpos:	the type * to use as a loop cursor.
diff --git a/include/net/sock.h b/include/net/sock.h
index 255d3e03727b..121ffc115c4f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -630,7 +630,11 @@ static inline void sk_add_node_rcu(struct sock *sk, struct hlist_head *list)
 
 static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
 {
-	hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list);
+	if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
+	    sk->sk_family == AF_INET6)
+		hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list);
+	else
+		hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list);
 }
 
 static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 08eed5e16df0..a2e7f55a1f61 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -339,8 +339,13 @@ found:
 
 		hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
 		spin_lock(&hslot2->lock);
-		hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
-					 &hslot2->head);
+		if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
+			sk->sk_family == AF_INET6)
+			hlist_nulls_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node,
+						 &hslot2->head);
+		else
+			hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+						 &hslot2->head);
 		hslot2->count++;
 		spin_unlock(&hslot2->lock);
 	}
-- 
cgit v1.2.3


From 9241e2df4fbc648a92ea0752918e05c26255649e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 16 Apr 2016 02:27:58 +0200
Subject: vlan: pull on __vlan_insert_tag error path and fix csum correction

When __vlan_insert_tag() fails from skb_vlan_push() path due to the
skb_cow_head(), we need to undo the __skb_push() in the error path
as well that was done earlier to move skb->data pointer to mac header.

Moreover, I noticed that when in the non-error path the __skb_pull()
is done and the original offset to mac header was non-zero, we fixup
from a wrong skb->data offset in the checksum complete processing.

So the skb_postpush_rcsum() really needs to be done before __skb_pull()
where skb->data still points to the mac header start and thus operates
under the same conditions as in __vlan_insert_tag().

Fixes: 93515d53b133 ("net: move vlan pop/push functions into common code")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d04c2d1c8c87..e561f9f07d6d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4502,13 +4502,16 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
 		__skb_push(skb, offset);
 		err = __vlan_insert_tag(skb, skb->vlan_proto,
 					skb_vlan_tag_get(skb));
-		if (err)
+		if (err) {
+			__skb_pull(skb, offset);
 			return err;
+		}
+
 		skb->protocol = skb->vlan_proto;
 		skb->mac_len += VLAN_HLEN;
-		__skb_pull(skb, offset);
 
 		skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
+		__skb_pull(skb, offset);
 	}
 	__vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
 	return 0;
-- 
cgit v1.2.3


From a7c556546f610a331c22cb7edd9d1afe63f0cd52 Mon Sep 17 00:00:00 2001
From: Qing Huang <qing.huang@oracle.com>
Date: Thu, 14 Apr 2016 10:43:26 -0700
Subject: RDS: fix endianness for dp_ack_seq

dp->dp_ack_seq is used in big endian format. We need to do the
big endianness conversion when we assign a value in host format
to it.

Signed-off-by: Qing Huang <qing.huang@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/ib_cm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 8764970f0c24..310cabce2311 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -194,7 +194,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
 		dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
 		dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
 		dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
-		dp->dp_ack_seq = rds_ib_piggyb_ack(ic);
+		dp->dp_ack_seq = cpu_to_be64(rds_ib_piggyb_ack(ic));
 
 		/* Advertise flow control */
 		if (ic->i_flowctl) {
-- 
cgit v1.2.3


From e47db94e10447fc467777a40302f2b393e9af2fa Mon Sep 17 00:00:00 2001
From: "santosh.shilimkar@oracle.com" <santosh.shilimkar@oracle.com>
Date: Thu, 14 Apr 2016 10:43:27 -0700
Subject: RDS: Fix the atomicity for congestion map update

Two different threads with different rds sockets may be in
rds_recv_rcvbuf_delta() via receive path. If their ports
both map to the same word in the congestion map, then
using non-atomic ops to update it could cause the map to
be incorrect. Lets use atomics to avoid such an issue.

Full credit to Wengang <wen.gang.wang@oracle.com> for
finding the issue, analysing it and also pointing out
to offending code with spin lock based fix.

Reviewed-by: Leon Romanovsky <leon@leon.nu>
Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/cong.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rds/cong.c b/net/rds/cong.c
index e6144b8246fd..6641bcf7c185 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -299,7 +299,7 @@ void rds_cong_set_bit(struct rds_cong_map *map, __be16 port)
 	i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
 	off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
 
-	__set_bit_le(off, (void *)map->m_page_addrs[i]);
+	set_bit_le(off, (void *)map->m_page_addrs[i]);
 }
 
 void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port)
@@ -313,7 +313,7 @@ void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port)
 	i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
 	off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
 
-	__clear_bit_le(off, (void *)map->m_page_addrs[i]);
+	clear_bit_le(off, (void *)map->m_page_addrs[i]);
 }
 
 static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port)
-- 
cgit v1.2.3


From 9c995cc9a206a008699da82f6cd01e9b2615649a Mon Sep 17 00:00:00 2001
From: Jorgen Hansen <jhansen@vmware.com>
Date: Mon, 18 Apr 2016 23:58:52 -0700
Subject: VSOCK: Only check error on skb_recv_datagram when skb is NULL

If skb_recv_datagram returns an skb, we should ignore the err
value returned. Otherwise, datagram receives will return EAGAIN
when they have to wait for a datagram.

Acked-by: Adit Ranadive <aditr@vmware.com>
Signed-off-by: Jorgen Hansen <jhansen@vmware.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/vmw_vsock/vmci_transport.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index 662bdd20a748..56214736fe88 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -1735,11 +1735,8 @@ static int vmci_transport_dgram_dequeue(struct vsock_sock *vsk,
 	/* Retrieve the head sk_buff from the socket's receive queue. */
 	err = 0;
 	skb = skb_recv_datagram(&vsk->sk, flags, noblock, &err);
-	if (err)
-		return err;
-
 	if (!skb)
-		return -EAGAIN;
+		return err;
 
 	dg = (struct vmci_datagram *)skb->data;
 	if (!dg)
@@ -2154,7 +2151,7 @@ module_exit(vmci_transport_exit);
 
 MODULE_AUTHOR("VMware, Inc.");
 MODULE_DESCRIPTION("VMCI transport for Virtual Sockets");
-MODULE_VERSION("1.0.3.0-k");
+MODULE_VERSION("1.0.4.0-k");
 MODULE_LICENSE("GPL v2");
 MODULE_ALIAS("vmware_vsock");
 MODULE_ALIAS_NETPROTO(PF_VSOCK);
-- 
cgit v1.2.3


From 49e261a8a21e0960a3f7ff187a453ba1c1149053 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@ovn.org>
Date: Mon, 18 Apr 2016 14:51:47 -0700
Subject: openvswitch: Orphan skbs before IPv6 defrag

This is the IPv6 counterpart to commit 8282f27449bf ("inet: frag: Always
orphan skbs inside ip_defrag()").

Prior to commit 029f7f3b8701 ("netfilter: ipv6: nf_defrag: avoid/free
clone operations"), ipv6 fragments sent to nf_ct_frag6_gather() would be
cloned (implicitly orphaning) prior to queueing for reassembly. As such,
when the IPv6 message is eventually reassembled, the skb->sk for all
fragments would be NULL. After that commit was introduced, rather than
cloning, the original skbs were queued directly without orphaning. The
end result is that all frags except for the first and last may have a
socket attached.

This commit explicitly orphans such skbs during nf_ct_frag6_gather() to
prevent BUG_ON(skb->sk) during a later call to ip6_fragment().

kernel BUG at net/ipv6/ip6_output.c:631!
[...]
Call Trace:
 <IRQ>
 [<ffffffff810be8f7>] ? __lock_acquire+0x927/0x20a0
 [<ffffffffa042c7c0>] ? do_output.isra.28+0x1b0/0x1b0 [openvswitch]
 [<ffffffff810bb8a2>] ? __lock_is_held+0x52/0x70
 [<ffffffffa042c587>] ovs_fragment+0x1f7/0x280 [openvswitch]
 [<ffffffff810bdab5>] ? mark_held_locks+0x75/0xa0
 [<ffffffff817be416>] ? _raw_spin_unlock_irqrestore+0x36/0x50
 [<ffffffff81697ea0>] ? dst_discard_out+0x20/0x20
 [<ffffffff81697e80>] ? dst_ifdown+0x80/0x80
 [<ffffffffa042c703>] do_output.isra.28+0xf3/0x1b0 [openvswitch]
 [<ffffffffa042d279>] do_execute_actions+0x709/0x12c0 [openvswitch]
 [<ffffffffa04340a4>] ? ovs_flow_stats_update+0x74/0x1e0 [openvswitch]
 [<ffffffffa04340d1>] ? ovs_flow_stats_update+0xa1/0x1e0 [openvswitch]
 [<ffffffff817be387>] ? _raw_spin_unlock+0x27/0x40
 [<ffffffffa042de75>] ovs_execute_actions+0x45/0x120 [openvswitch]
 [<ffffffffa0432d65>] ovs_dp_process_packet+0x85/0x150 [openvswitch]
 [<ffffffff817be387>] ? _raw_spin_unlock+0x27/0x40
 [<ffffffffa042def4>] ovs_execute_actions+0xc4/0x120 [openvswitch]
 [<ffffffffa0432d65>] ovs_dp_process_packet+0x85/0x150 [openvswitch]
 [<ffffffffa04337f2>] ? key_extract+0x442/0xc10 [openvswitch]
 [<ffffffffa043b26d>] ovs_vport_receive+0x5d/0xb0 [openvswitch]
 [<ffffffff810be8f7>] ? __lock_acquire+0x927/0x20a0
 [<ffffffff810be8f7>] ? __lock_acquire+0x927/0x20a0
 [<ffffffff810be8f7>] ? __lock_acquire+0x927/0x20a0
 [<ffffffff817be416>] ? _raw_spin_unlock_irqrestore+0x36/0x50
 [<ffffffffa043c11d>] internal_dev_xmit+0x6d/0x150 [openvswitch]
 [<ffffffffa043c0b5>] ? internal_dev_xmit+0x5/0x150 [openvswitch]
 [<ffffffff8168fb5f>] dev_hard_start_xmit+0x2df/0x660
 [<ffffffff8168f5ea>] ? validate_xmit_skb.isra.105.part.106+0x1a/0x2b0
 [<ffffffff81690925>] __dev_queue_xmit+0x8f5/0x950
 [<ffffffff81690080>] ? __dev_queue_xmit+0x50/0x950
 [<ffffffff810bdab5>] ? mark_held_locks+0x75/0xa0
 [<ffffffff81690990>] dev_queue_xmit+0x10/0x20
 [<ffffffff8169a418>] neigh_resolve_output+0x178/0x220
 [<ffffffff81752759>] ? ip6_finish_output2+0x219/0x7b0
 [<ffffffff81752759>] ip6_finish_output2+0x219/0x7b0
 [<ffffffff817525a5>] ? ip6_finish_output2+0x65/0x7b0
 [<ffffffff816cde2b>] ? ip_idents_reserve+0x6b/0x80
 [<ffffffff8175488f>] ? ip6_fragment+0x93f/0xc50
 [<ffffffff81754af1>] ip6_fragment+0xba1/0xc50
 [<ffffffff81752540>] ? ip6_flush_pending_frames+0x40/0x40
 [<ffffffff81754c6b>] ip6_finish_output+0xcb/0x1d0
 [<ffffffff81754dcf>] ip6_output+0x5f/0x1a0
 [<ffffffff81754ba0>] ? ip6_fragment+0xc50/0xc50
 [<ffffffff81797fbd>] ip6_local_out+0x3d/0x80
 [<ffffffff817554df>] ip6_send_skb+0x2f/0xc0
 [<ffffffff817555bd>] ip6_push_pending_frames+0x4d/0x50
 [<ffffffff817796cc>] icmpv6_push_pending_frames+0xac/0xe0
 [<ffffffff8177a4be>] icmpv6_echo_reply+0x42e/0x500
 [<ffffffff8177acbf>] icmpv6_rcv+0x4cf/0x580
 [<ffffffff81755ac7>] ip6_input_finish+0x1a7/0x690
 [<ffffffff81755925>] ? ip6_input_finish+0x5/0x690
 [<ffffffff817567a0>] ip6_input+0x30/0xa0
 [<ffffffff81755920>] ? ip6_rcv_finish+0x1a0/0x1a0
 [<ffffffff817557ce>] ip6_rcv_finish+0x4e/0x1a0
 [<ffffffff8175640f>] ipv6_rcv+0x45f/0x7c0
 [<ffffffff81755fe6>] ? ipv6_rcv+0x36/0x7c0
 [<ffffffff81755780>] ? ip6_make_skb+0x1c0/0x1c0
 [<ffffffff8168b649>] __netif_receive_skb_core+0x229/0xb80
 [<ffffffff810bdab5>] ? mark_held_locks+0x75/0xa0
 [<ffffffff8168c07f>] ? process_backlog+0x6f/0x230
 [<ffffffff8168bfb6>] __netif_receive_skb+0x16/0x70
 [<ffffffff8168c088>] process_backlog+0x78/0x230
 [<ffffffff8168c0ed>] ? process_backlog+0xdd/0x230
 [<ffffffff8168db43>] net_rx_action+0x203/0x480
 [<ffffffff810bdab5>] ? mark_held_locks+0x75/0xa0
 [<ffffffff817c156e>] __do_softirq+0xde/0x49f
 [<ffffffff81752768>] ? ip6_finish_output2+0x228/0x7b0
 [<ffffffff817c070c>] do_softirq_own_stack+0x1c/0x30
 <EOI>
 [<ffffffff8106f88b>] do_softirq.part.18+0x3b/0x40
 [<ffffffff8106f946>] __local_bh_enable_ip+0xb6/0xc0
 [<ffffffff81752791>] ip6_finish_output2+0x251/0x7b0
 [<ffffffff81754af1>] ? ip6_fragment+0xba1/0xc50
 [<ffffffff816cde2b>] ? ip_idents_reserve+0x6b/0x80
 [<ffffffff8175488f>] ? ip6_fragment+0x93f/0xc50
 [<ffffffff81754af1>] ip6_fragment+0xba1/0xc50
 [<ffffffff81752540>] ? ip6_flush_pending_frames+0x40/0x40
 [<ffffffff81754c6b>] ip6_finish_output+0xcb/0x1d0
 [<ffffffff81754dcf>] ip6_output+0x5f/0x1a0
 [<ffffffff81754ba0>] ? ip6_fragment+0xc50/0xc50
 [<ffffffff81797fbd>] ip6_local_out+0x3d/0x80
 [<ffffffff817554df>] ip6_send_skb+0x2f/0xc0
 [<ffffffff817555bd>] ip6_push_pending_frames+0x4d/0x50
 [<ffffffff81778558>] rawv6_sendmsg+0xa28/0xe30
 [<ffffffff81719097>] ? inet_sendmsg+0xc7/0x1d0
 [<ffffffff817190d6>] inet_sendmsg+0x106/0x1d0
 [<ffffffff81718fd5>] ? inet_sendmsg+0x5/0x1d0
 [<ffffffff8166d078>] sock_sendmsg+0x38/0x50
 [<ffffffff8166d4d6>] SYSC_sendto+0xf6/0x170
 [<ffffffff8100201b>] ? trace_hardirqs_on_thunk+0x1b/0x1d
 [<ffffffff8166e38e>] SyS_sendto+0xe/0x10
 [<ffffffff817bebe5>] entry_SYSCALL_64_fastpath+0x18/0xa8
Code: 06 48 83 3f 00 75 26 48 8b 87 d8 00 00 00 2b 87 d0 00 00 00 48 39 d0 72 14 8b 87 e4 00 00 00 83 f8 01 75 09 48 83 7f 18 00 74 9a <0f> 0b 41 8b 86 cc 00 00 00 49 8#
RIP  [<ffffffff8175468a>] ip6_fragment+0x73a/0xc50
 RSP <ffff880072803120>

Fixes: 029f7f3b8701 ("netfilter: ipv6: nf_defrag: avoid/free clone
operations")
Reported-by: Daniele Di Proietto <diproiettod@vmware.com>
Signed-off-by: Joe Stringer <joe@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/conntrack.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 1b9d286756be..b5fea1101faa 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -367,6 +367,7 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key,
 	} else if (key->eth.type == htons(ETH_P_IPV6)) {
 		enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
 
+		skb_orphan(skb);
 		memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
 		err = nf_ct_frag6_gather(net, skb, user);
 		if (err)
-- 
cgit v1.2.3


From 479f85c36688f5c855ad463b71902ef5992628b7 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 18 Apr 2016 15:39:53 -0700
Subject: tcp: Fix SOF_TIMESTAMPING_TX_ACK when handling dup acks

Assuming SOF_TIMESTAMPING_TX_ACK is on. When dup acks are received,
it could incorrectly think that a skb has already
been acked and queue a SCM_TSTAMP_ACK cmsg to the
sk->sk_error_queue.

In tcp_ack_tstamp(), it checks
'between(shinfo->tskey, prior_snd_una, tcp_sk(sk)->snd_una - 1)'.
If prior_snd_una == tcp_sk(sk)->snd_una like the following packetdrill
script, between() returns true but the tskey is actually not acked.
e.g. try between(3, 2, 1).

The fix is to replace between() with one before() and one !before().
By doing this, the -1 offset on the tcp_sk(sk)->snd_una can also be
removed.

A packetdrill script is used to reproduce the dup ack scenario.
Due to the lacking cmsg support in packetdrill (may be I
cannot find it),  a BPF prog is used to kprobe to
sock_queue_err_skb() and print out the value of
serr->ee.ee_data.

Both the packetdrill and the bcc BPF script is attached at the end of
this commit message.

BPF Output Before Fix:
~~~~~~
      <...>-2056  [001] d.s.   433.927987: : ee_data:1459  #incorrect
packetdrill-2056  [001] d.s.   433.929563: : ee_data:1459  #incorrect
packetdrill-2056  [001] d.s.   433.930765: : ee_data:1459  #incorrect
packetdrill-2056  [001] d.s.   434.028177: : ee_data:1459
packetdrill-2056  [001] d.s.   434.029686: : ee_data:14599

BPF Output After Fix:
~~~~~~
      <...>-2049  [000] d.s.   113.517039: : ee_data:1459
      <...>-2049  [000] d.s.   113.517253: : ee_data:14599

BCC BPF Script:
~~~~~~
#!/usr/bin/env python

from __future__ import print_function
from bcc import BPF

bpf_text = """
#include <uapi/linux/ptrace.h>
#include <net/sock.h>
#include <bcc/proto.h>
#include <linux/errqueue.h>

#ifdef memset
#undef memset
#endif

int trace_err_skb(struct pt_regs *ctx)
{
	struct sk_buff *skb = (struct sk_buff *)ctx->si;
	struct sock *sk = (struct sock *)ctx->di;
	struct sock_exterr_skb *serr;
	u32 ee_data = 0;

	if (!sk || !skb)
		return 0;

	serr = SKB_EXT_ERR(skb);
	bpf_probe_read(&ee_data, sizeof(ee_data), &serr->ee.ee_data);
	bpf_trace_printk("ee_data:%u\\n", ee_data);

	return 0;
};
"""

b = BPF(text=bpf_text)
b.attach_kprobe(event="sock_queue_err_skb", fn_name="trace_err_skb")
print("Attached to kprobe")
b.trace_print()

Packetdrill Script:
~~~~~~
+0 `sysctl -q -w net.ipv4.tcp_min_tso_segs=10`
+0 `sysctl -q -w net.ipv4.tcp_no_metrics_save=1`
+0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0 bind(3, ..., ...) = 0
+0 listen(3, 1) = 0

0.100 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
0.100 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 7>
0.200 < . 1:1(0) ack 1 win 257
0.200 accept(3, ..., ...) = 4
+0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0

+0 setsockopt(4, SOL_SOCKET, 37, [2688], 4) = 0
0.200 write(4, ..., 1460) = 1460
0.200 write(4, ..., 13140) = 13140

0.200 > P. 1:1461(1460) ack 1
0.200 > . 1461:8761(7300) ack 1
0.200 > P. 8761:14601(5840) ack 1

0.300 < . 1:1(0) ack 1 win 257 <sack 1461:2921,nop,nop>
0.300 < . 1:1(0) ack 1 win 257 <sack 1461:4381,nop,nop>
0.300 < . 1:1(0) ack 1 win 257 <sack 1461:5841,nop,nop>
0.300 > P. 1:1461(1460) ack 1
0.400 < . 1:1(0) ack 14601 win 257

0.400 close(4) = 0
0.400 > F. 14601:14601(0) ack 1
0.500 < F. 1:1(0) ack 14602 win 257
0.500 > . 14602:14602(0) ack 2

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Soheil Hassas Yeganeh <soheil.kdev@gmail.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Tested-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e6e65f79ade8..0edb07185c25 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3098,7 +3098,8 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
 
 	shinfo = skb_shinfo(skb);
 	if ((shinfo->tx_flags & SKBTX_ACK_TSTAMP) &&
-	    between(shinfo->tskey, prior_snd_una, tcp_sk(sk)->snd_una - 1))
+	    !before(shinfo->tskey, prior_snd_una) &&
+	    before(shinfo->tskey, tcp_sk(sk)->snd_una))
 		__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
 }
 
-- 
cgit v1.2.3


From 082ac2d51d9f19ec1c29bdaaaf7fb49889e4fade Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Tue, 19 Apr 2016 22:39:28 -0700
Subject: tcp: Merge tx_flags and tskey in tcp_collapse_retrans

If two skbs are merged/collapsed during retransmission, the current
logic does not merge the tx_flags and tskey.  The end result is
the SCM_TSTAMP_ACK timestamp could be missing for a packet.

The patch:
1. Merge the tx_flags
2. Overwrite the prev_skb's tskey with the next_skb's tskey

BPF Output Before:
~~~~~~
<no-output-due-to-missing-tstamp-event>

BPF Output After:
~~~~~~
packetdrill-2092  [001] d.s.   453.998486: : ee_data:1459

Packetdrill Script:
~~~~~~
+0 `sysctl -q -w net.ipv4.tcp_min_tso_segs=10`
+0 `sysctl -q -w net.ipv4.tcp_no_metrics_save=1`
+0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0 bind(3, ..., ...) = 0
+0 listen(3, 1) = 0

0.100 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
0.100 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 7>
0.200 < . 1:1(0) ack 1 win 257
0.200 accept(3, ..., ...) = 4
+0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0

0.200 write(4, ..., 730) = 730
+0 setsockopt(4, SOL_SOCKET, 37, [2688], 4) = 0
0.200 write(4, ..., 730) = 730
+0 setsockopt(4, SOL_SOCKET, 37, [2176], 4) = 0
0.200 write(4, ..., 11680) = 11680
+0 setsockopt(4, SOL_SOCKET, 37, [2688], 4) = 0

0.200 > P. 1:731(730) ack 1
0.200 > P. 731:1461(730) ack 1
0.200 > . 1461:8761(7300) ack 1
0.200 > P. 8761:13141(4380) ack 1

0.300 < . 1:1(0) ack 1 win 257 <sack 1461:2921,nop,nop>
0.300 < . 1:1(0) ack 1 win 257 <sack 1461:4381,nop,nop>
0.300 < . 1:1(0) ack 1 win 257 <sack 1461:5841,nop,nop>
0.300 > P. 1:1461(1460) ack 1
0.400 < . 1:1(0) ack 13141 win 257

0.400 close(4) = 0
0.400 > F. 13141:13141(0) ack 1
0.500 < F. 1:1(0) ack 13142 win 257
0.500 > . 13142:13142(0) ack 2

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Tested-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7d2dc015cd19..5bc3c3062672 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2441,6 +2441,20 @@ u32 __tcp_select_window(struct sock *sk)
 	return window;
 }
 
+static void tcp_skb_collapse_tstamp(struct sk_buff *skb,
+				    const struct sk_buff *next_skb)
+{
+	const struct skb_shared_info *next_shinfo = skb_shinfo(next_skb);
+	u8 tsflags = next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
+
+	if (unlikely(tsflags)) {
+		struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+		shinfo->tx_flags |= tsflags;
+		shinfo->tskey = next_shinfo->tskey;
+	}
+}
+
 /* Collapses two adjacent SKB's during retransmission. */
 static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 {
@@ -2484,6 +2498,8 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 
 	tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));
 
+	tcp_skb_collapse_tstamp(skb, next_skb);
+
 	sk_wmem_free_skb(sk, next_skb);
 }
 
-- 
cgit v1.2.3


From cfea5a688eb37bcd1081255df9f9f777f4e61999 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Tue, 19 Apr 2016 22:39:29 -0700
Subject: tcp: Merge tx_flags and tskey in tcp_shifted_skb

After receiving sacks, tcp_shifted_skb() will collapse
skbs if possible.  tx_flags and tskey also have to be
merged.

This patch reuses the tcp_skb_collapse_tstamp() to handle
them.

BPF Output Before:
~~~~~
<no-output-due-to-missing-tstamp-event>

BPF Output After:
~~~~~
<...>-2024  [007] d.s.    88.644374: : ee_data:14599

Packetdrill Script:
~~~~~
+0 `sysctl -q -w net.ipv4.tcp_min_tso_segs=10`
+0 `sysctl -q -w net.ipv4.tcp_no_metrics_save=1`
+0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0 bind(3, ..., ...) = 0
+0 listen(3, 1) = 0

0.100 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
0.100 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 7>
0.200 < . 1:1(0) ack 1 win 257
0.200 accept(3, ..., ...) = 4
+0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0

0.200 write(4, ..., 1460) = 1460
+0 setsockopt(4, SOL_SOCKET, 37, [2688], 4) = 0
0.200 write(4, ..., 13140) = 13140

0.200 > P. 1:1461(1460) ack 1
0.200 > . 1461:8761(7300) ack 1
0.200 > P. 8761:14601(5840) ack 1

0.300 < . 1:1(0) ack 1 win 257 <sack 1461:14601,nop,nop>
0.300 > P. 1:1461(1460) ack 1
0.400 < . 1:1(0) ack 14601 win 257

0.400 close(4) = 0
0.400 > F. 14601:14601(0) ack 1
0.500 < F. 1:1(0) ack 14602 win 257
0.500 > . 14602:14602(0) ack 2

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Tested-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     | 2 ++
 net/ipv4/tcp_input.c  | 1 +
 net/ipv4/tcp_output.c | 4 ++--
 3 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index b91370f61be6..6db10228113f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -552,6 +552,8 @@ void tcp_send_ack(struct sock *sk);
 void tcp_send_delayed_ack(struct sock *sk);
 void tcp_send_loss_probe(struct sock *sk);
 bool tcp_schedule_loss_probe(struct sock *sk);
+void tcp_skb_collapse_tstamp(struct sk_buff *skb,
+			     const struct sk_buff *next_skb);
 
 /* tcp_input.c */
 void tcp_resume_early_retransmit(struct sock *sk);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 0edb07185c25..c124c3c12f7c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1309,6 +1309,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 	if (skb == tcp_highest_sack(sk))
 		tcp_advance_highest_sack(sk, skb);
 
+	tcp_skb_collapse_tstamp(prev, skb);
 	tcp_unlink_write_queue(skb, sk);
 	sk_wmem_free_skb(sk, skb);
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5bc3c3062672..441ae9da3a23 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2441,8 +2441,8 @@ u32 __tcp_select_window(struct sock *sk)
 	return window;
 }
 
-static void tcp_skb_collapse_tstamp(struct sk_buff *skb,
-				    const struct sk_buff *next_skb)
+void tcp_skb_collapse_tstamp(struct sk_buff *skb,
+			     const struct sk_buff *next_skb)
 {
 	const struct skb_shared_info *next_shinfo = skb_shinfo(next_skb);
 	u8 tsflags = next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
-- 
cgit v1.2.3


From b4f70527f052b0c00be4d7cac562baa75b212df5 Mon Sep 17 00:00:00 2001
From: Simon Horman <simon.horman@netronome.com>
Date: Thu, 21 Apr 2016 11:49:15 +1000
Subject: openvswitch: use flow protocol when recalculating ipv6 checksums

When using masked actions the ipv6_proto field of an action
to set IPv6 fields may be zero rather than the prevailing protocol
which will result in skipping checksum recalculation.

This patch resolves the problem by relying on the protocol
in the flow key rather than that in the set field action.

Fixes: 83d2b9ba1abc ("net: openvswitch: Support masked set actions.")
Cc: Jarno Rajahalme <jrajahalme@nicira.com>
Signed-off-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/actions.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index e9dd47b2a85b..879185fe183f 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -461,7 +461,7 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key,
 		mask_ipv6_addr(saddr, key->ipv6_src, mask->ipv6_src, masked);
 
 		if (unlikely(memcmp(saddr, masked, sizeof(masked)))) {
-			set_ipv6_addr(skb, key->ipv6_proto, saddr, masked,
+			set_ipv6_addr(skb, flow_key->ip.proto, saddr, masked,
 				      true);
 			memcpy(&flow_key->ipv6.addr.src, masked,
 			       sizeof(flow_key->ipv6.addr.src));
@@ -483,7 +483,7 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key,
 							     NULL, &flags)
 					       != NEXTHDR_ROUTING);
 
-			set_ipv6_addr(skb, key->ipv6_proto, daddr, masked,
+			set_ipv6_addr(skb, flow_key->ip.proto, daddr, masked,
 				      recalc_csum);
 			memcpy(&flow_key->ipv6.addr.dst, masked,
 			       sizeof(flow_key->ipv6.addr.dst));
-- 
cgit v1.2.3