From 10a7ef33679073d13bf1dd05e3f1b7912f999543 Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Tue, 10 Oct 2017 20:59:38 -0700
Subject: ipsec: Fix dst leak in xfrm_bundle_create().

If we cannot find a suitable inner_mode value, we will leak
the currently allocated 'xdst'.

The fix is to make sure it is linked into the chain before
erroring out.

Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_policy.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index f06253969972..2746b62a8944 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1573,6 +1573,14 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 			goto put_states;
 		}
 
+		if (!dst_prev)
+			dst0 = dst1;
+		else
+			/* Ref count is taken during xfrm_alloc_dst()
+			 * No need to do dst_clone() on dst1
+			 */
+			dst_prev->child = dst1;
+
 		if (xfrm[i]->sel.family == AF_UNSPEC) {
 			inner_mode = xfrm_ip2inner_mode(xfrm[i],
 							xfrm_af2proto(family));
@@ -1584,14 +1592,6 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 		} else
 			inner_mode = xfrm[i]->inner_mode;
 
-		if (!dst_prev)
-			dst0 = dst1;
-		else
-			/* Ref count is taken during xfrm_alloc_dst()
-			 * No need to do dst_clone() on dst1
-			 */
-			dst_prev->child = dst1;
-
 		xdst->route = dst;
 		dst_copy_metrics(dst1, dst);
 
-- 
cgit v1.2.3


From 4c625a974fb81724e60966b677e47fcba782c950 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Fri, 13 Oct 2017 14:08:55 -0400
Subject: SUNRPC: fix a list corruption issue in xprt_release()

We remove the request from the receive list before we call
xprt_wait_on_pinned_rqst(), and so we need to use list_del_init().
Otherwise, we will see list corruption when xprt_complete_rqst()
is called.

Reported-by: Emre Celebi <emre@primarydata.com>
Fixes: ce7c252a8c741 ("SUNRPC: Add a separate spinlock to protect...")
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 net/sunrpc/xprt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index e741ec2b4d8e..1a39ad14c42f 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1333,7 +1333,7 @@ void xprt_release(struct rpc_task *task)
 		rpc_count_iostats(task, task->tk_client->cl_metrics);
 	spin_lock(&xprt->recv_lock);
 	if (!list_empty(&req->rq_list)) {
-		list_del(&req->rq_list);
+		list_del_init(&req->rq_list);
 		xprt_wait_on_pinned_rqst(req);
 	}
 	spin_unlock(&xprt->recv_lock);
-- 
cgit v1.2.3


From 2bdd713b92a9cade239d3c7d15205a09f556624d Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Tue, 17 Oct 2017 20:32:07 +0200
Subject: mac80211: use constant time comparison with keys

Otherwise we risk leaking information via timing side channel.

Fixes: fdf7cb4185b6 ("mac80211: accept key reinstall without changing anything")
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/key.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index ae995c8480db..035d16fe926e 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -19,6 +19,7 @@
 #include <linux/slab.h>
 #include <linux/export.h>
 #include <net/mac80211.h>
+#include <crypto/algapi.h>
 #include <asm/unaligned.h>
 #include "ieee80211_i.h"
 #include "driver-ops.h"
@@ -635,7 +636,7 @@ int ieee80211_key_link(struct ieee80211_key *key,
 	 * new version of the key to avoid nonce reuse or replay issues.
 	 */
 	if (old_key && key->conf.keylen == old_key->conf.keylen &&
-	    !memcmp(key->conf.key, old_key->conf.key, key->conf.keylen)) {
+	    !crypto_memneq(key->conf.key, old_key->conf.key, key->conf.keylen)) {
 		ieee80211_key_free_unused(key);
 		ret = 0;
 		goto out;
-- 
cgit v1.2.3


From 51e13359cd5ea34acc62c90627603352956380af Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 17 Oct 2017 21:56:20 +0200
Subject: cfg80211: fix connect/disconnect edge cases

If we try to connect while already connected/connecting, but
this fails, we set ssid_len=0 but leave current_bss hanging,
leading to errors.

Check all of this better, first of all ensuring that we can't
try to connect to a different SSID while connected/ing; ensure
that prev_bssid is set for re-association attempts even in the
case of the driver supporting the connect() method, and don't
reset ssid_len in the failure cases.

While at it, also reset ssid_len while disconnecting unless we
were connected and expect a disconnected event, and warn on a
successful connection without ssid_len being set.

Cc: stable@vger.kernel.org
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/sme.c | 50 +++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 41 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 0a49b88070d0..b6533ecbf5b1 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -522,11 +522,6 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev,
 		return -EOPNOTSUPP;
 
 	if (wdev->current_bss) {
-		if (!prev_bssid)
-			return -EALREADY;
-		if (prev_bssid &&
-		    !ether_addr_equal(prev_bssid, wdev->current_bss->pub.bssid))
-			return -ENOTCONN;
 		cfg80211_unhold_bss(wdev->current_bss);
 		cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub);
 		wdev->current_bss = NULL;
@@ -1063,11 +1058,35 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev,
 
 	ASSERT_WDEV_LOCK(wdev);
 
-	if (WARN_ON(wdev->connect_keys)) {
-		kzfree(wdev->connect_keys);
-		wdev->connect_keys = NULL;
+	/*
+	 * If we have an ssid_len, we're trying to connect or are
+	 * already connected, so reject a new SSID unless it's the
+	 * same (which is the case for re-association.)
+	 */
+	if (wdev->ssid_len &&
+	    (wdev->ssid_len != connect->ssid_len ||
+	     memcmp(wdev->ssid, connect->ssid, wdev->ssid_len)))
+		return -EALREADY;
+
+	/*
+	 * If connected, reject (re-)association unless prev_bssid
+	 * matches the current BSSID.
+	 */
+	if (wdev->current_bss) {
+		if (!prev_bssid)
+			return -EALREADY;
+		if (!ether_addr_equal(prev_bssid, wdev->current_bss->pub.bssid))
+			return -ENOTCONN;
 	}
 
+	/*
+	 * Reject if we're in the process of connecting with WEP,
+	 * this case isn't very interesting and trying to handle
+	 * it would make the code much more complex.
+	 */
+	if (wdev->connect_keys)
+		return -EINPROGRESS;
+
 	cfg80211_oper_and_ht_capa(&connect->ht_capa_mask,
 				  rdev->wiphy.ht_capa_mod_mask);
 
@@ -1118,7 +1137,12 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev,
 
 	if (err) {
 		wdev->connect_keys = NULL;
-		wdev->ssid_len = 0;
+		/*
+		 * This could be reassoc getting refused, don't clear
+		 * ssid_len in that case.
+		 */
+		if (!wdev->current_bss)
+			wdev->ssid_len = 0;
 		return err;
 	}
 
@@ -1145,6 +1169,14 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
 	else if (wdev->ssid_len)
 		err = rdev_disconnect(rdev, dev, reason);
 
+	/*
+	 * Clear ssid_len unless we actually were fully connected,
+	 * in which case cfg80211_disconnected() will take care of
+	 * this later.
+	 */
+	if (!wdev->current_bss)
+		wdev->ssid_len = 0;
+
 	return err;
 }
 
-- 
cgit v1.2.3


From e5f5ce37a7918ed7406c52987c7cc8b670ed5e14 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 18 Oct 2017 09:36:51 +0200
Subject: mac80211: validate user rate mask before configuring driver

Ben reported that when the user rate mask is rejected for not
matching any basic rate, the driver had already been configured.
This is clearly an oversight in my original change, fix this by
doing the validation before calling the driver.

Reported-by: Ben Greear <greearb@candelatech.com>
Fixes: e8e4f5280ddd ("mac80211: reject/clear user rate mask if not usable")
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/cfg.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index a354f1939e49..fb15d3b97cb2 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -2727,12 +2727,6 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy,
 	if (!ieee80211_sdata_running(sdata))
 		return -ENETDOWN;
 
-	if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) {
-		ret = drv_set_bitrate_mask(local, sdata, mask);
-		if (ret)
-			return ret;
-	}
-
 	/*
 	 * If active validate the setting and reject it if it doesn't leave
 	 * at least one basic rate usable, since we really have to be able
@@ -2748,6 +2742,12 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy,
 			return -EINVAL;
 	}
 
+	if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) {
+		ret = drv_set_bitrate_mask(local, sdata, mask);
+		if (ret)
+			return ret;
+	}
+
 	for (i = 0; i < NUM_NL80211_BANDS; i++) {
 		struct ieee80211_supported_band *sband = wiphy->bands[i];
 		int j;
-- 
cgit v1.2.3


From 528fd3547bad0bdd31c8f987e5bd00c83df8af39 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 19 Oct 2017 12:13:10 -0400
Subject: SUNRPC: Destroy transport from the system workqueue

The transport may need to flush transport connect and receive tasks
that are running on rpciod. In order to do so safely, we need to
ensure that the caller of cancel_work_sync() etc is not itself
running on rpciod.
Do so by running the destroy task from the system workqueue.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 net/sunrpc/xprt.c | 34 ++++++++++++++++++++++++----------
 1 file changed, 24 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 1a39ad14c42f..898485e3ece4 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1445,6 +1445,23 @@ out:
 	return xprt;
 }
 
+static void xprt_destroy_cb(struct work_struct *work)
+{
+	struct rpc_xprt *xprt =
+		container_of(work, struct rpc_xprt, task_cleanup);
+
+	rpc_xprt_debugfs_unregister(xprt);
+	rpc_destroy_wait_queue(&xprt->binding);
+	rpc_destroy_wait_queue(&xprt->pending);
+	rpc_destroy_wait_queue(&xprt->sending);
+	rpc_destroy_wait_queue(&xprt->backlog);
+	kfree(xprt->servername);
+	/*
+	 * Tear down transport state and free the rpc_xprt
+	 */
+	xprt->ops->destroy(xprt);
+}
+
 /**
  * xprt_destroy - destroy an RPC transport, killing off all requests.
  * @xprt: transport to destroy
@@ -1454,22 +1471,19 @@ static void xprt_destroy(struct rpc_xprt *xprt)
 {
 	dprintk("RPC:       destroying transport %p\n", xprt);
 
-	/* Exclude transport connect/disconnect handlers */
+	/*
+	 * Exclude transport connect/disconnect handlers and autoclose
+	 */
 	wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_UNINTERRUPTIBLE);
 
 	del_timer_sync(&xprt->timer);
 
-	rpc_xprt_debugfs_unregister(xprt);
-	rpc_destroy_wait_queue(&xprt->binding);
-	rpc_destroy_wait_queue(&xprt->pending);
-	rpc_destroy_wait_queue(&xprt->sending);
-	rpc_destroy_wait_queue(&xprt->backlog);
-	cancel_work_sync(&xprt->task_cleanup);
-	kfree(xprt->servername);
 	/*
-	 * Tear down transport state and free the rpc_xprt
+	 * Destroy sockets etc from the system workqueue so they can
+	 * safely flush receive work running on rpciod.
 	 */
-	xprt->ops->destroy(xprt);
+	INIT_WORK(&xprt->task_cleanup, xprt_destroy_cb);
+	schedule_work(&xprt->task_cleanup);
 }
 
 static void xprt_destroy_kref(struct kref *kref)
-- 
cgit v1.2.3


From 3a91d29f20276fa7cd4d0c9c7f3e78b30708159d Mon Sep 17 00:00:00 2001
From: Koichiro Den <den@klaipeden.com>
Date: Sun, 22 Oct 2017 13:13:16 +0900
Subject: tcp: do tcp_mstamp_refresh before retransmits on TSQ handler

When retransmission on TSQ handler was introduced in the commit
f9616c35a0d7 ("tcp: implement TSQ for retransmits"), the retransmitted
skbs' timestamps were updated on the actual transmission. In the later
commit 385e20706fac ("tcp: use tp->tcp_mstamp in output path"), it stops
being done so. In the commit, the comment says "We try to refresh
tp->tcp_mstamp only when necessary", and at present tcp_tsq_handler and
tcp_v4_mtu_reduced applies to this. About the latter, it's okay since
it's rare enough.

About the former, even though possible retransmissions on the tasklet
comes just after the destructor run in NET_RX softirq handling, the time
between them could be nonnegligibly large to the extent that
tcp_rack_advance or rto rearming be affected if other (remaining) RX,
BLOCK and (preceding) TASKLET sofirq handlings are unexpectedly heavy.

So in the same way as tcp_write_timer_handler does, doing tcp_mstamp_refresh
ensures the accuracy of algorithms relying on it.

Fixes: 385e20706fac ("tcp: use tp->tcp_mstamp in output path")
Signed-off-by: Koichiro Den <den@klaipeden.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0bc9e46a5369..973befc36fd4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -739,8 +739,10 @@ static void tcp_tsq_handler(struct sock *sk)
 		struct tcp_sock *tp = tcp_sk(sk);
 
 		if (tp->lost_out > tp->retrans_out &&
-		    tp->snd_cwnd > tcp_packets_in_flight(tp))
+		    tp->snd_cwnd > tcp_packets_in_flight(tp)) {
+			tcp_mstamp_refresh(tp);
 			tcp_xmit_retransmit_queue(sk);
+		}
 
 		tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
 			       0, GFP_ATOMIC);
-- 
cgit v1.2.3


From a6ca7abe53633d08eea1c6756cb49c9b2d4c90bf Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 22 Oct 2017 12:33:57 -0700
Subject: tcp/dccp: fix lockdep splat in inet_csk_route_req()

This patch fixes the following lockdep splat in inet_csk_route_req()

  lockdep_rcu_suspicious
  inet_csk_route_req
  tcp_v4_send_synack
  tcp_rtx_synack
  inet_rtx_syn_ack
  tcp_fastopen_synack_time
  tcp_retransmit_timer
  tcp_write_timer_handler
  tcp_write_timer
  call_timer_fn

Thread running inet_csk_route_req() owns a reference on the request
socket, so we have the guarantee ireq->ireq_opt wont be changed or
freed.

lockdep can enforce this invariant for us.

Fixes: c92e8c02fe66 ("tcp/dccp: fix ireq->opt races")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_connection_sock.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 5ec9136a7c36..18cd2eae758f 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -543,7 +543,8 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk,
 	struct ip_options_rcu *opt;
 	struct rtable *rt;
 
-	opt = rcu_dereference(ireq->ireq_opt);
+	opt = rcu_dereference_protected(ireq->ireq_opt,
+					refcount_read(&req->rsk_refcnt) > 0);
 	flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 			   sk->sk_protocol, inet_sk_flowi_flags(sk),
-- 
cgit v1.2.3


From 1137b5e2529a8f5ca8ee709288ecba3e68044df2 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 19 Oct 2017 20:51:10 +0800
Subject: ipsec: Fix aborted xfrm policy dump crash

An independent security researcher, Mohamed Ghannam, has reported
this vulnerability to Beyond Security's SecuriTeam Secure Disclosure
program.

The xfrm_dump_policy_done function expects xfrm_dump_policy to
have been called at least once or it will crash.  This can be
triggered if a dump fails because the target socket's receive
buffer is full.

This patch fixes it by using the cb->start mechanism to ensure that
the initialisation is always done regardless of the buffer situation.

Fixes: 12a169e7d8f4 ("ipsec: Put dumpers on the dump list")
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_user.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index b997f1395357..e44a0fed48dd 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1693,32 +1693,34 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr
 
 static int xfrm_dump_policy_done(struct netlink_callback *cb)
 {
-	struct xfrm_policy_walk *walk = (struct xfrm_policy_walk *) &cb->args[1];
+	struct xfrm_policy_walk *walk = (struct xfrm_policy_walk *)cb->args;
 	struct net *net = sock_net(cb->skb->sk);
 
 	xfrm_policy_walk_done(walk, net);
 	return 0;
 }
 
+static int xfrm_dump_policy_start(struct netlink_callback *cb)
+{
+	struct xfrm_policy_walk *walk = (struct xfrm_policy_walk *)cb->args;
+
+	BUILD_BUG_ON(sizeof(*walk) > sizeof(cb->args));
+
+	xfrm_policy_walk_init(walk, XFRM_POLICY_TYPE_ANY);
+	return 0;
+}
+
 static int xfrm_dump_policy(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct net *net = sock_net(skb->sk);
-	struct xfrm_policy_walk *walk = (struct xfrm_policy_walk *) &cb->args[1];
+	struct xfrm_policy_walk *walk = (struct xfrm_policy_walk *)cb->args;
 	struct xfrm_dump_info info;
 
-	BUILD_BUG_ON(sizeof(struct xfrm_policy_walk) >
-		     sizeof(cb->args) - sizeof(cb->args[0]));
-
 	info.in_skb = cb->skb;
 	info.out_skb = skb;
 	info.nlmsg_seq = cb->nlh->nlmsg_seq;
 	info.nlmsg_flags = NLM_F_MULTI;
 
-	if (!cb->args[0]) {
-		cb->args[0] = 1;
-		xfrm_policy_walk_init(walk, XFRM_POLICY_TYPE_ANY);
-	}
-
 	(void) xfrm_policy_walk(net, walk, dump_one_policy, &info);
 
 	return skb->len;
@@ -2474,6 +2476,7 @@ static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = {
 
 static const struct xfrm_link {
 	int (*doit)(struct sk_buff *, struct nlmsghdr *, struct nlattr **);
+	int (*start)(struct netlink_callback *);
 	int (*dump)(struct sk_buff *, struct netlink_callback *);
 	int (*done)(struct netlink_callback *);
 	const struct nla_policy *nla_pol;
@@ -2487,6 +2490,7 @@ static const struct xfrm_link {
 	[XFRM_MSG_NEWPOLICY   - XFRM_MSG_BASE] = { .doit = xfrm_add_policy    },
 	[XFRM_MSG_DELPOLICY   - XFRM_MSG_BASE] = { .doit = xfrm_get_policy    },
 	[XFRM_MSG_GETPOLICY   - XFRM_MSG_BASE] = { .doit = xfrm_get_policy,
+						   .start = xfrm_dump_policy_start,
 						   .dump = xfrm_dump_policy,
 						   .done = xfrm_dump_policy_done },
 	[XFRM_MSG_ALLOCSPI    - XFRM_MSG_BASE] = { .doit = xfrm_alloc_userspi },
@@ -2539,6 +2543,7 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 		{
 			struct netlink_dump_control c = {
+				.start = link->start,
 				.dump = link->dump,
 				.done = link->done,
 			};
-- 
cgit v1.2.3


From b71d21c274eff20a9db8158882b545b141b73ab8 Mon Sep 17 00:00:00 2001
From: Laszlo Toth <laszlth@gmail.com>
Date: Mon, 23 Oct 2017 19:19:33 +0200
Subject: sctp: full support for ipv6 ip_nonlocal_bind & IP_FREEBIND

Commit 9b9742022888 ("sctp: support ipv6 nonlocal bind")
introduced support for the above options as v4 sctp did,
so patched sctp_v6_available().

In the v4 implementation it's enough, because
sctp_inet_bind_verify() just returns with sctp_v4_available().
However sctp_inet6_bind_verify() has an extra check before that
for link-local scope_id, which won't respect the above options.

Added the checks before calling ipv6_chk_addr(), but
not before the validation of scope_id.

before (w/ both options):
 ./v6test fe80::10 sctp
 bind failed, errno: 99 (Cannot assign requested address)
 ./v6test fe80::10 tcp
 bind success, errno: 0 (Success)

after (w/ both options):
 ./v6test fe80::10 sctp
 bind success, errno: 0 (Success)

Signed-off-by: Laszlo Toth <laszlth@gmail.com>
Reviewed-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/ipv6.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 51c488769590..7fe9e1d1b7ec 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -882,8 +882,10 @@ static int sctp_inet6_bind_verify(struct sctp_sock *opt, union sctp_addr *addr)
 			net = sock_net(&opt->inet.sk);
 			rcu_read_lock();
 			dev = dev_get_by_index_rcu(net, addr->v6.sin6_scope_id);
-			if (!dev ||
-			    !ipv6_chk_addr(net, &addr->v6.sin6_addr, dev, 0)) {
+			if (!dev || !(opt->inet.freebind ||
+				      net->ipv6.sysctl.ip_nonlocal_bind ||
+				      ipv6_chk_addr(net, &addr->v6.sin6_addr,
+						    dev, 0))) {
 				rcu_read_unlock();
 				return 0;
 			}
-- 
cgit v1.2.3


From 829385f08ae99740276cbd46c9db29764c519211 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@quantonium.net>
Date: Fri, 20 Oct 2017 16:40:43 -0700
Subject: strparser: Use delayed work instead of timer for msg timeout

Sock lock may be taken in the message timer function which is a
problem since timers run in BH. Instead of timers use delayed_work.

Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Fixes: bbb03029a899 ("strparser: Generalize strparser")
Signed-off-by: Tom Herbert <tom@quantonium.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/strparser.h   |  3 +--
 net/strparser/strparser.c | 17 ++++++++---------
 2 files changed, 9 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/include/net/strparser.h b/include/net/strparser.h
index 7dc131d62ad5..d96b59f45eba 100644
--- a/include/net/strparser.h
+++ b/include/net/strparser.h
@@ -74,10 +74,9 @@ struct strparser {
 	u32 unrecov_intr : 1;
 
 	struct sk_buff **skb_nextp;
-	struct timer_list msg_timer;
 	struct sk_buff *skb_head;
 	unsigned int need_bytes;
-	struct delayed_work delayed_work;
+	struct delayed_work msg_timer_work;
 	struct work_struct work;
 	struct strp_stats stats;
 	struct strp_callbacks cb;
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index d4ea46a5f233..c5fda15ba319 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -49,7 +49,7 @@ static void strp_abort_strp(struct strparser *strp, int err)
 {
 	/* Unrecoverable error in receive */
 
-	del_timer(&strp->msg_timer);
+	cancel_delayed_work(&strp->msg_timer_work);
 
 	if (strp->stopped)
 		return;
@@ -68,7 +68,7 @@ static void strp_abort_strp(struct strparser *strp, int err)
 static void strp_start_timer(struct strparser *strp, long timeo)
 {
 	if (timeo)
-		mod_timer(&strp->msg_timer, timeo);
+		mod_delayed_work(strp_wq, &strp->msg_timer_work, timeo);
 }
 
 /* Lower lock held */
@@ -319,7 +319,7 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 		eaten += (cand_len - extra);
 
 		/* Hurray, we have a new message! */
-		del_timer(&strp->msg_timer);
+		cancel_delayed_work(&strp->msg_timer_work);
 		strp->skb_head = NULL;
 		STRP_STATS_INCR(strp->stats.msgs);
 
@@ -450,9 +450,10 @@ static void strp_work(struct work_struct *w)
 	do_strp_work(container_of(w, struct strparser, work));
 }
 
-static void strp_msg_timeout(unsigned long arg)
+static void strp_msg_timeout(struct work_struct *w)
 {
-	struct strparser *strp = (struct strparser *)arg;
+	struct strparser *strp = container_of(w, struct strparser,
+					      msg_timer_work.work);
 
 	/* Message assembly timed out */
 	STRP_STATS_INCR(strp->stats.msg_timeouts);
@@ -505,9 +506,7 @@ int strp_init(struct strparser *strp, struct sock *sk,
 	strp->cb.read_sock_done = cb->read_sock_done ? : default_read_sock_done;
 	strp->cb.abort_parser = cb->abort_parser ? : strp_abort_strp;
 
-	setup_timer(&strp->msg_timer, strp_msg_timeout,
-		    (unsigned long)strp);
-
+	INIT_DELAYED_WORK(&strp->msg_timer_work, strp_msg_timeout);
 	INIT_WORK(&strp->work, strp_work);
 
 	return 0;
@@ -532,7 +531,7 @@ void strp_done(struct strparser *strp)
 {
 	WARN_ON(!strp->stopped);
 
-	del_timer_sync(&strp->msg_timer);
+	cancel_delayed_work_sync(&strp->msg_timer_work);
 	cancel_work_sync(&strp->work);
 
 	if (strp->skb_head) {
-- 
cgit v1.2.3


From 3eb8feeb1708c7dbfd2e97df92a2a407c116606e Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Tue, 24 Oct 2017 16:37:19 -0400
Subject: net: dsa: check master device before put

In the case of pdata, the dsa_cpu_parse function calls dev_put() before
making sure it isn't NULL. Fix this.

Fixes: 71e0bbde0d88 ("net: dsa: Add support for platform data")
Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/dsa2.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 873af0108e24..045d8a176279 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -496,14 +496,15 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index,
 		if (!ethernet)
 			return -EINVAL;
 		ethernet_dev = of_find_net_device_by_node(ethernet);
+		if (!ethernet_dev)
+			return -EPROBE_DEFER;
 	} else {
 		ethernet_dev = dsa_dev_to_net_device(ds->cd->netdev[index]);
+		if (!ethernet_dev)
+			return -EPROBE_DEFER;
 		dev_put(ethernet_dev);
 	}
 
-	if (!ethernet_dev)
-		return -EPROBE_DEFER;
-
 	if (!dst->cpu_dp) {
 		dst->cpu_dp = port;
 		dst->cpu_dp->netdev = ethernet_dev;
-- 
cgit v1.2.3


From cfbb0d90a7abb289edc91833d0905931f8805f12 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 24 Oct 2017 21:12:13 +0200
Subject: mac80211: don't compare TKIP TX MIC key in reinstall prevention

For the reinstall prevention, the code I had added compares the
whole key. It turns out though that iwlwifi firmware doesn't
provide the TKIP TX MIC key as it's not needed in client mode,
and thus the comparison will always return false.

For client mode, thus always zero out the TX MIC key part before
doing the comparison in order to avoid accepting the reinstall
of the key with identical encryption and RX MIC key, but not the
same TX MIC key (since the supplicant provides the real one.)

Fixes: fdf7cb4185b6 ("mac80211: accept key reinstall without changing anything")
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/key.c | 36 ++++++++++++++++++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index 035d16fe926e..938049395f90 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -610,6 +610,39 @@ void ieee80211_key_free_unused(struct ieee80211_key *key)
 	ieee80211_key_free_common(key);
 }
 
+static bool ieee80211_key_identical(struct ieee80211_sub_if_data *sdata,
+				    struct ieee80211_key *old,
+				    struct ieee80211_key *new)
+{
+	u8 tkip_old[WLAN_KEY_LEN_TKIP], tkip_new[WLAN_KEY_LEN_TKIP];
+	u8 *tk_old, *tk_new;
+
+	if (!old || new->conf.keylen != old->conf.keylen)
+		return false;
+
+	tk_old = old->conf.key;
+	tk_new = new->conf.key;
+
+	/*
+	 * In station mode, don't compare the TX MIC key, as it's never used
+	 * and offloaded rekeying may not care to send it to the host. This
+	 * is the case in iwlwifi, for example.
+	 */
+	if (sdata->vif.type == NL80211_IFTYPE_STATION &&
+	    new->conf.cipher == WLAN_CIPHER_SUITE_TKIP &&
+	    new->conf.keylen == WLAN_KEY_LEN_TKIP &&
+	    !(new->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE)) {
+		memcpy(tkip_old, tk_old, WLAN_KEY_LEN_TKIP);
+		memcpy(tkip_new, tk_new, WLAN_KEY_LEN_TKIP);
+		memset(tkip_old + NL80211_TKIP_DATA_OFFSET_TX_MIC_KEY, 0, 8);
+		memset(tkip_new + NL80211_TKIP_DATA_OFFSET_TX_MIC_KEY, 0, 8);
+		tk_old = tkip_old;
+		tk_new = tkip_new;
+	}
+
+	return !crypto_memneq(tk_old, tk_new, new->conf.keylen);
+}
+
 int ieee80211_key_link(struct ieee80211_key *key,
 		       struct ieee80211_sub_if_data *sdata,
 		       struct sta_info *sta)
@@ -635,8 +668,7 @@ int ieee80211_key_link(struct ieee80211_key *key,
 	 * Silently accept key re-installation without really installing the
 	 * new version of the key to avoid nonce reuse or replay issues.
 	 */
-	if (old_key && key->conf.keylen == old_key->conf.keylen &&
-	    !crypto_memneq(key->conf.key, old_key->conf.key, key->conf.keylen)) {
+	if (ieee80211_key_identical(sdata, old_key, key)) {
 		ieee80211_key_free_unused(key);
 		ret = 0;
 		goto out;
-- 
cgit v1.2.3


From 0f5da659d8f1810f44de14acf2c80cd6499623a0 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@openvz.org>
Date: Wed, 25 Oct 2017 10:16:42 -0700
Subject: net/unix: don't show information about sockets from other namespaces

socket_diag shows information only about sockets from a namespace where
a diag socket lives.

But if we request information about one unix socket, the kernel don't
check that its netns is matched with a diag socket namespace, so any
user can get information about any unix socket in a system. This looks
like a bug.

v2: add a Fixes tag

Fixes: 51d7cccf0723 ("net: make sock diag per-namespace")
Signed-off-by: Andrei Vagin <avagin@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/diag.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/unix/diag.c b/net/unix/diag.c
index 4d9679701a6d..384c84e83462 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -257,6 +257,8 @@ static int unix_diag_get_exact(struct sk_buff *in_skb,
 	err = -ENOENT;
 	if (sk == NULL)
 		goto out_nosk;
+	if (!net_eq(sock_net(sk), net))
+		goto out;
 
 	err = sock_diag_check_cookie(sk, req->udiag_cookie);
 	if (err)
-- 
cgit v1.2.3


From e9a0b99804ff662d02b78a556a84e22308066fe1 Mon Sep 17 00:00:00 2001
From: Håkon Bugge <Haakon.Bugge@oracle.com>
Date: Tue, 24 Oct 2017 18:17:18 +0200
Subject: rds: ib: Fix uninitialized variable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

send_flags needs to be initialized before calling
rds_ib_set_wr_signal_state().

Signed-off-by: Håkon Bugge <haakon.bugge@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/ib_send.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 6ab39dbcca01..8f46755477ae 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -792,6 +792,7 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
 		send->s_atomic_wr.compare_add_mask = op->op_m_fadd.nocarry_mask;
 		send->s_atomic_wr.swap_mask = 0;
 	}
+	send->s_wr.send_flags = 0;
 	nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify);
 	send->s_atomic_wr.wr.num_sge = 1;
 	send->s_atomic_wr.wr.next = NULL;
-- 
cgit v1.2.3


From a0c0865fa0abcbc142c11fabec3a2bffc1a4229d Mon Sep 17 00:00:00 2001
From: Håkon Bugge <Haakon.Bugge@oracle.com>
Date: Tue, 24 Oct 2017 16:16:28 +0200
Subject: rds: Fix inaccurate accounting of unsignaled wrs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The number of unsignaled work-requests posted to the IB send queue is
tracked by a counter in the rds_ib_connection struct. When it reaches
zero, or the caller explicitly asks for it, the send-signaled bit is
set in send_flags and the counter is reset. This is performed by the
rds_ib_set_wr_signal_state() function.

However, this function is not always used which yields inaccurate
accounting. This commit fixes this, re-factors a code bloat related to
the matter, and makes the actual parameter type to the function
consistent.

Signed-off-by: Håkon Bugge <haakon.bugge@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/ib_send.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 8f46755477ae..8557a1cae041 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -661,13 +661,15 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 			}
 		}
 
-		rds_ib_set_wr_signal_state(ic, send, 0);
+		rds_ib_set_wr_signal_state(ic, send, false);
 
 		/*
 		 * Always signal the last one if we're stopping due to flow control.
 		 */
-		if (ic->i_flowctl && flow_controlled && i == (work_alloc-1))
-			send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+		if (ic->i_flowctl && flow_controlled && i == (work_alloc - 1)) {
+			rds_ib_set_wr_signal_state(ic, send, true);
+			send->s_wr.send_flags |= IB_SEND_SOLICITED;
+		}
 
 		if (send->s_wr.send_flags & IB_SEND_SIGNALED)
 			nr_sig++;
@@ -705,11 +707,8 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 	if (scat == &rm->data.op_sg[rm->data.op_count]) {
 		prev->s_op = ic->i_data_op;
 		prev->s_wr.send_flags |= IB_SEND_SOLICITED;
-		if (!(prev->s_wr.send_flags & IB_SEND_SIGNALED)) {
-			ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
-			prev->s_wr.send_flags |= IB_SEND_SIGNALED;
-			nr_sig++;
-		}
+		if (!(prev->s_wr.send_flags & IB_SEND_SIGNALED))
+			nr_sig += rds_ib_set_wr_signal_state(ic, prev, true);
 		ic->i_data_op = NULL;
 	}
 
-- 
cgit v1.2.3


From 06f877d613be3621604c2520ec0351d9fbdca15f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 24 Oct 2017 08:20:31 -0700
Subject: tcp/dccp: fix other lockdep splats accessing ireq_opt
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In my first attempt to fix the lockdep splat, I forgot we could
enter inet_csk_route_req() with a freshly allocated request socket,
for which refcount has not yet been elevated, due to complex
SLAB_TYPESAFE_BY_RCU rules.

We either are in rcu_read_lock() section _or_ we own a refcount on the
request.

Correct RCU verb to use here is rcu_dereference_check(), although it is
not possible to prove we actually own a reference on a shared
refcount :/

In v2, I added ireq_opt_deref() helper and use in three places, to fix other
possible splats.

[   49.844590]  lockdep_rcu_suspicious+0xea/0xf3
[   49.846487]  inet_csk_route_req+0x53/0x14d
[   49.848334]  tcp_v4_route_req+0xe/0x10
[   49.850174]  tcp_conn_request+0x31c/0x6a0
[   49.851992]  ? __lock_acquire+0x614/0x822
[   49.854015]  tcp_v4_conn_request+0x5a/0x79
[   49.855957]  ? tcp_v4_conn_request+0x5a/0x79
[   49.858052]  tcp_rcv_state_process+0x98/0xdcc
[   49.859990]  ? sk_filter_trim_cap+0x2f6/0x307
[   49.862085]  tcp_v4_do_rcv+0xfc/0x145
[   49.864055]  ? tcp_v4_do_rcv+0xfc/0x145
[   49.866173]  tcp_v4_rcv+0x5ab/0xaf9
[   49.868029]  ip_local_deliver_finish+0x1af/0x2e7
[   49.870064]  ip_local_deliver+0x1b2/0x1c5
[   49.871775]  ? inet_del_offload+0x45/0x45
[   49.873916]  ip_rcv_finish+0x3f7/0x471
[   49.875476]  ip_rcv+0x3f1/0x42f
[   49.876991]  ? ip_local_deliver_finish+0x2e7/0x2e7
[   49.878791]  __netif_receive_skb_core+0x6d3/0x950
[   49.880701]  ? process_backlog+0x7e/0x216
[   49.882589]  __netif_receive_skb+0x1d/0x5e
[   49.884122]  process_backlog+0x10c/0x216
[   49.885812]  net_rx_action+0x147/0x3df

Fixes: a6ca7abe53633 ("tcp/dccp: fix lockdep splat in inet_csk_route_req()")
Fixes: c92e8c02fe66 ("tcp/dccp: fix ireq->opt races")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: kernel test robot <fengguang.wu@intel.com>
Reported-by: Maciej Żenczykowski <maze@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_sock.h         | 6 ++++++
 net/dccp/ipv4.c                 | 2 +-
 net/ipv4/inet_connection_sock.c | 4 ++--
 net/ipv4/tcp_ipv4.c             | 2 +-
 4 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 425752f768d2..db8162dd8c0b 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -132,6 +132,12 @@ static inline int inet_request_bound_dev_if(const struct sock *sk,
 	return sk->sk_bound_dev_if;
 }
 
+static inline struct ip_options_rcu *ireq_opt_deref(const struct inet_request_sock *ireq)
+{
+	return rcu_dereference_check(ireq->ireq_opt,
+				     refcount_read(&ireq->req.rsk_refcnt) > 0);
+}
+
 struct inet_cork {
 	unsigned int		flags;
 	__be32			addr;
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 0490916864f9..e65fcb45c3f6 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -495,7 +495,7 @@ static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req
 							      ireq->ir_rmt_addr);
 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 					    ireq->ir_rmt_addr,
-					    rcu_dereference(ireq->ireq_opt));
+					    ireq_opt_deref(ireq));
 		err = net_xmit_eval(err);
 	}
 
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 18cd2eae758f..b47a59cb3573 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -543,8 +543,8 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk,
 	struct ip_options_rcu *opt;
 	struct rtable *rt;
 
-	opt = rcu_dereference_protected(ireq->ireq_opt,
-					refcount_read(&req->rsk_refcnt) > 0);
+	opt = ireq_opt_deref(ireq);
+
 	flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 			   sk->sk_protocol, inet_sk_flowi_flags(sk),
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4c43365c374c..5b027c69cbc5 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -877,7 +877,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 
 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 					    ireq->ir_rmt_addr,
-					    rcu_dereference(ireq->ireq_opt));
+					    ireq_opt_deref(ireq));
 		err = net_xmit_eval(err);
 	}
 
-- 
cgit v1.2.3


From 5889e2c0e441d84060e66211ed5c4517ca591167 Mon Sep 17 00:00:00 2001
From: Yousuk Seung <ysseung@google.com>
Date: Tue, 24 Oct 2017 16:44:42 -0700
Subject: tcp: call tcp_rate_skb_sent() when retransmit with unaligned
 skb->data

Current implementation calls tcp_rate_skb_sent() when tcp_transmit_skb()
is called when it clones skb only. Not calling tcp_rate_skb_sent() is OK
for all such code paths except from __tcp_retransmit_skb() which happens
when skb->data address is not aligned. This may rarely happen e.g. when
small amount of data is sent initially and the receiver partially acks
odd number of bytes for some reason, possibly malicious.

Signed-off-by: Yousuk Seung <ysseung@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 973befc36fd4..1151870018e3 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2843,8 +2843,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 		nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
 		err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
 			     -ENOBUFS;
-		if (!err)
+		if (!err) {
 			skb->skb_mstamp = tp->tcp_mstamp;
+			tcp_rate_skb_sent(sk, skb);
+		}
 	} else {
 		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
 	}
-- 
cgit v1.2.3


From f3594f0a7ea36661d7fd942facd7f31a64245f1a Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Thu, 26 Oct 2017 19:19:56 +0800
Subject: ipip: only increase err_count for some certain type icmp in ipip_err

t->err_count is used to count the link failure on tunnel and an err
will be reported to user socket in tx path if t->err_count is not 0.
udp socket could even return EHOSTUNREACH to users.

Since commit fd58156e456d ("IPIP: Use ip-tunneling code.") removed
the 'switch check' for icmp type in ipip_err(), err_count would be
increased by the icmp packet with ICMP_EXC_FRAGTIME code. an link
failure would be reported out due to this.

In Jianlin's case, when receiving ICMP_EXC_FRAGTIME a icmp packet,
udp netperf failed with the err:
  send_data: data send error: No route to host (errno 113)

We expect this error reported from tunnel to socket when receiving
some certain type icmp, but not ICMP_EXC_FRAGTIME, ICMP_SR_FAILED
or ICMP_PARAMETERPROB ones.

This patch is to bring 'switch check' for icmp type back to ipip_err
so that it only reports link failure for the right type icmp, just as
in ipgre_err() and ipip6_err().

Fixes: fd58156e456d ("IPIP: Use ip-tunneling code.")
Reported-by: Jianlin Shi <jishi@redhat.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipip.c | 59 ++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 42 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index fb1ad22b5e29..cdd627355ed1 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -128,43 +128,68 @@ static struct rtnl_link_ops ipip_link_ops __read_mostly;
 
 static int ipip_err(struct sk_buff *skb, u32 info)
 {
-
-/* All the routers (except for Linux) return only
-   8 bytes of packet payload. It means, that precise relaying of
-   ICMP in the real Internet is absolutely infeasible.
- */
+	/* All the routers (except for Linux) return only
+	 * 8 bytes of packet payload. It means, that precise relaying of
+	 * ICMP in the real Internet is absolutely infeasible.
+	 */
 	struct net *net = dev_net(skb->dev);
 	struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
 	const struct iphdr *iph = (const struct iphdr *)skb->data;
-	struct ip_tunnel *t;
-	int err;
 	const int type = icmp_hdr(skb)->type;
 	const int code = icmp_hdr(skb)->code;
+	struct ip_tunnel *t;
+	int err = 0;
+
+	switch (type) {
+	case ICMP_DEST_UNREACH:
+		switch (code) {
+		case ICMP_SR_FAILED:
+			/* Impossible event. */
+			goto out;
+		default:
+			/* All others are translated to HOST_UNREACH.
+			 * rfc2003 contains "deep thoughts" about NET_UNREACH,
+			 * I believe they are just ether pollution. --ANK
+			 */
+			break;
+		}
+		break;
+
+	case ICMP_TIME_EXCEEDED:
+		if (code != ICMP_EXC_TTL)
+			goto out;
+		break;
+
+	case ICMP_REDIRECT:
+		break;
+
+	default:
+		goto out;
+	}
 
-	err = -ENOENT;
 	t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
 			     iph->daddr, iph->saddr, 0);
-	if (!t)
+	if (!t) {
+		err = -ENOENT;
 		goto out;
+	}
 
 	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
-		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
-				 t->parms.link, 0, iph->protocol, 0);
-		err = 0;
+		ipv4_update_pmtu(skb, net, info, t->parms.link, 0,
+				 iph->protocol, 0);
 		goto out;
 	}
 
 	if (type == ICMP_REDIRECT) {
-		ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
-			      iph->protocol, 0);
-		err = 0;
+		ipv4_redirect(skb, net, t->parms.link, 0, iph->protocol, 0);
 		goto out;
 	}
 
-	if (t->parms.iph.daddr == 0)
+	if (t->parms.iph.daddr == 0) {
+		err = -ENOENT;
 		goto out;
+	}
 
-	err = 0;
 	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
 		goto out;
 
-- 
cgit v1.2.3


From f8d20b46ce55cf40afb30dcef6d9288f7ef46d9b Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Thu, 26 Oct 2017 19:23:27 +0800
Subject: ip6_gre: only increase err_count for some certain type icmpv6 in
 ip6gre_err

The similar fix in patch 'ipip: only increase err_count for some
certain type icmp in ipip_err' is needed for ip6gre_err.

In Jianlin's case, udp netperf broke even when receiving a TooBig
icmpv6 packet.

Fixes: c12b395a4664 ("gre: Support GRE over IPv6")
Reported-by: Jianlin Shi <jishi@redhat.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 1602b491b281..fb595e8dc15b 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -408,13 +408,16 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	case ICMPV6_DEST_UNREACH:
 		net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n",
 				    t->parms.name);
-		break;
+		if (code != ICMPV6_PORT_UNREACH)
+			break;
+		return;
 	case ICMPV6_TIME_EXCEED:
 		if (code == ICMPV6_EXC_HOPLIMIT) {
 			net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n",
 					    t->parms.name);
+			break;
 		}
-		break;
+		return;
 	case ICMPV6_PARAMPROB:
 		teli = 0;
 		if (code == ICMPV6_HDR_FIELD)
@@ -430,7 +433,7 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n",
 					    t->parms.name);
 		}
-		break;
+		return;
 	case ICMPV6_PKT_TOOBIG:
 		mtu = be32_to_cpu(info) - offset - t->tun_hlen;
 		if (t->dev->type == ARPHRD_ETHER)
@@ -438,7 +441,7 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		if (mtu < IPV6_MIN_MTU)
 			mtu = IPV6_MIN_MTU;
 		t->dev->mtu = mtu;
-		break;
+		return;
 	}
 
 	if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO))
-- 
cgit v1.2.3


From 8aec4959d832bae0889a8e2f348973b5e4abffef Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Thu, 26 Oct 2017 19:27:17 +0800
Subject: ip6_gre: update dst pmtu if dev mtu has been updated by toobig in
 __gre6_xmit

When receiving a Toobig icmpv6 packet, ip6gre_err would just set
tunnel dev's mtu, that's not enough. For skb_dst(skb)'s pmtu may
still be using the old value, it has no chance to be updated with
tunnel dev's mtu.

Jianlin found this issue by reducing route's mtu while running
netperf, the performance went to 0.

ip6ip6 and ip4ip6 tunnel can work well with this, as they lookup
the upper dst and update_pmtu it's pmtu or icmpv6_send a Toobig
to upper socket after setting tunnel dev's mtu.

We couldn't do that for ip6_gre, as gre's inner packet could be
any protocol, it's difficult to handle them (like lookup upper
dst) in a good way.

So this patch is to fix it by updating skb_dst(skb)'s pmtu when
dev->mtu < skb_dst(skb)'s pmtu in tx path. It's safe to do this
update there, as usually dev->mtu <= skb_dst(skb)'s pmtu and no
performance regression can be caused by this.

Fixes: c12b395a4664 ("gre: Support GRE over IPv6")
Reported-by: Jianlin Shi <jishi@redhat.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index fb595e8dc15b..59c121b932ac 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -503,8 +503,8 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
 			       __u32 *pmtu, __be16 proto)
 {
 	struct ip6_tnl *tunnel = netdev_priv(dev);
-	__be16 protocol = (dev->type == ARPHRD_ETHER) ?
-			  htons(ETH_P_TEB) : proto;
+	struct dst_entry *dst = skb_dst(skb);
+	__be16 protocol;
 
 	if (dev->type == ARPHRD_ETHER)
 		IPCB(skb)->flags = 0;
@@ -518,9 +518,14 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
 		tunnel->o_seqno++;
 
 	/* Push GRE header. */
+	protocol = (dev->type == ARPHRD_ETHER) ? htons(ETH_P_TEB) : proto;
 	gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags,
 			 protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno));
 
+	/* TooBig packet may have updated dst->dev's mtu */
+	if (dst && dst_mtu(dst) > dst->dev->mtu)
+		dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu);
+
 	return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu,
 			    NEXTHDR_GRE);
 }
-- 
cgit v1.2.3


From ee1836aec4f5a977c1699a311db4d9027ef21ac8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 26 Oct 2017 21:21:40 -0700
Subject: tcp: refresh tp timestamp before tcp_mtu_probe()

In the unlikely event tcp_mtu_probe() is sending a packet, we
want tp->tcp_mstamp being as accurate as possible.

This means we need to call tcp_mstamp_refresh() a bit earlier in
tcp_write_xmit().

Fixes: 385e20706fac ("tcp: use tp->tcp_mstamp in output path")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 1151870018e3..ae60dd3faed0 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2239,6 +2239,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 
 	sent_pkts = 0;
 
+	tcp_mstamp_refresh(tp);
 	if (!push_one) {
 		/* Do MTU probing. */
 		result = tcp_mtu_probe(sk);
@@ -2250,7 +2251,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	}
 
 	max_segs = tcp_tso_segs(sk, mss_now);
-	tcp_mstamp_refresh(tp);
 	while ((skb = tcp_send_head(sk))) {
 		unsigned int limit;
 
-- 
cgit v1.2.3


From 8108a77515126f6db4374e8593956e20430307c0 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Fri, 27 Oct 2017 09:45:34 -0700
Subject: bpf: bpf_compute_data uses incorrect cb structure

SK_SKB program types use bpf_compute_data to store the end of the
packet data. However, bpf_compute_data assumes the cb is stored in the
qdisc layer format. But, for SK_SKB this is the wrong layer of the
stack for this type.

It happens to work (sort of!) because in most cases nothing happens
to be overwritten today. This is very fragile and error prone.
Fortunately, we have another hole in tcp_skb_cb we can use so lets
put the data_end value there.

Note, SK_SKB program types do not use data_meta, they are failed by
sk_skb_is_valid_access().

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h    |  1 +
 kernel/bpf/sockmap.c | 12 ++++++++++--
 net/core/filter.c    | 27 ++++++++++++++++++++++++++-
 3 files changed, 37 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index b1ef98ebce53..33599d17522d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -844,6 +844,7 @@ struct tcp_skb_cb {
 			__u32 key;
 			__u32 flags;
 			struct bpf_map *map;
+			void *data_end;
 		} bpf;
 	};
 };
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 2b6eb35ae5d3..6778fb773934 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -93,6 +93,14 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
 	return rcu_dereference_sk_user_data(sk);
 }
 
+/* compute the linear packet data range [data, data_end) for skb when
+ * sk_skb type programs are in use.
+ */
+static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
+{
+	TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
+}
+
 static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 {
 	struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
@@ -108,7 +116,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 	 */
 	TCP_SKB_CB(skb)->bpf.map = NULL;
 	skb->sk = psock->sock;
-	bpf_compute_data_end(skb);
+	bpf_compute_data_end_sk_skb(skb);
 	preempt_disable();
 	rc = (*prog->bpf_func)(skb, prog->insnsi);
 	preempt_enable();
@@ -368,7 +376,7 @@ static int smap_parse_func_strparser(struct strparser *strp,
 	 * any socket yet.
 	 */
 	skb->sk = psock->sock;
-	bpf_compute_data_end(skb);
+	bpf_compute_data_end_sk_skb(skb);
 	rc = (*prog->bpf_func)(skb, prog->insnsi);
 	skb->sk = NULL;
 	rcu_read_unlock();
diff --git a/net/core/filter.c b/net/core/filter.c
index aa0265997f93..68eaa2f81a8e 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4243,6 +4243,31 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 	return insn - insn_buf;
 }
 
+static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
+				     const struct bpf_insn *si,
+				     struct bpf_insn *insn_buf,
+				     struct bpf_prog *prog, u32 *target_size)
+{
+	struct bpf_insn *insn = insn_buf;
+	int off;
+
+	switch (si->off) {
+	case offsetof(struct __sk_buff, data_end):
+		off  = si->off;
+		off -= offsetof(struct __sk_buff, data_end);
+		off += offsetof(struct sk_buff, cb);
+		off += offsetof(struct tcp_skb_cb, bpf.data_end);
+		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
+				      si->src_reg, off);
+		break;
+	default:
+		return bpf_convert_ctx_access(type, si, insn_buf, prog,
+					      target_size);
+	}
+
+	return insn - insn_buf;
+}
+
 const struct bpf_verifier_ops sk_filter_prog_ops = {
 	.get_func_proto		= sk_filter_func_proto,
 	.is_valid_access	= sk_filter_is_valid_access,
@@ -4301,7 +4326,7 @@ const struct bpf_verifier_ops sock_ops_prog_ops = {
 const struct bpf_verifier_ops sk_skb_prog_ops = {
 	.get_func_proto		= sk_skb_func_proto,
 	.is_valid_access	= sk_skb_is_valid_access,
-	.convert_ctx_access	= bpf_convert_ctx_access,
+	.convert_ctx_access	= sk_skb_convert_ctx_access,
 	.gen_prologue		= sk_skb_prologue,
 };
 
-- 
cgit v1.2.3


From bfa640757e9378c2f26867e723f1287e94f5a7ad Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Fri, 27 Oct 2017 09:45:53 -0700
Subject: bpf: rename sk_actions to align with bpf infrastructure

Recent additions to support multiple programs in cgroups impose
a strict requirement, "all yes is yes, any no is no". To enforce
this the infrastructure requires the 'no' return code, SK_DROP in
this case, to be 0.

To apply these rules to SK_SKB program types the sk_actions return
codes need to be adjusted.

This fix adds SK_PASS and makes 'SK_DROP = 0'. Finally, remove
SK_ABORTED to remove any chance that the API may allow aborted
program flows to be passed up the stack. This would be incorrect
behavior and allow programs to break existing policies.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h       | 6 +++---
 kernel/bpf/sockmap.c           | 3 ++-
 net/core/filter.c              | 5 +++--
 tools/include/uapi/linux/bpf.h | 4 ++--
 4 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f90860d1f897..0d7948ce2128 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -575,7 +575,7 @@ union bpf_attr {
  *     @map: pointer to sockmap
  *     @key: key to lookup sock in map
  *     @flags: reserved for future use
- *     Return: SK_REDIRECT
+ *     Return: SK_PASS
  *
  * int bpf_sock_map_update(skops, map, key, flags)
  *	@skops: pointer to bpf_sock_ops
@@ -786,8 +786,8 @@ struct xdp_md {
 };
 
 enum sk_action {
-	SK_ABORTED = 0,
-	SK_DROP,
+	SK_DROP = 0,
+	SK_PASS,
 	SK_REDIRECT,
 };
 
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 6778fb773934..66f00a2b27f4 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -122,7 +122,8 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 	preempt_enable();
 	skb->sk = NULL;
 
-	return rc;
+	return rc == SK_PASS ?
+		(TCP_SKB_CB(skb)->bpf.map ? SK_REDIRECT : SK_PASS) : SK_DROP;
 }
 
 static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
diff --git a/net/core/filter.c b/net/core/filter.c
index 68eaa2f81a8e..6ae94f825f72 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1844,14 +1844,15 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
 {
 	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
 
+	/* If user passes invalid input drop the packet. */
 	if (unlikely(flags))
-		return SK_ABORTED;
+		return SK_DROP;
 
 	tcb->bpf.key = key;
 	tcb->bpf.flags = flags;
 	tcb->bpf.map = map;
 
-	return SK_REDIRECT;
+	return SK_PASS;
 }
 
 struct sock *do_sk_redirect_map(struct sk_buff *skb)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 24b35a1fd4d6..c174971afbe6 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -787,8 +787,8 @@ struct xdp_md {
 };
 
 enum sk_action {
-	SK_ABORTED = 0,
-	SK_DROP,
+	SK_DROP = 0,
+	SK_PASS,
 	SK_REDIRECT,
 };
 
-- 
cgit v1.2.3


From d04adf1b355181e737b6b1e23d801b07f0b7c4c0 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sat, 28 Oct 2017 02:13:29 +0800
Subject: sctp: reset owner sk for data chunks on out queues when migrating a
 sock

Now when migrating sock to another one in sctp_sock_migrate(), it only
resets owner sk for the data in receive queues, not the chunks on out
queues.

It would cause that data chunks length on the sock is not consistent
with sk sk_wmem_alloc. When closing the sock or freeing these chunks,
the old sk would never be freed, and the new sock may crash due to
the overflow sk_wmem_alloc.

syzbot found this issue with this series:

  r0 = socket$inet_sctp()
  sendto$inet(r0)
  listen(r0)
  accept4(r0)
  close(r0)

Although listen() should have returned error when one TCP-style socket
is in connecting (I may fix this one in another patch), it could also
be reproduced by peeling off an assoc.

This issue is there since very beginning.

This patch is to reset owner sk for the chunks on out queues so that
sk sk_wmem_alloc has correct value after accept one sock or peeloff
an assoc to one sock.

Note that when resetting owner sk for chunks on outqueue, it has to
sctp_clear_owner_w/skb_orphan chunks before changing assoc->base.sk
first and then sctp_set_owner_w them after changing assoc->base.sk,
due to that sctp_wfree and it's callees are using assoc->base.sk.

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

(limited to 'net')

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 17841ab30798..6f45d1713452 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -170,6 +170,36 @@ static inline void sctp_set_owner_w(struct sctp_chunk *chunk)
 	sk_mem_charge(sk, chunk->skb->truesize);
 }
 
+static void sctp_clear_owner_w(struct sctp_chunk *chunk)
+{
+	skb_orphan(chunk->skb);
+}
+
+static void sctp_for_each_tx_datachunk(struct sctp_association *asoc,
+				       void (*cb)(struct sctp_chunk *))
+
+{
+	struct sctp_outq *q = &asoc->outqueue;
+	struct sctp_transport *t;
+	struct sctp_chunk *chunk;
+
+	list_for_each_entry(t, &asoc->peer.transport_addr_list, transports)
+		list_for_each_entry(chunk, &t->transmitted, transmitted_list)
+			cb(chunk);
+
+	list_for_each_entry(chunk, &q->retransmit, list)
+		cb(chunk);
+
+	list_for_each_entry(chunk, &q->sacked, list)
+		cb(chunk);
+
+	list_for_each_entry(chunk, &q->abandoned, list)
+		cb(chunk);
+
+	list_for_each_entry(chunk, &q->out_chunk_list, list)
+		cb(chunk);
+}
+
 /* Verify that this is a valid address. */
 static inline int sctp_verify_addr(struct sock *sk, union sctp_addr *addr,
 				   int len)
@@ -8212,7 +8242,9 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 	 * paths won't try to lock it and then oldsk.
 	 */
 	lock_sock_nested(newsk, SINGLE_DEPTH_NESTING);
+	sctp_for_each_tx_datachunk(assoc, sctp_clear_owner_w);
 	sctp_assoc_migrate(assoc, newsk);
+	sctp_for_each_tx_datachunk(assoc, sctp_set_owner_w);
 
 	/* If the association on the newsk is already closed before accept()
 	 * is called, set RCV_SHUTDOWN flag.
-- 
cgit v1.2.3


From 50317fce2cc70a2bbbc4b42c31bbad510382a53c Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Fri, 27 Oct 2017 22:08:56 -0700
Subject: net_sched: avoid matching qdisc with zero handle

Davide found the following script triggers a NULL pointer
dereference:

ip l a name eth0 type dummy
tc q a dev eth0 parent :1 handle 1: htb

This is because for a freshly created netdevice noop_qdisc
is attached and when passing 'parent :1', kernel actually
tries to match the major handle which is 0 and noop_qdisc
has handle 0 so is matched by mistake. Commit 69012ae425d7
tries to fix a similar bug but still misses this case.

Handle 0 is not a valid one, should be just skipped. In
fact, kernel uses it as TC_H_UNSPEC.

Fixes: 69012ae425d7 ("net: sched: fix handling of singleton qdiscs with qdisc_hash")
Fixes: 59cc1f61f09c ("net: sched:convert qdisc linked list to hashtable")
Reported-by: Davide Caratti <dcaratti@redhat.com>
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_api.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index c6deb74e3d2f..22bc6fc48311 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -301,6 +301,8 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 {
 	struct Qdisc *q;
 
+	if (!handle)
+		return NULL;
 	q = qdisc_match_from_root(dev->qdisc, handle);
 	if (q)
 		goto out;
-- 
cgit v1.2.3


From 1da4fc97cbf89514e417a3df46eaec864a9b8a48 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sat, 28 Oct 2017 19:43:54 +0800
Subject: sctp: fix some type cast warnings introduced by stream reconf

These warnings were found by running 'make C=2 M=net/sctp/'.

They are introduced by not aware of Endian when coding stream
reconf patches.

Since commit c0d8bab6ae51 ("sctp: add get and set sockopt for
reconf_enable") enabled stream reconf feature for users, the
Fixes tag below would use it.

Fixes: c0d8bab6ae51 ("sctp: add get and set sockopt for reconf_enable")
Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sctp.h        | 32 ++++++++++++++++----------------
 include/net/sctp/sm.h       |  2 +-
 include/net/sctp/ulpevent.h |  2 +-
 net/sctp/sm_make_chunk.c    |  5 +++--
 net/sctp/stream.c           | 26 +++++++++++++++++---------
 net/sctp/ulpevent.c         |  2 +-
 6 files changed, 39 insertions(+), 30 deletions(-)

(limited to 'net')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index 82b171e1aa0b..09d7412e9cb0 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -716,28 +716,28 @@ struct sctp_reconf_chunk {
 
 struct sctp_strreset_outreq {
 	struct sctp_paramhdr param_hdr;
-	__u32 request_seq;
-	__u32 response_seq;
-	__u32 send_reset_at_tsn;
-	__u16 list_of_streams[0];
+	__be32 request_seq;
+	__be32 response_seq;
+	__be32 send_reset_at_tsn;
+	__be16 list_of_streams[0];
 };
 
 struct sctp_strreset_inreq {
 	struct sctp_paramhdr param_hdr;
-	__u32 request_seq;
-	__u16 list_of_streams[0];
+	__be32 request_seq;
+	__be16 list_of_streams[0];
 };
 
 struct sctp_strreset_tsnreq {
 	struct sctp_paramhdr param_hdr;
-	__u32 request_seq;
+	__be32 request_seq;
 };
 
 struct sctp_strreset_addstrm {
 	struct sctp_paramhdr param_hdr;
-	__u32 request_seq;
-	__u16 number_of_streams;
-	__u16 reserved;
+	__be32 request_seq;
+	__be16 number_of_streams;
+	__be16 reserved;
 };
 
 enum {
@@ -752,16 +752,16 @@ enum {
 
 struct sctp_strreset_resp {
 	struct sctp_paramhdr param_hdr;
-	__u32 response_seq;
-	__u32 result;
+	__be32 response_seq;
+	__be32 result;
 };
 
 struct sctp_strreset_resptsn {
 	struct sctp_paramhdr param_hdr;
-	__u32 response_seq;
-	__u32 result;
-	__u32 senders_next_tsn;
-	__u32 receivers_next_tsn;
+	__be32 response_seq;
+	__be32 result;
+	__be32 senders_next_tsn;
+	__be32 receivers_next_tsn;
 };
 
 #endif /* __LINUX_SCTP_H__ */
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 2db3d3a9ce1d..88233cf8b8d4 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -261,7 +261,7 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc,
 				    struct sctp_fwdtsn_skip *skiplist);
 struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc);
 struct sctp_chunk *sctp_make_strreset_req(const struct sctp_association *asoc,
-					  __u16 stream_num, __u16 *stream_list,
+					  __u16 stream_num, __be16 *stream_list,
 					  bool out, bool in);
 struct sctp_chunk *sctp_make_strreset_tsnreq(
 					const struct sctp_association *asoc);
diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h
index b8c86ec1a8f5..231dc42f1da6 100644
--- a/include/net/sctp/ulpevent.h
+++ b/include/net/sctp/ulpevent.h
@@ -130,7 +130,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_sender_dry_event(
 
 struct sctp_ulpevent *sctp_ulpevent_make_stream_reset_event(
 	const struct sctp_association *asoc, __u16 flags,
-	__u16 stream_num, __u16 *stream_list, gfp_t gfp);
+	__u16 stream_num, __be16 *stream_list, gfp_t gfp);
 
 struct sctp_ulpevent *sctp_ulpevent_make_assoc_reset_event(
 	const struct sctp_association *asoc, __u16 flags,
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index ca8f196b6c6c..57c55045f5a7 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -3591,7 +3591,7 @@ static struct sctp_chunk *sctp_make_reconf(const struct sctp_association *asoc,
  */
 struct sctp_chunk *sctp_make_strreset_req(
 					const struct sctp_association *asoc,
-					__u16 stream_num, __u16 *stream_list,
+					__u16 stream_num, __be16 *stream_list,
 					bool out, bool in)
 {
 	struct sctp_strreset_outreq outreq;
@@ -3788,7 +3788,8 @@ bool sctp_verify_reconf(const struct sctp_association *asoc,
 {
 	struct sctp_reconf_chunk *hdr;
 	union sctp_params param;
-	__u16 last = 0, cnt = 0;
+	__be16 last = 0;
+	__u16 cnt = 0;
 
 	hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr;
 	sctp_walk_params(param, hdr, params) {
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index 63ea15503714..fa8371ff05c4 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -118,6 +118,7 @@ int sctp_send_reset_streams(struct sctp_association *asoc,
 	__u16 i, str_nums, *str_list;
 	struct sctp_chunk *chunk;
 	int retval = -EINVAL;
+	__be16 *nstr_list;
 	bool out, in;
 
 	if (!asoc->peer.reconf_capable ||
@@ -148,13 +149,18 @@ int sctp_send_reset_streams(struct sctp_association *asoc,
 			if (str_list[i] >= stream->incnt)
 				goto out;
 
+	nstr_list = kcalloc(str_nums, sizeof(__be16), GFP_KERNEL);
+	if (!nstr_list) {
+		retval = -ENOMEM;
+		goto out;
+	}
+
 	for (i = 0; i < str_nums; i++)
-		str_list[i] = htons(str_list[i]);
+		nstr_list[i] = htons(str_list[i]);
 
-	chunk = sctp_make_strreset_req(asoc, str_nums, str_list, out, in);
+	chunk = sctp_make_strreset_req(asoc, str_nums, nstr_list, out, in);
 
-	for (i = 0; i < str_nums; i++)
-		str_list[i] = ntohs(str_list[i]);
+	kfree(nstr_list);
 
 	if (!chunk) {
 		retval = -ENOMEM;
@@ -305,7 +311,7 @@ out:
 }
 
 static struct sctp_paramhdr *sctp_chunk_lookup_strreset_param(
-			struct sctp_association *asoc, __u32 resp_seq,
+			struct sctp_association *asoc, __be32 resp_seq,
 			__be16 type)
 {
 	struct sctp_chunk *chunk = asoc->strreset_chunk;
@@ -345,8 +351,9 @@ struct sctp_chunk *sctp_process_strreset_outreq(
 {
 	struct sctp_strreset_outreq *outreq = param.v;
 	struct sctp_stream *stream = &asoc->stream;
-	__u16 i, nums, flags = 0, *str_p = NULL;
 	__u32 result = SCTP_STRRESET_DENIED;
+	__u16 i, nums, flags = 0;
+	__be16 *str_p = NULL;
 	__u32 request_seq;
 
 	request_seq = ntohl(outreq->request_seq);
@@ -439,8 +446,9 @@ struct sctp_chunk *sctp_process_strreset_inreq(
 	struct sctp_stream *stream = &asoc->stream;
 	__u32 result = SCTP_STRRESET_DENIED;
 	struct sctp_chunk *chunk = NULL;
-	__u16 i, nums, *str_p;
 	__u32 request_seq;
+	__u16 i, nums;
+	__be16 *str_p;
 
 	request_seq = ntohl(inreq->request_seq);
 	if (TSN_lt(asoc->strreset_inseq, request_seq) ||
@@ -769,7 +777,7 @@ struct sctp_chunk *sctp_process_strreset_resp(
 
 	if (req->type == SCTP_PARAM_RESET_OUT_REQUEST) {
 		struct sctp_strreset_outreq *outreq;
-		__u16 *str_p;
+		__be16 *str_p;
 
 		outreq = (struct sctp_strreset_outreq *)req;
 		str_p = outreq->list_of_streams;
@@ -794,7 +802,7 @@ struct sctp_chunk *sctp_process_strreset_resp(
 			nums, str_p, GFP_ATOMIC);
 	} else if (req->type == SCTP_PARAM_RESET_IN_REQUEST) {
 		struct sctp_strreset_inreq *inreq;
-		__u16 *str_p;
+		__be16 *str_p;
 
 		/* if the result is performed, it's impossible for inreq */
 		if (result == SCTP_STRRESET_PERFORMED)
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 67abc0194f30..5447228bf1a0 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -847,7 +847,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_sender_dry_event(
 
 struct sctp_ulpevent *sctp_ulpevent_make_stream_reset_event(
 	const struct sctp_association *asoc, __u16 flags, __u16 stream_num,
-	__u16 *stream_list, gfp_t gfp)
+	__be16 *stream_list, gfp_t gfp)
 {
 	struct sctp_stream_reset_event *sreset;
 	struct sctp_ulpevent *event;
-- 
cgit v1.2.3


From 8d32503efde82db4e0a370981e90628ebd6718b5 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sat, 28 Oct 2017 19:43:55 +0800
Subject: sctp: fix some type cast warnings introduced by transport rhashtable

These warnings were found by running 'make C=2 M=net/sctp/'.

They are introduced by not aware of Endian for the port when
coding transport rhashtable patches.

Fixes: 7fda702f9315 ("sctp: use new rhlist interface on sctp transport rhashtable")
Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/input.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/sctp/input.c b/net/sctp/input.c
index 34f10e75f3b9..621b5ca3fd1c 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -794,7 +794,7 @@ hit:
 struct sctp_hash_cmp_arg {
 	const union sctp_addr	*paddr;
 	const struct net	*net;
-	u16			lport;
+	__be16			lport;
 };
 
 static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg,
@@ -820,37 +820,37 @@ out:
 	return err;
 }
 
-static inline u32 sctp_hash_obj(const void *data, u32 len, u32 seed)
+static inline __u32 sctp_hash_obj(const void *data, u32 len, u32 seed)
 {
 	const struct sctp_transport *t = data;
 	const union sctp_addr *paddr = &t->ipaddr;
 	const struct net *net = sock_net(t->asoc->base.sk);
-	u16 lport = htons(t->asoc->base.bind_addr.port);
-	u32 addr;
+	__be16 lport = htons(t->asoc->base.bind_addr.port);
+	__u32 addr;
 
 	if (paddr->sa.sa_family == AF_INET6)
 		addr = jhash(&paddr->v6.sin6_addr, 16, seed);
 	else
-		addr = paddr->v4.sin_addr.s_addr;
+		addr = (__force __u32)paddr->v4.sin_addr.s_addr;
 
-	return  jhash_3words(addr, ((__u32)paddr->v4.sin_port) << 16 |
+	return  jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 |
 			     (__force __u32)lport, net_hash_mix(net), seed);
 }
 
-static inline u32 sctp_hash_key(const void *data, u32 len, u32 seed)
+static inline __u32 sctp_hash_key(const void *data, u32 len, u32 seed)
 {
 	const struct sctp_hash_cmp_arg *x = data;
 	const union sctp_addr *paddr = x->paddr;
 	const struct net *net = x->net;
-	u16 lport = x->lport;
-	u32 addr;
+	__be16 lport = x->lport;
+	__u32 addr;
 
 	if (paddr->sa.sa_family == AF_INET6)
 		addr = jhash(&paddr->v6.sin6_addr, 16, seed);
 	else
-		addr = paddr->v4.sin_addr.s_addr;
+		addr = (__force __u32)paddr->v4.sin_addr.s_addr;
 
-	return  jhash_3words(addr, ((__u32)paddr->v4.sin_port) << 16 |
+	return  jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 |
 			     (__force __u32)lport, net_hash_mix(net), seed);
 }
 
-- 
cgit v1.2.3


From f6fc6bc0b8e0bb13a210bd7386ffdcb1a5f30ef1 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sat, 28 Oct 2017 19:43:56 +0800
Subject: sctp: fix a type cast warnings that causes a_rwnd gets the wrong
 value

These warnings were found by running 'make C=2 M=net/sctp/'.

Commit d4d6fb5787a6 ("sctp: Try not to change a_rwnd when faking a
SACK from SHUTDOWN.") expected to use the peers old rwnd and add
our flight size to the a_rwnd. But with the wrong Endian, it may
not work as well as expected.

So fix it by converting to the right value.

Fixes: d4d6fb5787a6 ("sctp: Try not to change a_rwnd when faking a SACK from SHUTDOWN.")
Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sm_sideeffect.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index e6a2974e020e..8f2762bba879 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -1680,8 +1680,8 @@ static int sctp_cmd_interpreter(enum sctp_event event_type,
 		case SCTP_CMD_PROCESS_CTSN:
 			/* Dummy up a SACK for processing. */
 			sackh.cum_tsn_ack = cmd->obj.be32;
-			sackh.a_rwnd = asoc->peer.rwnd +
-					asoc->outqueue.outstanding_bytes;
+			sackh.a_rwnd = htonl(asoc->peer.rwnd +
+					     asoc->outqueue.outstanding_bytes);
 			sackh.num_gap_ack_blocks = 0;
 			sackh.num_dup_tsns = 0;
 			chunk->subh.sack_hdr = &sackh;
-- 
cgit v1.2.3


From 978aa0474115f3f5848949f2efce4def0766a5cb Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sat, 28 Oct 2017 19:43:57 +0800
Subject: sctp: fix some type cast warnings introduced since very beginning

These warnings were found by running 'make C=2 M=net/sctp/'.
They are there since very beginning.

Note after this patch, there still one warning left in
sctp_outq_flush():
  sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM)

Since it has been moved to sctp_stream_outq_migrate on net-next,
to avoid the extra job when merging net-next to net, I will post
the fix for it after the merging is done.

Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sctp.h      | 2 +-
 include/uapi/linux/sctp.h | 2 +-
 net/sctp/ipv6.c           | 2 +-
 net/sctp/sm_make_chunk.c  | 4 ++--
 net/sctp/sm_sideeffect.c  | 4 ++--
 5 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index 09d7412e9cb0..da803dfc7a39 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -231,7 +231,7 @@ struct sctp_datahdr {
 	__be32 tsn;
 	__be16 stream;
 	__be16 ssn;
-	__be32 ppid;
+	__u32 ppid;
 	__u8  payload[0];
 };
 
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 6217ff8500a1..84fc2914b7fb 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -376,7 +376,7 @@ struct sctp_remote_error {
 	__u16 sre_type;
 	__u16 sre_flags;
 	__u32 sre_length;
-	__u16 sre_error;
+	__be16 sre_error;
 	sctp_assoc_t sre_assoc_id;
 	__u8 sre_data[0];
 };
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 7fe9e1d1b7ec..a6dfa86c0201 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -738,7 +738,7 @@ static int sctp_v6_skb_iif(const struct sk_buff *skb)
 /* Was this packet marked by Explicit Congestion Notification? */
 static int sctp_v6_is_ce(const struct sk_buff *skb)
 {
-	return *((__u32 *)(ipv6_hdr(skb))) & htonl(1 << 20);
+	return *((__u32 *)(ipv6_hdr(skb))) & (__force __u32)htonl(1 << 20);
 }
 
 /* Dump the v6 addr to the seq file. */
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 57c55045f5a7..514465b03829 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -2854,7 +2854,7 @@ struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc,
 		addr_param_len = af->to_addr_param(addr, &addr_param);
 		param.param_hdr.type = flags;
 		param.param_hdr.length = htons(paramlen + addr_param_len);
-		param.crr_id = i;
+		param.crr_id = htonl(i);
 
 		sctp_addto_chunk(retval, paramlen, &param);
 		sctp_addto_chunk(retval, addr_param_len, &addr_param);
@@ -2867,7 +2867,7 @@ struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc,
 		addr_param_len = af->to_addr_param(addr, &addr_param);
 		param.param_hdr.type = SCTP_PARAM_DEL_IP;
 		param.param_hdr.length = htons(paramlen + addr_param_len);
-		param.crr_id = i;
+		param.crr_id = htonl(i);
 
 		sctp_addto_chunk(retval, paramlen, &param);
 		sctp_addto_chunk(retval, addr_param_len, &addr_param);
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 8f2762bba879..e2d9a4b49c9c 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -1607,12 +1607,12 @@ static int sctp_cmd_interpreter(enum sctp_event event_type,
 			break;
 
 		case SCTP_CMD_INIT_FAILED:
-			sctp_cmd_init_failed(commands, asoc, cmd->obj.err);
+			sctp_cmd_init_failed(commands, asoc, cmd->obj.u32);
 			break;
 
 		case SCTP_CMD_ASSOC_FAILED:
 			sctp_cmd_assoc_failed(commands, asoc, event_type,
-					      subtype, chunk, cmd->obj.err);
+					      subtype, chunk, cmd->obj.u32);
 			break;
 
 		case SCTP_CMD_INIT_COUNTER_INC:
-- 
cgit v1.2.3


From 7aa0045dadb6ef37485ea9f2a7d28278ca588b51 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 26 Oct 2017 18:24:28 -0700
Subject: net_sched: introduce a workqueue for RCU callbacks of tc filter

This patch introduces a dedicated workqueue for tc filters
so that each tc filter's RCU callback could defer their
action destroy work to this workqueue. The helper
tcf_queue_work() is introduced for them to use.

Because we hold RTNL lock when calling tcf_block_put(), we
can not simply flush works inside it, therefore we have to
defer it again to this workqueue and make sure all flying RCU
callbacks have already queued their work before this one, in
other words, to ensure this is the last one to execute to
prevent any use-after-free.

On the other hand, this makes tcf_block_put() ugly and
harder to understand. Since David and Eric strongly dislike
adding synchronize_rcu(), this is probably the only
solution that could make everyone happy.

Please also see the code comments below.

Reported-by: Chris Mi <chrism@mellanox.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jiri Pirko <jiri@resnulli.us>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h     |  3 +++
 include/net/sch_generic.h |  2 ++
 net/sched/cls_api.c       | 68 +++++++++++++++++++++++++++++++++++------------
 3 files changed, 56 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index e80edd8879ef..3009547f3c66 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -2,6 +2,7 @@
 #define __NET_PKT_CLS_H
 
 #include <linux/pkt_cls.h>
+#include <linux/workqueue.h>
 #include <net/sch_generic.h>
 #include <net/act_api.h>
 
@@ -17,6 +18,8 @@ struct tcf_walker {
 int register_tcf_proto_ops(struct tcf_proto_ops *ops);
 int unregister_tcf_proto_ops(struct tcf_proto_ops *ops);
 
+bool tcf_queue_work(struct work_struct *work);
+
 #ifdef CONFIG_NET_CLS
 struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index,
 				bool create);
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 135f5a2dd931..0dec8a23be57 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -10,6 +10,7 @@
 #include <linux/dynamic_queue_limits.h>
 #include <linux/list.h>
 #include <linux/refcount.h>
+#include <linux/workqueue.h>
 #include <net/gen_stats.h>
 #include <net/rtnetlink.h>
 
@@ -271,6 +272,7 @@ struct tcf_chain {
 
 struct tcf_block {
 	struct list_head chain_list;
+	struct work_struct work;
 };
 
 static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 0b2219adf520..045d13679ad6 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -77,6 +77,8 @@ out:
 }
 EXPORT_SYMBOL(register_tcf_proto_ops);
 
+static struct workqueue_struct *tc_filter_wq;
+
 int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
 {
 	struct tcf_proto_ops *t;
@@ -86,6 +88,7 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
 	 * tcf_proto_ops's destroy() handler.
 	 */
 	rcu_barrier();
+	flush_workqueue(tc_filter_wq);
 
 	write_lock(&cls_mod_lock);
 	list_for_each_entry(t, &tcf_proto_base, head) {
@@ -100,6 +103,12 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
 }
 EXPORT_SYMBOL(unregister_tcf_proto_ops);
 
+bool tcf_queue_work(struct work_struct *work)
+{
+	return queue_work(tc_filter_wq, work);
+}
+EXPORT_SYMBOL(tcf_queue_work);
+
 /* Select new prio value from the range, managed by kernel. */
 
 static inline u32 tcf_auto_prio(struct tcf_proto *tp)
@@ -266,23 +275,30 @@ err_chain_create:
 }
 EXPORT_SYMBOL(tcf_block_get);
 
-void tcf_block_put(struct tcf_block *block)
+static void tcf_block_put_final(struct work_struct *work)
 {
+	struct tcf_block *block = container_of(work, struct tcf_block, work);
 	struct tcf_chain *chain, *tmp;
 
-	if (!block)
-		return;
-
-	/* XXX: Standalone actions are not allowed to jump to any chain, and
-	 * bound actions should be all removed after flushing. However,
-	 * filters are destroyed in RCU callbacks, we have to hold the chains
-	 * first, otherwise we would always race with RCU callbacks on this list
-	 * without proper locking.
-	 */
+	/* At this point, all the chains should have refcnt == 1. */
+	rtnl_lock();
+	list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
+		tcf_chain_put(chain);
+	rtnl_unlock();
+	kfree(block);
+}
 
-	/* Wait for existing RCU callbacks to cool down. */
-	rcu_barrier();
+/* XXX: Standalone actions are not allowed to jump to any chain, and bound
+ * actions should be all removed after flushing. However, filters are destroyed
+ * in RCU callbacks, we have to hold the chains first, otherwise we would
+ * always race with RCU callbacks on this list without proper locking.
+ */
+static void tcf_block_put_deferred(struct work_struct *work)
+{
+	struct tcf_block *block = container_of(work, struct tcf_block, work);
+	struct tcf_chain *chain;
 
+	rtnl_lock();
 	/* Hold a refcnt for all chains, except 0, in case they are gone. */
 	list_for_each_entry(chain, &block->chain_list, list)
 		if (chain->index)
@@ -292,13 +308,27 @@ void tcf_block_put(struct tcf_block *block)
 	list_for_each_entry(chain, &block->chain_list, list)
 		tcf_chain_flush(chain);
 
-	/* Wait for RCU callbacks to release the reference count. */
+	INIT_WORK(&block->work, tcf_block_put_final);
+	/* Wait for RCU callbacks to release the reference count and make
+	 * sure their works have been queued before this.
+	 */
 	rcu_barrier();
+	tcf_queue_work(&block->work);
+	rtnl_unlock();
+}
 
-	/* At this point, all the chains should have refcnt == 1. */
-	list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
-		tcf_chain_put(chain);
-	kfree(block);
+void tcf_block_put(struct tcf_block *block)
+{
+	if (!block)
+		return;
+
+	INIT_WORK(&block->work, tcf_block_put_deferred);
+	/* Wait for existing RCU callbacks to cool down, make sure their works
+	 * have been queued before this. We can not flush pending works here
+	 * because we are holding the RTNL lock.
+	 */
+	rcu_barrier();
+	tcf_queue_work(&block->work);
 }
 EXPORT_SYMBOL(tcf_block_put);
 
@@ -1030,6 +1060,10 @@ EXPORT_SYMBOL(tcf_exts_get_dev);
 
 static int __init tc_filter_init(void)
 {
+	tc_filter_wq = alloc_ordered_workqueue("tc_filter_workqueue", 0);
+	if (!tc_filter_wq)
+		return -ENOMEM;
+
 	rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, 0);
 	rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, 0);
 	rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter,
-- 
cgit v1.2.3


From c96a48385d53089ee9977dd0bce82a9493984484 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 26 Oct 2017 18:24:29 -0700
Subject: net_sched: use tcf_queue_work() in basic filter

Defer the tcf_exts_destroy() in RCU callback to
tc filter workqueue and get RTNL lock.

Reported-by: Chris Mi <chrism@mellanox.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jiri Pirko <jiri@resnulli.us>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_basic.c | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index d89ebafd2239..f177649a2419 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -34,7 +34,10 @@ struct basic_filter {
 	struct tcf_result	res;
 	struct tcf_proto	*tp;
 	struct list_head	link;
-	struct rcu_head		rcu;
+	union {
+		struct work_struct	work;
+		struct rcu_head		rcu;
+	};
 };
 
 static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp,
@@ -82,15 +85,26 @@ static int basic_init(struct tcf_proto *tp)
 	return 0;
 }
 
-static void basic_delete_filter(struct rcu_head *head)
+static void basic_delete_filter_work(struct work_struct *work)
 {
-	struct basic_filter *f = container_of(head, struct basic_filter, rcu);
+	struct basic_filter *f = container_of(work, struct basic_filter, work);
 
+	rtnl_lock();
 	tcf_exts_destroy(&f->exts);
 	tcf_em_tree_destroy(&f->ematches);
+	rtnl_unlock();
+
 	kfree(f);
 }
 
+static void basic_delete_filter(struct rcu_head *head)
+{
+	struct basic_filter *f = container_of(head, struct basic_filter, rcu);
+
+	INIT_WORK(&f->work, basic_delete_filter_work);
+	tcf_queue_work(&f->work);
+}
+
 static void basic_destroy(struct tcf_proto *tp)
 {
 	struct basic_head *head = rtnl_dereference(tp->root);
-- 
cgit v1.2.3


From e910af676b565ecc16bcd6c896ecb68157396ecc Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 26 Oct 2017 18:24:30 -0700
Subject: net_sched: use tcf_queue_work() in bpf filter

Defer the tcf_exts_destroy() in RCU callback to
tc filter workqueue and get RTNL lock.

Reported-by: Chris Mi <chrism@mellanox.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jiri Pirko <jiri@resnulli.us>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_bpf.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 520c5027646a..037a3ae86829 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -49,7 +49,10 @@ struct cls_bpf_prog {
 	struct sock_filter *bpf_ops;
 	const char *bpf_name;
 	struct tcf_proto *tp;
-	struct rcu_head rcu;
+	union {
+		struct work_struct work;
+		struct rcu_head rcu;
+	};
 };
 
 static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
@@ -257,9 +260,21 @@ static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog)
 	kfree(prog);
 }
 
+static void cls_bpf_delete_prog_work(struct work_struct *work)
+{
+	struct cls_bpf_prog *prog = container_of(work, struct cls_bpf_prog, work);
+
+	rtnl_lock();
+	__cls_bpf_delete_prog(prog);
+	rtnl_unlock();
+}
+
 static void cls_bpf_delete_prog_rcu(struct rcu_head *rcu)
 {
-	__cls_bpf_delete_prog(container_of(rcu, struct cls_bpf_prog, rcu));
+	struct cls_bpf_prog *prog = container_of(rcu, struct cls_bpf_prog, rcu);
+
+	INIT_WORK(&prog->work, cls_bpf_delete_prog_work);
+	tcf_queue_work(&prog->work);
 }
 
 static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog)
-- 
cgit v1.2.3


From b1b5b04fdb6da262aef37ef83b9f2e41326720ef Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 26 Oct 2017 18:24:31 -0700
Subject: net_sched: use tcf_queue_work() in cgroup filter

Defer the tcf_exts_destroy() in RCU callback to
tc filter workqueue and get RTNL lock.

Reported-by: Chris Mi <chrism@mellanox.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jiri Pirko <jiri@resnulli.us>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_cgroup.c | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index d48452f87975..a97e069bee89 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -23,7 +23,10 @@ struct cls_cgroup_head {
 	struct tcf_exts		exts;
 	struct tcf_ematch_tree	ematches;
 	struct tcf_proto	*tp;
-	struct rcu_head		rcu;
+	union {
+		struct work_struct	work;
+		struct rcu_head		rcu;
+	};
 };
 
 static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp,
@@ -57,15 +60,26 @@ static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = {
 	[TCA_CGROUP_EMATCHES]	= { .type = NLA_NESTED },
 };
 
+static void cls_cgroup_destroy_work(struct work_struct *work)
+{
+	struct cls_cgroup_head *head = container_of(work,
+						    struct cls_cgroup_head,
+						    work);
+	rtnl_lock();
+	tcf_exts_destroy(&head->exts);
+	tcf_em_tree_destroy(&head->ematches);
+	kfree(head);
+	rtnl_unlock();
+}
+
 static void cls_cgroup_destroy_rcu(struct rcu_head *root)
 {
 	struct cls_cgroup_head *head = container_of(root,
 						    struct cls_cgroup_head,
 						    rcu);
 
-	tcf_exts_destroy(&head->exts);
-	tcf_em_tree_destroy(&head->ematches);
-	kfree(head);
+	INIT_WORK(&head->work, cls_cgroup_destroy_work);
+	tcf_queue_work(&head->work);
 }
 
 static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
-- 
cgit v1.2.3


From 94cdb47566b799649e996e1fb9de2a503dada763 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 26 Oct 2017 18:24:32 -0700
Subject: net_sched: use tcf_queue_work() in flow filter

Defer the tcf_exts_destroy() in RCU callback to
tc filter workqueue and get RTNL lock.

Reported-by: Chris Mi <chrism@mellanox.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jiri Pirko <jiri@resnulli.us>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_flow.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 2a3a60ec5b86..67f3a2af6aab 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -57,7 +57,10 @@ struct flow_filter {
 	u32			divisor;
 	u32			baseclass;
 	u32			hashrnd;
-	struct rcu_head		rcu;
+	union {
+		struct work_struct	work;
+		struct rcu_head		rcu;
+	};
 };
 
 static inline u32 addr_fold(void *addr)
@@ -369,14 +372,24 @@ static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = {
 	[TCA_FLOW_PERTURB]	= { .type = NLA_U32 },
 };
 
-static void flow_destroy_filter(struct rcu_head *head)
+static void flow_destroy_filter_work(struct work_struct *work)
 {
-	struct flow_filter *f = container_of(head, struct flow_filter, rcu);
+	struct flow_filter *f = container_of(work, struct flow_filter, work);
 
+	rtnl_lock();
 	del_timer_sync(&f->perturb_timer);
 	tcf_exts_destroy(&f->exts);
 	tcf_em_tree_destroy(&f->ematches);
 	kfree(f);
+	rtnl_unlock();
+}
+
+static void flow_destroy_filter(struct rcu_head *head)
+{
+	struct flow_filter *f = container_of(head, struct flow_filter, rcu);
+
+	INIT_WORK(&f->work, flow_destroy_filter_work);
+	tcf_queue_work(&f->work);
 }
 
 static int flow_change(struct net *net, struct sk_buff *in_skb,
-- 
cgit v1.2.3


From 0552c8afa077889b4704ef5ee88b03063ad45023 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 26 Oct 2017 18:24:33 -0700
Subject: net_sched: use tcf_queue_work() in flower filter

Defer the tcf_exts_destroy() in RCU callback to
tc filter workqueue and get RTNL lock.

Reported-by: Chris Mi <chrism@mellanox.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jiri Pirko <jiri@resnulli.us>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_flower.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index b480d7c792ba..5b5722c8b32c 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -87,7 +87,10 @@ struct cls_fl_filter {
 	struct list_head list;
 	u32 handle;
 	u32 flags;
-	struct rcu_head	rcu;
+	union {
+		struct work_struct work;
+		struct rcu_head	rcu;
+	};
 	struct net_device *hw_dev;
 };
 
@@ -215,12 +218,22 @@ static int fl_init(struct tcf_proto *tp)
 	return 0;
 }
 
-static void fl_destroy_filter(struct rcu_head *head)
+static void fl_destroy_filter_work(struct work_struct *work)
 {
-	struct cls_fl_filter *f = container_of(head, struct cls_fl_filter, rcu);
+	struct cls_fl_filter *f = container_of(work, struct cls_fl_filter, work);
 
+	rtnl_lock();
 	tcf_exts_destroy(&f->exts);
 	kfree(f);
+	rtnl_unlock();
+}
+
+static void fl_destroy_filter(struct rcu_head *head)
+{
+	struct cls_fl_filter *f = container_of(head, struct cls_fl_filter, rcu);
+
+	INIT_WORK(&f->work, fl_destroy_filter_work);
+	tcf_queue_work(&f->work);
 }
 
 static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f)
-- 
cgit v1.2.3


From e071dff2a6beeccb6f9744f9a0251ab773ca2ab8 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 26 Oct 2017 18:24:34 -0700
Subject: net_sched: use tcf_queue_work() in fw filter

Defer the tcf_exts_destroy() in RCU callback to
tc filter workqueue and get RTNL lock.

Reported-by: Chris Mi <chrism@mellanox.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jiri Pirko <jiri@resnulli.us>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_fw.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index 941245ad07fd..99183b8621ec 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -46,7 +46,10 @@ struct fw_filter {
 #endif /* CONFIG_NET_CLS_IND */
 	struct tcf_exts		exts;
 	struct tcf_proto	*tp;
-	struct rcu_head		rcu;
+	union {
+		struct work_struct	work;
+		struct rcu_head		rcu;
+	};
 };
 
 static u32 fw_hash(u32 handle)
@@ -119,12 +122,22 @@ static int fw_init(struct tcf_proto *tp)
 	return 0;
 }
 
-static void fw_delete_filter(struct rcu_head *head)
+static void fw_delete_filter_work(struct work_struct *work)
 {
-	struct fw_filter *f = container_of(head, struct fw_filter, rcu);
+	struct fw_filter *f = container_of(work, struct fw_filter, work);
 
+	rtnl_lock();
 	tcf_exts_destroy(&f->exts);
 	kfree(f);
+	rtnl_unlock();
+}
+
+static void fw_delete_filter(struct rcu_head *head)
+{
+	struct fw_filter *f = container_of(head, struct fw_filter, rcu);
+
+	INIT_WORK(&f->work, fw_delete_filter_work);
+	tcf_queue_work(&f->work);
 }
 
 static void fw_destroy(struct tcf_proto *tp)
-- 
cgit v1.2.3


From df2735ee8e6ca202a8630f237b59401a25193be1 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 26 Oct 2017 18:24:35 -0700
Subject: net_sched: use tcf_queue_work() in matchall filter

Defer the tcf_exts_destroy() in RCU callback to
tc filter workqueue and get RTNL lock.

Reported-by: Chris Mi <chrism@mellanox.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jiri Pirko <jiri@resnulli.us>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_matchall.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index eeac606c95ab..c33f711b9019 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -21,7 +21,10 @@ struct cls_mall_head {
 	struct tcf_result res;
 	u32 handle;
 	u32 flags;
-	struct rcu_head	rcu;
+	union {
+		struct work_struct work;
+		struct rcu_head	rcu;
+	};
 };
 
 static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp,
@@ -41,13 +44,23 @@ static int mall_init(struct tcf_proto *tp)
 	return 0;
 }
 
+static void mall_destroy_work(struct work_struct *work)
+{
+	struct cls_mall_head *head = container_of(work, struct cls_mall_head,
+						  work);
+	rtnl_lock();
+	tcf_exts_destroy(&head->exts);
+	kfree(head);
+	rtnl_unlock();
+}
+
 static void mall_destroy_rcu(struct rcu_head *rcu)
 {
 	struct cls_mall_head *head = container_of(rcu, struct cls_mall_head,
 						  rcu);
 
-	tcf_exts_destroy(&head->exts);
-	kfree(head);
+	INIT_WORK(&head->work, mall_destroy_work);
+	tcf_queue_work(&head->work);
 }
 
 static int mall_replace_hw_filter(struct tcf_proto *tp,
-- 
cgit v1.2.3


From c0d378ef1266546a39f2df00a56ff1f74166a2b7 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 26 Oct 2017 18:24:36 -0700
Subject: net_sched: use tcf_queue_work() in u32 filter

Defer the tcf_exts_destroy() in RCU callback to
tc filter workqueue and get RTNL lock.

Reported-by: Chris Mi <chrism@mellanox.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jiri Pirko <jiri@resnulli.us>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_u32.c | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 10b8d851fc6b..dadd1b344497 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -68,7 +68,10 @@ struct tc_u_knode {
 	u32 __percpu		*pcpu_success;
 #endif
 	struct tcf_proto	*tp;
-	struct rcu_head		rcu;
+	union {
+		struct work_struct	work;
+		struct rcu_head		rcu;
+	};
 	/* The 'sel' field MUST be the last field in structure to allow for
 	 * tc_u32_keys allocated at end of structure.
 	 */
@@ -418,11 +421,21 @@ static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n,
  * this the u32_delete_key_rcu variant does not free the percpu
  * statistics.
  */
+static void u32_delete_key_work(struct work_struct *work)
+{
+	struct tc_u_knode *key = container_of(work, struct tc_u_knode, work);
+
+	rtnl_lock();
+	u32_destroy_key(key->tp, key, false);
+	rtnl_unlock();
+}
+
 static void u32_delete_key_rcu(struct rcu_head *rcu)
 {
 	struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu);
 
-	u32_destroy_key(key->tp, key, false);
+	INIT_WORK(&key->work, u32_delete_key_work);
+	tcf_queue_work(&key->work);
 }
 
 /* u32_delete_key_freepf_rcu is the rcu callback variant
@@ -432,11 +445,21 @@ static void u32_delete_key_rcu(struct rcu_head *rcu)
  * for the variant that should be used with keys return from
  * u32_init_knode()
  */
+static void u32_delete_key_freepf_work(struct work_struct *work)
+{
+	struct tc_u_knode *key = container_of(work, struct tc_u_knode, work);
+
+	rtnl_lock();
+	u32_destroy_key(key->tp, key, true);
+	rtnl_unlock();
+}
+
 static void u32_delete_key_freepf_rcu(struct rcu_head *rcu)
 {
 	struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu);
 
-	u32_destroy_key(key->tp, key, true);
+	INIT_WORK(&key->work, u32_delete_key_freepf_work);
+	tcf_queue_work(&key->work);
 }
 
 static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
-- 
cgit v1.2.3


From c2f3f31d402be4849b06282c3a5278f2865c9fcc Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 26 Oct 2017 18:24:37 -0700
Subject: net_sched: use tcf_queue_work() in route filter

Defer the tcf_exts_destroy() in RCU callback to
tc filter workqueue and get RTNL lock.

Reported-by: Chris Mi <chrism@mellanox.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jiri Pirko <jiri@resnulli.us>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_route.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index 9ddde65915d2..4b14ccd8b8f2 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -57,7 +57,10 @@ struct route4_filter {
 	u32			handle;
 	struct route4_bucket	*bkt;
 	struct tcf_proto	*tp;
-	struct rcu_head		rcu;
+	union {
+		struct work_struct	work;
+		struct rcu_head		rcu;
+	};
 };
 
 #define ROUTE4_FAILURE ((struct route4_filter *)(-1L))
@@ -254,12 +257,22 @@ static int route4_init(struct tcf_proto *tp)
 	return 0;
 }
 
-static void route4_delete_filter(struct rcu_head *head)
+static void route4_delete_filter_work(struct work_struct *work)
 {
-	struct route4_filter *f = container_of(head, struct route4_filter, rcu);
+	struct route4_filter *f = container_of(work, struct route4_filter, work);
 
+	rtnl_lock();
 	tcf_exts_destroy(&f->exts);
 	kfree(f);
+	rtnl_unlock();
+}
+
+static void route4_delete_filter(struct rcu_head *head)
+{
+	struct route4_filter *f = container_of(head, struct route4_filter, rcu);
+
+	INIT_WORK(&f->work, route4_delete_filter_work);
+	tcf_queue_work(&f->work);
 }
 
 static void route4_destroy(struct tcf_proto *tp)
-- 
cgit v1.2.3


From d4f84a41dc615c166555cd332b0235bf6b9bcb4a Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 26 Oct 2017 18:24:38 -0700
Subject: net_sched: use tcf_queue_work() in rsvp filter

Defer the tcf_exts_destroy() in RCU callback to
tc filter workqueue and get RTNL lock.

Reported-by: Chris Mi <chrism@mellanox.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jiri Pirko <jiri@resnulli.us>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_rsvp.h | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index b1f6ed48bc72..bdbc541787f8 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -97,7 +97,10 @@ struct rsvp_filter {
 
 	u32				handle;
 	struct rsvp_session		*sess;
-	struct rcu_head			rcu;
+	union {
+		struct work_struct		work;
+		struct rcu_head			rcu;
+	};
 };
 
 static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid)
@@ -282,12 +285,22 @@ static int rsvp_init(struct tcf_proto *tp)
 	return -ENOBUFS;
 }
 
-static void rsvp_delete_filter_rcu(struct rcu_head *head)
+static void rsvp_delete_filter_work(struct work_struct *work)
 {
-	struct rsvp_filter *f = container_of(head, struct rsvp_filter, rcu);
+	struct rsvp_filter *f = container_of(work, struct rsvp_filter, work);
 
+	rtnl_lock();
 	tcf_exts_destroy(&f->exts);
 	kfree(f);
+	rtnl_unlock();
+}
+
+static void rsvp_delete_filter_rcu(struct rcu_head *head)
+{
+	struct rsvp_filter *f = container_of(head, struct rsvp_filter, rcu);
+
+	INIT_WORK(&f->work, rsvp_delete_filter_work);
+	tcf_queue_work(&f->work);
 }
 
 static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f)
-- 
cgit v1.2.3


From 27ce4f05e2abbe2d3ec7434e456619a5178cd3bd Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 26 Oct 2017 18:24:39 -0700
Subject: net_sched: use tcf_queue_work() in tcindex filter

Defer the tcf_exts_destroy() in RCU callback to
tc filter workqueue and get RTNL lock.

Reported-by: Chris Mi <chrism@mellanox.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jiri Pirko <jiri@resnulli.us>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_tcindex.c | 38 +++++++++++++++++++++++++++++++++-----
 1 file changed, 33 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index 14a7e08b2fa9..beaa95e09c25 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -27,14 +27,20 @@
 struct tcindex_filter_result {
 	struct tcf_exts		exts;
 	struct tcf_result	res;
-	struct rcu_head		rcu;
+	union {
+		struct work_struct	work;
+		struct rcu_head		rcu;
+	};
 };
 
 struct tcindex_filter {
 	u16 key;
 	struct tcindex_filter_result result;
 	struct tcindex_filter __rcu *next;
-	struct rcu_head rcu;
+	union {
+		struct work_struct work;
+		struct rcu_head rcu;
+	};
 };
 
 
@@ -133,12 +139,34 @@ static int tcindex_init(struct tcf_proto *tp)
 	return 0;
 }
 
+static void tcindex_destroy_rexts_work(struct work_struct *work)
+{
+	struct tcindex_filter_result *r;
+
+	r = container_of(work, struct tcindex_filter_result, work);
+	rtnl_lock();
+	tcf_exts_destroy(&r->exts);
+	rtnl_unlock();
+}
+
 static void tcindex_destroy_rexts(struct rcu_head *head)
 {
 	struct tcindex_filter_result *r;
 
 	r = container_of(head, struct tcindex_filter_result, rcu);
-	tcf_exts_destroy(&r->exts);
+	INIT_WORK(&r->work, tcindex_destroy_rexts_work);
+	tcf_queue_work(&r->work);
+}
+
+static void tcindex_destroy_fexts_work(struct work_struct *work)
+{
+	struct tcindex_filter *f = container_of(work, struct tcindex_filter,
+						work);
+
+	rtnl_lock();
+	tcf_exts_destroy(&f->result.exts);
+	kfree(f);
+	rtnl_unlock();
 }
 
 static void tcindex_destroy_fexts(struct rcu_head *head)
@@ -146,8 +174,8 @@ static void tcindex_destroy_fexts(struct rcu_head *head)
 	struct tcindex_filter *f = container_of(head, struct tcindex_filter,
 						rcu);
 
-	tcf_exts_destroy(&f->result.exts);
-	kfree(f);
+	INIT_WORK(&f->work, tcindex_destroy_fexts_work);
+	tcf_queue_work(&f->work);
 }
 
 static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last)
-- 
cgit v1.2.3


From 2d132eba1d972ea6c0e47286e4c821b4a3c5b84d Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 26 Oct 2017 18:24:40 -0700
Subject: net_sched: add rtnl assertion to tcf_exts_destroy()

After previous patches, it is now safe to claim that
tcf_exts_destroy() is always called with RTNL lock.

Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jiri Pirko <jiri@resnulli.us>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 045d13679ad6..231181c602ed 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -909,6 +909,7 @@ void tcf_exts_destroy(struct tcf_exts *exts)
 #ifdef CONFIG_NET_CLS_ACT
 	LIST_HEAD(actions);
 
+	ASSERT_RTNL();
 	tcf_exts_to_list(exts, &actions);
 	tcf_action_destroy(&actions, TCA_ACT_UNBIND);
 	kfree(exts->actions);
-- 
cgit v1.2.3


From 46e235c15ca44f34cb79f4dbec909b8c51999dc1 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 26 Oct 2017 18:24:41 -0700
Subject: net_sched: fix call_rcu() race on act_sample module removal

Similar to commit c78e1746d3ad
("net: sched: fix call_rcu() race on classifier module unloads"),
we need to wait for flying RCU callback tcf_sample_cleanup_rcu().

Cc: Yotam Gigi <yotamg@mellanox.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jiri Pirko <jiri@resnulli.us>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_sample.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index ec986ae52808..a9f9a2ccc664 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -264,6 +264,7 @@ static int __init sample_init_module(void)
 
 static void __exit sample_cleanup_module(void)
 {
+	rcu_barrier();
 	tcf_unregister_action(&act_sample_ops, &sample_net_ops);
 }
 
-- 
cgit v1.2.3